1. Working with RDDs:
   a) Write a Python program to create an RDD from a local data source.
   b) Implement transformations and actions on the RDD to perform data processing tasks.
   c) Analyze and manipulate data using RDD operations such as map, filter, reduce, or aggregate.

2. Spark DataFrame Operations:
   a) Write a Python program to load a CSV file into a Spark DataFrame.
   b) Perform common DataFrame operations such as filtering, grouping, or joining.
   c) Apply Spark SQL queries on the DataFrame to extract insights from the data.

3. Spark Streaming:
   a) Write a Python program to create a Spark Streaming application.
   b) Configure the application to consume data from a streaming source (e.g., Kafka or a socket).
   c) Implement streaming transformations and actions to process and analyze the incoming data stream.

4. Spark SQL and Data Source Integration:
   a) Write a Python program to connect Spark with a relational database (e.g., MySQL, PostgreSQL).
   b) Perform SQL operations on the data stored in the database using Spark SQL.
   c) Explore the integration capabilities of Spark with other data sources, such as Hadoop Distributed File System (HDFS) or Amazon S3.


In [None]:
#1
from pyspark import SparkContext

# Exercise 1a: build an RDD from an in-memory Python list and display it.

# Spin up a SparkContext with master "local" under the app name "RDDExample"
sc = SparkContext(master="local", appName="RDDExample")

# The local data source is simply the integers 1 through 5
data = list(range(1, 6))

# Hand the list to Spark so it becomes an RDD
rdd = sc.parallelize(data)

# collect() brings every element back to the driver as a Python list
elements = rdd.collect()
print(elements)

# Release the context now that the example is done
sc.stop()





from pyspark import SparkContext

# Exercise 1b: chain transformations (map, filter) and run actions
# (reduce, collect) on an RDD.

# Spin up a SparkContext with master "local" for the transformation demo
sc = SparkContext(master="local", appName="RDDTransformations")

# The local data source is the integers 1 through 5
data = list(range(1, 6))
rdd = sc.parallelize(data)

# Transformation: square every element
squared_rdd = rdd.map(lambda n: n * n)

# Transformation: keep only the squares that are strictly greater than 10
filtered_rdd = squared_rdd.filter(lambda sq: sq > 10)

# Action: add up the squares that survived the filter
sum_of_squared = filtered_rdd.reduce(lambda left, right: left + right)

# Bring the results back to the driver and display them
all_squares = squared_rdd.collect()
print(all_squares)  # [1, 4, 9, 16, 25]

large_squares = filtered_rdd.collect()
print(large_squares)  # [16, 25]

print(sum_of_squared)  # 41

# Release the context now that the example is done
sc.stop()



from pyspark import SparkContext

# Exercise 1c: analyze data with map, filter, reduce and aggregate.


def _add_value(acc, value):
    """Fold one element into a running (sum, count) pair."""
    return (acc[0] + value, acc[1] + 1)


def _merge_pairs(acc1, acc2):
    """Combine two partial (sum, count) pairs."""
    return (acc1[0] + acc2[0], acc1[1] + acc2[1])


# Spin up a SparkContext with master "local" for the operations demo
sc = SparkContext(master="local", appName="RDDOperations")

# The local data source is the integers 1 through 5
data = list(range(1, 6))
rdd = sc.parallelize(data)

# map: square every element
squared_rdd = rdd.map(lambda n: n * n)

# filter: keep only the squares that are strictly greater than 10
filtered_rdd = squared_rdd.filter(lambda sq: sq > 10)

# reduce: add up the filtered squares
sum_of_squared = filtered_rdd.reduce(lambda left, right: left + right)

# aggregate: gather (sum, count) over the filtered squares, then divide
# to obtain their mean
total, count = filtered_rdd.aggregate((0, 0), _add_value, _merge_pairs)
average = total / count

# Bring the results back to the driver and display them
all_squares = squared_rdd.collect()
print(all_squares)  # [1, 4, 9, 16, 25]

large_squares = filtered_rdd.collect()
print(large_squares)  # [16, 25]

print(sum_of_squared)  # 41
print(average)  # 20.5

# Release the context now that the example is done
sc.stop()


In [None]:
#2

from pyspark.sql import SparkSession

# Exercise 2a: load a CSV file into a Spark DataFrame and display it.

# Obtain a SparkSession registered under the app name "DataFrameExample"
spark = (
    SparkSession.builder
    .appName("DataFrameExample")
    .getOrCreate()
)

# Read the CSV; the first row supplies the column names and Spark is
# asked to infer the column types
csv_path = "path/to/csv/file.csv"
df = spark.read.csv(csv_path, inferSchema=True, header=True)

# Display the DataFrame's rows
df.show()

# Release the session now that the example is done
spark.stop()




from pyspark.sql import SparkSession

# Exercise 2b: filter, group and join Spark DataFrames.

# Create a SparkSession
spark = SparkSession.builder.appName("DataFrameOperations").getOrCreate()

# Load the primary CSV file into a DataFrame
df = spark.read.csv("path/to/csv/file.csv", header=True, inferSchema=True)

# Load the second DataFrame used on the right-hand side of the join.
# The join below needs this to exist; the path is a placeholder and the
# file is expected to share an "id" column with the primary CSV.
other_df = spark.read.csv("path/to/other/file.csv", header=True, inferSchema=True)

# Perform DataFrame operations
filtered_df = df.filter(df["age"] > 30)  # Keep rows where age is greater than 30
grouped_df = df.groupBy("gender").count()  # Count rows per gender value
joined_df = df.join(other_df, "id", "inner")  # Inner join with other_df on the "id" column

# Show the results
filtered_df.show()
grouped_df.show()
joined_df.show()

# Terminate the SparkSession
spark.stop()





from pyspark.sql import SparkSession

# Exercise 2c: query a DataFrame with Spark SQL.

# Obtain a SparkSession registered under the app name "SparkSQLExample"
spark = (
    SparkSession.builder
    .appName("SparkSQLExample")
    .getOrCreate()
)

# Read the CSV; the first row supplies the column names and Spark is
# asked to infer the column types
csv_path = "path/to/csv/file.csv"
df = spark.read.csv(csv_path, inferSchema=True, header=True)

# Register the DataFrame under the name "people" so SQL text can refer to it
df.createOrReplaceTempView("people")

# Select every row whose age exceeds 30
query = "SELECT * FROM people WHERE age > 30"
result = spark.sql(query)

# Display the query output
result.show()

# Release the session now that the example is done
spark.stop()


In [None]:
#3
from pyspark.streaming import StreamingContext
from pyspark import SparkContext

# Exercise 3a/3b: a Spark Streaming word count fed by a socket source.

# Create a SparkContext; "local[2]" is the master setting used for this demo
sc = SparkContext("local[2]", "StreamingExample")

# Create a StreamingContext with batch interval of 1 second
ssc = StreamingContext(sc, 1)

try:
    # Configure the application to consume data from a streaming source (e.g., Kafka or a socket)
    lines = ssc.socketTextStream("localhost", 9999)  # Example with socket stream, replace with your source

    # Split each line into words, pair every word with 1, and sum the
    # pairs per word within each batch
    word_counts = (
        lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
    )

    # Print the word counts
    word_counts.pprint()

    # Start the streaming computation
    ssc.start()

    # Block until the computation is stopped or fails
    ssc.awaitTermination()
finally:
    # Always shut down the streaming context and its SparkContext, even
    # after an interrupt or error, so a later SparkContext can be created
    # in the same process.
    ssc.stop(stopSparkContext=True)



from pyspark.streaming import StreamingContext
from pyspark import SparkContext

# Exercise 3c: streaming transformations — count words on lines that mention "error".

# Create a SparkContext; "local[2]" is the master setting used for this demo
sc = SparkContext("local[2]", "StreamingTransformations")

# Create a StreamingContext with batch interval of 1 second
ssc = StreamingContext(sc, 1)

try:
    # Configure the application to consume data from a streaming source (e.g., Kafka or a socket)
    lines = ssc.socketTextStream("localhost", 9999)  # Example with socket stream, replace with your source

    # Keep only lines containing "error" (case-insensitive match)
    filtered_lines = lines.filter(lambda line: "error" in line.lower())

    # Split each remaining line into words, pair every word with 1, and
    # sum the pairs per word within each batch
    word_counts = (
        filtered_lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
    )

    # Print the word counts
    word_counts.pprint()

    # Start the streaming computation
    ssc.start()

    # Block until the computation is stopped or fails
    ssc.awaitTermination()
finally:
    # Always shut down the streaming context and its SparkContext, even
    # after an interrupt or error, so a later SparkContext can be created
    # in the same process.
    ssc.stop(stopSparkContext=True)





In [None]:
#4
from pyspark.sql import SparkSession

# Exercise 4a: read a relational database table into a Spark DataFrame over JDBC.

# Connection settings for the PostgreSQL database (placeholder values)
jdbc_url = "jdbc:postgresql://localhost:5432/mydatabase"
db_properties = {
    "driver": "org.postgresql.Driver",
    "user": "username",
    "password": "password",
}

# Obtain a SparkSession registered under the app name "SparkSQLDatabase"
spark = (
    SparkSession.builder
    .appName("SparkSQLDatabase")
    .getOrCreate()
)

# Pull the table "mytable" into a DataFrame through the JDBC reader
df = spark.read.jdbc(jdbc_url, "mytable", properties=db_properties)

# Display the DataFrame's rows
df.show()

# Release the session now that the example is done
spark.stop()



from pyspark.sql import SparkSession

# Exercise 4b: run Spark SQL against data loaded from a relational database.

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQLDatabaseOperations").getOrCreate()

# Connect Spark with a relational database (placeholder connection values)
jdbc_url = "jdbc:postgresql://localhost:5432/mydatabase"
db_properties = {
    "driver": "org.postgresql.Driver",
    "user": "username",
    "password": "password"
}

# Load data from a database table into a DataFrame
df = spark.read.jdbc(url=jdbc_url, table="mytable", properties=db_properties)

# Register the DataFrame as a temporary view. spark.sql() resolves table
# names against Spark's catalog, not the remote database, so the query
# below needs this registration to find "mytable".
df.createOrReplaceTempView("mytable")

# Perform SQL operations on the DataFrame
result = spark.sql("SELECT * FROM mytable WHERE age > 30")

# Show the results
result.show()

# Terminate the SparkSession
spark.stop()
