1. Working with RDDs:
   a) Write a Python program to create an RDD from a local data source.




In [None]:
from pyspark import SparkContext

# Create a SparkContext that runs Spark locally in this process
sc = SparkContext("local", "RDD Creation Example")

try:
    # Path to the local data source
    data_file = "path/to/your/local/data.txt"

    # Create an RDD with one element per line of the text file.
    # textFile() is lazy: the file is only read when an action runs.
    rdd = sc.textFile(data_file)

    # collect() is the action that brings every line back to the driver;
    # that is fine for a small demo file, so print the lines one by one.
    for line in rdd.collect():
        print(line)
finally:
    # Always stop the SparkContext, even when reading the file fails.
    # Otherwise the context stays alive and re-running this cell cannot
    # create a new one.
    sc.stop()


   b) Implement transformations and actions on the RDD to perform data processing tasks.
  


In [None]:
from pyspark import SparkContext

# Create a SparkContext that runs Spark locally in this process
sc = SparkContext("local", "RDD Processing Example")

try:
    # Path to the local data source
    data_file = "path/to/your/local/data.txt"

    # Create an RDD from the text file. The same RDD feeds six actions
    # below, so cache it to avoid re-reading the file for each of them.
    rdd = sc.textFile(data_file).cache()

    # --- Transformations (lazy: nothing runs until an action is called) ---

    # Map: upper-case every line
    mapped_rdd = rdd.map(lambda line: line.upper())

    # Filter: keep only the lines that mention 'Spark'
    filtered_rdd = rdd.filter(lambda line: 'Spark' in line)

    # FlatMap: split every line on whitespace into individual words
    words_rdd = rdd.flatMap(lambda line: line.split())

    # --- Actions (trigger the computation) ---

    # Collect: every line, as a Python list on the driver
    all_lines = rdd.collect()

    # Count: number of lines
    line_count = rdd.count()

    # Take: the first five lines
    sample_lines = rdd.take(5)

    # Print the transformed RDDs and computed results
    print("Mapped RDD:")
    for line in mapped_rdd.collect():
        print(line)

    print("Filtered RDD:")
    for line in filtered_rdd.collect():
        print(line)

    print("Words RDD:")
    for word in words_rdd.collect():
        print(word)

    print("All Lines:")
    for line in all_lines:
        print(line)

    print("Line Count:", line_count)

    print("Sample Lines:")
    for line in sample_lines:
        print(line)
finally:
    # Always stop the SparkContext (this also drops the cached RDD), even
    # when one of the steps above fails.
    sc.stop()


 c) Analyze and manipulate data using RDD operations such as map, filter, reduce, or aggregate.

In [None]:
from pyspark import SparkContext

# Create a SparkContext that runs Spark locally in this process
sc = SparkContext("local", "RDD Operations Example")

try:
    # Path to the local data source
    # NOTE(review): the code below assumes comma-separated lines whose third
    # field is an integer age and that the file has no header row - confirm.
    data_file = "path/to/your/local/data.txt"

    # Create an RDD from a local data source
    rdd = sc.textFile(data_file)

    # Transformation: Map - split every line into its comma-separated fields
    mapped_rdd = rdd.map(lambda line: line.split(','))

    # Transformation: Filter - keep the records whose age is above 30
    filtered_rdd = mapped_rdd.filter(lambda record: int(record[2]) > 30)

    # Action: Reduce - sum the ages of the remaining records.
    # reduce() raises ValueError on an empty RDD, so report 0 when no
    # record passed the filter.
    ages_rdd = filtered_rdd.map(lambda record: int(record[2]))
    total_age = 0 if ages_rdd.isEmpty() else ages_rdd.reduce(lambda a, b: a + b)

    # Action: Aggregate - build a (sum of ages, number of records) pair.
    # The first lambda folds one record into a partition-local accumulator,
    # the second merges the accumulators of two partitions.
    avg_age_count = filtered_rdd.aggregate(
        (0, 0),
        lambda acc, record: (acc[0] + int(record[2]), acc[1] + 1),
        lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])
    )
    # Avoid ZeroDivisionError when nothing matched; the average is then None.
    avg_age = avg_age_count[0] / avg_age_count[1] if avg_age_count[1] else None

    # Print the results
    print("Total Age:", total_age)
    print("Average Age:", avg_age)
finally:
    # Always stop the SparkContext, even when a step above fails
    sc.stop()


######################################################

2. Spark DataFrame Operations:
   a) Write a Python program to load a CSV file into a Spark DataFrame.
  

In [None]:
from pyspark.sql import SparkSession

# Location of the CSV file to read
csv_file = "path/to/your/csv/file.csv"

# Build (or reuse) the SparkSession, the entry point for DataFrames
spark = (
    SparkSession.builder
    .appName("Load CSV into DataFrame")
    .getOrCreate()
)

# Read the CSV, using its first row as the column names
df = spark.read.csv(csv_file, header=True)

# Print the first rows of the DataFrame as a table
df.show()

# Release the session's resources
spark.stop()

 b) Perform common DataFrame operations such as filtering, grouping, or joining.


In [None]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("DataFrame Operations").getOrCreate()

try:
    # Path to the CSV file
    # NOTE(review): the operations below assume columns named Age, Country,
    # Salary and ID - confirm against the real file.
    csv_file = "path/to/your/csv/file.csv"

    # Load CSV into DataFrame.
    # inferSchema makes Spark detect numeric columns; without it every
    # column is a string and groupBy(...).avg("Salary") is rejected because
    # "Salary" is not a numeric column.
    df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csv_file)
    )

    # Filter rows based on a condition
    filtered_df = df.filter(df["Age"] > 30)

    # Group by a column and calculate the average of another column
    grouped_df = df.groupBy("Country").avg("Salary")

    # Join two DataFrames on their shared "ID" column.
    # The second file is loaded with the same options so that "ID" gets the
    # same inferred type on both sides of the join.
    another_csv_file = "path/to/another/csv/file.csv"
    another_df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(another_csv_file)
    )
    joined_df = df.join(another_df, on="ID", how="inner")

    # Display the DataFrames
    print("Filtered DataFrame:")
    filtered_df.show()

    print("Grouped DataFrame:")
    grouped_df.show()

    print("Joined DataFrame:")
    joined_df.show()
finally:
    # Always stop the SparkSession, even when a step above fails
    spark.stop()


  c) Apply Spark SQL queries on the DataFrame to extract insights from the data.



In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession
spark = SparkSession.builder.appName("Spark SQL Queries").getOrCreate()

# Read the CSV (first row = column names) and expose it to SQL as "my_table"
csv_file = "path/to/your/csv/file.csv"
df = spark.read.csv(csv_file, header=True)
df.createOrReplaceTempView("my_table")

# Each entry pairs the heading to print with the SQL text to run
labelled_queries = [
    (
        "Query 1: Total Count",
        "SELECT COUNT(*) AS TotalCount FROM my_table",
    ),
    (
        "Query 2: Average Salary by Country",
        "SELECT Country, AVG(Salary) AS AverageSalary FROM my_table GROUP BY Country",
    ),
    (
        "Query 3: Records where Age > 30",
        "SELECT * FROM my_table WHERE Age > 30",
    ),
]

# Turn every query into a DataFrame first, then display them in order
result_frames = [spark.sql(sql_text) for _, sql_text in labelled_queries]

for (heading, _), frame in zip(labelled_queries, result_frames):
    print(heading)
    frame.show()

# Release the session's resources
spark.stop()


###################################################

3. Spark Streaming:
  a) Write a Python program to create a Spark Streaming application.
  


In [None]:

from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

# Create a SparkSession
spark = SparkSession.builder.appName("Spark Streaming Application").getOrCreate()

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(spark.sparkContext, 1)

# Set the log level to only display errors
spark.sparkContext.setLogLevel("ERROR")

# Create a DStream by connecting to a TCP socket
# (for a quick test, `nc -lk 9999` can serve as the text source)
host = "localhost"
port = 9999
lines = ssc.socketTextStream(host, port)

# Count the words of every micro-batch.
# split() without an argument splits on any run of whitespace, so repeated
# spaces do not produce empty-string "words" that would be counted.
word_counts = lines.flatMap(lambda line: line.split()) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)

# Print the first word counts of every batch
word_counts.pprint()

# Start the streaming computation
ssc.start()

try:
    # Block until the stream is stopped or fails
    ssc.awaitTermination()
finally:
    # Also runs on Ctrl-C / kernel interrupt: let in-flight batches finish,
    # then shut down the streaming context and the underlying SparkContext.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)


 b) Configure the application to consume data from a streaming source (e.g., Kafka or a socket).
  


In [None]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Create a SparkSession
spark = SparkSession.builder.appName("Spark Streaming Application").getOrCreate()

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(spark.sparkContext, 1)

# Set the log level to only display errors
spark.sparkContext.setLogLevel("ERROR")

# Kafka configuration
# NOTE(review): pyspark.streaming.kafka belongs to the DStream Kafka 0.8
# integration, which was removed in Spark 3.0 - confirm this runs on
# Spark 2.x with the spark-streaming-kafka-0-8 package available.
#
# The topic list is kept separate from the consumer properties:
# "subscribe" is an option of the Structured Streaming Kafka source, not a
# Kafka consumer property, so it does not belong in kafkaParams.
kafka_topics = ["my_topic"]
kafka_params = {
    "bootstrap.servers": "localhost:9092",
}

# Create a DStream by consuming data from Kafka
kafka_stream = KafkaUtils.createDirectStream(ssc, topics=kafka_topics, kafkaParams=kafka_params)

# Each Kafka record arrives as a (key, value) pair; keep only the value
lines = kafka_stream.map(lambda x: x[1])

# Count the words of every micro-batch.
# split() without an argument splits on any run of whitespace, so repeated
# spaces do not produce empty-string "words" that would be counted.
word_counts = lines.flatMap(lambda line: line.split()) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)

# Print the first word counts of every batch
word_counts.pprint()

# Start the streaming computation
ssc.start()

try:
    # Block until the stream is stopped or fails
    ssc.awaitTermination()
finally:
    # Also runs on Ctrl-C / kernel interrupt: let in-flight batches finish,
    # then shut down the streaming context and the underlying SparkContext.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)


 c) Implement streaming transformations and actions to process and analyze the incoming data stream.

In [None]:
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Create a SparkSession
spark = SparkSession.builder.appName("Streaming Transformations and Actions").getOrCreate()

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(spark.sparkContext, 1)

# Set the log level to only display errors
spark.sparkContext.setLogLevel("ERROR")

# Kafka configuration
# NOTE(review): pyspark.streaming.kafka belongs to the DStream Kafka 0.8
# integration, which was removed in Spark 3.0 - confirm this runs on
# Spark 2.x with the spark-streaming-kafka-0-8 package available.
#
# The topic list is kept separate from the consumer properties:
# "subscribe" is an option of the Structured Streaming Kafka source, not a
# Kafka consumer property, so it does not belong in kafkaParams.
kafka_topics = ["my_topic"]
kafka_params = {
    "bootstrap.servers": "localhost:9092",
}

# Create a DStream by consuming data from Kafka
kafka_stream = KafkaUtils.createDirectStream(ssc, topics=kafka_topics, kafkaParams=kafka_params)

# Each Kafka record arrives as a (key, value) pair; keep only the value
lines = kafka_stream.map(lambda x: x[1])

# Streaming transformations: per-batch word count.
# split() without an argument splits on any run of whitespace, so repeated
# spaces do not produce empty-string "words" that would be counted.
word_counts = lines.flatMap(lambda line: line.split()) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)

# Process each RDD in the stream
def process_rdd(time, rdd):
    """Print the word counts held by one batch of the stream.

    Args:
        time: Value printed after "RDD processing time:".
        rdd: Object offering ``isEmpty()`` and ``collect()``; ``collect()``
            must yield ``(word, count)`` pairs.

    Nothing is printed when ``rdd.isEmpty()`` is true.
    """
    # Guard clause: stay silent for batches without data
    if rdd.isEmpty():
        return

    print("RDD processing time:", time)
    print("Word counts:")
    for token, occurrences in rdd.collect():
        print(f"{token}: {occurrences}")
    # Blank line separates the output of consecutive batches
    print()

# Apply the process_rdd function to each RDD in the DStream
word_counts.foreachRDD(process_rdd)

# Start the streaming computation
ssc.start()

try:
    # Block until the stream is stopped or fails
    ssc.awaitTermination()
finally:
    # Also runs on Ctrl-C / kernel interrupt: let in-flight batches finish,
    # then shut down the streaming context and the underlying SparkContext.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)


###############################################

4. Spark SQL and Data Source Integration:
   a) Write a Python program to connect Spark with a relational database (e.g., MySQL, PostgreSQL).


In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession
spark = SparkSession.builder.appName("Spark Database Connection").getOrCreate()

# Everything the JDBC source needs: where the database is, which table to
# read, and the credentials to log in with
jdbc_options = {
    "url": "jdbc:mysql://localhost:3306/mydatabase",
    "dbtable": "mytable",
    "user": "myuser",
    "password": "mypassword",
}

# Load the table as a DataFrame through the JDBC data source
df = spark.read.format("jdbc").options(**jdbc_options).load()

# Print the first rows of the table
df.show()

# Release the session's resources
spark.stop()


  b) Perform SQL operations on the data stored in the database using Spark SQL.
   

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession
spark = SparkSession.builder.appName("Spark SQL Operations").getOrCreate()

# Where the database lives and how to log in
db_url = "jdbc:mysql://localhost:3306/mydatabase"
db_table = "mytable"
credentials = {"user": "myuser", "password": "mypassword"}

# Load the database table and make it queryable from SQL as "my_table"
table_df = spark.read.jdbc(db_url, db_table, properties=credentials)
table_df.createOrReplaceTempView("my_table")

# Headings to print, in display order
headings = (
    "Query 1: Filtered Data",
    "Query 2: Total Count",
    "Query 3: Average Salary by Country",
)

# SQL statements, in the same order as the headings
statements = (
    "SELECT * FROM my_table WHERE age > 30",
    "SELECT COUNT(*) AS total_count FROM my_table",
    "SELECT country, AVG(salary) AS average_salary FROM my_table GROUP BY country",
)

# Turn every statement into a DataFrame first ...
frames = [spark.sql(statement) for statement in statements]

# ... then display each result under its heading
for heading, frame in zip(headings, frames):
    print(heading)
    frame.show()

# Release the session's resources
spark.stop()


c) Explore the integration capabilities of Spark with other data sources, such as Hadoop Distributed File System (HDFS) or Amazon S3.

In [None]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("Spark Data Integration").getOrCreate()

try:
    # Read a Parquet dataset from HDFS
    hdfs_file = "hdfs://localhost:9000/path/to/data.parquet"
    df = spark.read.format("parquet").load(hdfs_file)

    # Write the same data to Amazon S3 through the s3a:// connector.
    # The target path is derived from s3_bucket so the bucket name is
    # defined in exactly one place.
    # NOTE(review): no save mode is set, so the write is expected to fail
    # if the target path already exists - confirm that is intended.
    s3_bucket = "my-s3-bucket"
    s3_path = f"s3a://{s3_bucket}/path/to/data.parquet"
    df.write.format("parquet").save(s3_path)
finally:
    # Always stop the SparkSession, even when the read or write fails
    spark.stop()
