In [None]:
from pyspark import SparkContext

# Create a SparkContext
sc = SparkContext(appName="RDDExample")

# Everything that uses the context runs inside try/finally so the context is
# always stopped, even when a step raises. Without this, a failed run leaves a
# live SparkContext in the kernel and re-running the cell fails because only
# one SparkContext may be active at a time.
try:
    # Create an RDD from a local data source (a list)
    data = [1, 2, 3, 4, 5]
    rdd = sc.parallelize(data)

    # Apply transformations and actions on the RDD
    # Example transformations: map, filter
    # squared_rdd feeds three separate actions below (reduce, the filtered
    # collect, and its own collect); cache() lets Spark reuse the computed
    # partitions instead of re-running the map for each action.
    squared_rdd = rdd.map(lambda x: x**2).cache()
    filtered_rdd = squared_rdd.filter(lambda x: x > 10)

    # Example actions: reduce, collect
    sum_of_squares = squared_rdd.reduce(lambda x, y: x + y)
    collected_data = filtered_rdd.collect()

    # Analyze and manipulate data using RDD operations
    print("Squared RDD:", squared_rdd.collect())
    print("Filtered RDD:", collected_data)
    print("Sum of squares:", sum_of_squares)
finally:
    # Close the SparkContext
    sc.stop()


In [None]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("DataFrameExample").getOrCreate()

# All work happens inside try/finally so the session is stopped even if a
# read or query fails (e.g. a missing CSV file or column).
try:
    # Load a CSV file into a Spark DataFrame
    df = spark.read.csv("path/to/your/csv/file.csv", header=True, inferSchema=True)

    # Perform common DataFrame operations
    # Example filtering
    filtered_df = df.filter(df["age"] > 30)

    # Example grouping and aggregation
    grouped_df = df.groupBy("gender").agg({"salary": "avg"})

    # Example joining
    other_df = spark.read.csv("path/to/another/csv/file.csv", header=True, inferSchema=True)
    # Joining on the column *name* keeps a single "id" column in the result.
    # Joining on the expression df["id"] == other_df["id"] would keep both
    # "id" columns, making any later reference to "id" ambiguous.
    joined_df = df.join(other_df, on="id", how="inner")

    # Apply Spark SQL queries on the DataFrame
    df.createOrReplaceTempView("people")
    sql_query = "SELECT * FROM people WHERE age > 30"
    sql_result = spark.sql(sql_query)

    # Display the DataFrame and SQL results
    df.show()
    filtered_df.show()
    grouped_df.show()
    joined_df.show()
    sql_result.show()
finally:
    # Stop the SparkSession
    spark.stop()


In [None]:
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# NOTE: the DStream-based Kafka integration (pyspark.streaming.kafka.KafkaUtils)
# imported by this cell is only available in Spark 2.x; it is not shipped with
# Spark 3.x, where Kafka is consumed through Structured Streaming instead.

# Imported here so this cell also works when run on its own.
from pyspark import SparkContext

# The streaming context needs an underlying SparkContext and a batch interval.
# Neither was defined before, which made this cell fail with a NameError.
sparkContext = SparkContext.getOrCreate()
batchDuration = 10  # Batch interval in seconds; tune for your workload

# Create a Spark Streaming Context
spark_streaming_context = StreamingContext(sparkContext, batchDuration)

# try/finally guarantees the streaming context (and its SparkContext) is shut
# down when the stream fails or the kernel is interrupted.
try:
    # Configure the application to consume data from a streaming source (e.g., Kafka or a socket)
    kafka_params = {
        'bootstrap.servers': 'localhost:9092',  # Replace with your Kafka broker addresses
        'group.id': 'your_consumer_group'  # Replace with your consumer group ID
    }

    # Create a DStream by consuming from a Kafka topic
    kafka_topic = 'your_topic'  # Replace with the desired topic name
    kafka_stream = KafkaUtils.createDirectStream(
        spark_streaming_context, [kafka_topic], kafka_params
    )

    # Implement streaming transformations and actions to process and analyze the incoming data stream
    # Example streaming transformations: map, filter
    # Each Kafka record arrives as a (key, value) pair; message[1] is the value.
    processed_stream = (
        kafka_stream
        .map(lambda message: message[1].upper())
        .filter(lambda word: word.startswith('A'))
    )

    # Example streaming actions: count
    word_count = processed_stream.count()

    # Print the results
    word_count.pprint()

    # Start the streaming context
    spark_streaming_context.start()

    # Wait for the streaming to finish
    spark_streaming_context.awaitTermination()
finally:
    # Release the streaming context together with the underlying SparkContext
    spark_streaming_context.stop(stopSparkContext=True)


In [None]:
from pyspark.sql import SparkSession

# Imported here so this cell also works when run on its own.
import os

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQLExample").getOrCreate()

# All work happens inside try/finally so the session is stopped even if a
# connection or read fails.
try:
    # Connect Spark with a relational database (e.g., MySQL, PostgreSQL)
    jdbc_url = "jdbc:mysql://localhost:3306/your_database"  # Replace with your database connection URL
    # Credentials are read from the environment so real secrets never have to
    # be written into (and committed with) the notebook. The placeholder
    # defaults are used when DB_USER / DB_PASSWORD are not set.
    connection_properties = {
        "user": os.environ.get("DB_USER", "your_username"),
        "password": os.environ.get("DB_PASSWORD", "your_password"),
        # Replace with the appropriate JDBC driver.
        # NOTE(review): newer MySQL Connector/J releases name the driver class
        # "com.mysql.cj.jdbc.Driver" - confirm against the connector version in use.
        "driver": "com.mysql.jdbc.Driver"
    }

    # Read data from a database table using Spark SQL
    table_name = "your_table"  # Replace with the name of your database table
    df = spark.read.format("jdbc").options(
        url=jdbc_url,
        dbtable=table_name,
        **connection_properties
    ).load()

    # Perform SQL operations on the data stored in the database using Spark SQL
    df.createOrReplaceTempView("my_table")  # Create a temporary view for the DataFrame
    sql_query = "SELECT * FROM my_table WHERE age > 30"  # Replace with your SQL query
    sql_result = spark.sql(sql_query)

    # Display the SQL result
    sql_result.show()

    # Explore Spark's integration capabilities with other data sources, such as HDFS or Amazon S3
    hdfs_path = "hdfs://localhost:9000/your_hdfs_file"  # Replace with your HDFS file path
    df_hdfs = spark.read.text(hdfs_path)
    df_hdfs.show()

    # NOTE(review): outside Amazon EMR the Hadoop S3 connector is usually
    # addressed with the "s3a://" scheme - confirm which scheme your cluster expects.
    s3_path = "s3://your_bucket/your_s3_file"  # Replace with your Amazon S3 file path
    df_s3 = spark.read.text(s3_path)
    df_s3.show()
finally:
    # Stop the SparkSession
    spark.stop()
