<a href="https://colab.research.google.com/github/Suryan5h/Apache-Spark/blob/main/Spark_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SPARK SESSION

In [2]:
## Initializing PySpark

In [1]:
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook; getOrCreate() hands back an
# existing session if one is already active instead of starting a new one.
spark = (
    SparkSession.builder
    .appName("PySpark Basics")
    .getOrCreate()
)

print("Spark Session Created!")

Spark Session Created!


In [5]:
# Bare last-line expression: the notebook renders the session object's repr.
spark

In [6]:
## Creating a DataFrame
# Column headers for the sample records
columns = ["Name", "Age"]

# Sample records, one (name, age) tuple per row
data = [
    ("Alice", 29),
    ("Bob", 35),
    ("Cathy", 22),
]

# Build the DataFrame from the records and headers, then print it as a table
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 29|
|  Bob| 35|
|Cathy| 22|
+-----+---+



In [7]:
## Performing Transformations (Map and Filter)
# Distribute the integers 1..5 as an RDD
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])

# map: replace every element with its square
mapped_rdd = rdd.map(lambda n: n ** 2)

# filter: keep only the squares that exceed 10
filtered_rdd = mapped_rdd.filter(lambda sq: sq > 10)

# collect() returns the surviving elements as a Python list for printing
squares_over_ten = filtered_rdd.collect()
print(squares_over_ten)

[16, 25]


In [8]:
## Word Count Code
# Create an RDD from a list of sentences
sentences = ["hello world", "hello PySpark", "hello everyone"]
rdd = spark.sparkContext.parallelize(sentences)

# FlatMap: split each sentence into words.
# split() with no argument splits on any run of whitespace and ignores
# leading/trailing whitespace, so doubled spaces or tabs never produce
# empty-string "words" (split(" ") would emit '' and it would be counted).
words_rdd = rdd.flatMap(lambda sentence: sentence.split())

# Map: pair every word with an initial count of 1
word_pairs_rdd = words_rdd.map(lambda word: (word, 1))

# ReduceByKey: add up the counts of identical words
word_count_rdd = word_pairs_rdd.reduceByKey(lambda a, b: a + b)

# Collect and display results
print("Word Counts:", word_count_rdd.collect())

Word Counts: [('world', 1), ('PySpark', 1), ('hello', 3), ('everyone', 1)]


In [10]:
# Inspect the flattened list of words produced by the flatMap step
words_rdd.collect()

['hello', 'world', 'hello', 'PySpark', 'hello', 'everyone']

In [11]:
# Inspect the (word, 1) pairs produced by the map step
word_pairs_rdd.collect()

[('hello', 1),
 ('world', 1),
 ('hello', 1),
 ('PySpark', 1),
 ('hello', 1),
 ('everyone', 1)]

In [12]:
# Inspect the per-word totals produced by the reduceByKey step
word_count_rdd.collect()

[('world', 1), ('PySpark', 1), ('hello', 3), ('everyone', 1)]

In [13]:
## Partitioning and Repartitioning
# Spread the integers 1..100 over 4 slices and report the partition count
rdd = spark.sparkContext.parallelize(range(1, 101), numSlices=4)
initial_partitions = rdd.getNumPartitions()
print("Initial Partitions:", initial_partitions)

# Ask for 6 partitions and report the new count
repartitioned_rdd = rdd.repartition(6)
final_partitions = repartitioned_rdd.getNumPartitions()
print("After Repartition:", final_partitions)

Initial Partitions: 4
After Repartition: 6


In [14]:
## Using SQL Queries on DataFrames
# Column headers and sample rows for the demo table
columns = ["Name", "Age"]
data = [
    ("Alice", 29),
    ("Bob", 35),
    ("Cathy", 22),
]

# Build the DataFrame and register it as the temporary view "people"
df = spark.createDataFrame(data, schema=columns)
df.createOrReplaceTempView("people")

# Select the rows of the view whose Age exceeds 30 and print them
result = spark.sql("SELECT Name, Age FROM people WHERE Age > 30")
result.show()

+----+---+
|Name|Age|
+----+---+
| Bob| 35|
+----+---+

