# 1. PySpark Setup & Initialization

Exercise 1.1 – Setup Spark:

    Initialize SparkSession with:

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
  .appName("BotCampus Intermediate Session") \
  .master("local[*]") \
  .getOrCreate()

Exercise 1.2 – Load starter data:

In [2]:
data = [("Ananya", "Bangalore", 24),
  ("Ravi", "Hyderabad", 28),
  ("Kavya", "Delhi", 22),
  ("Meena", "Chennai", 25)
]
columns = ["name", "city", "age"]

df = spark.createDataFrame(data, columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



# 2. RDDs & Transformations

Exercise 2.1 – Create RDD from feedback:

In [3]:
feedback = spark.sparkContext.parallelize([
  "Ravi from Bangalore loved the mobile app",
  "Meena from Delhi reported poor response time",
  "Ajay from Pune liked the delivery speed",
  "Ananya from Hyderabad had an issue with UI",
  "Rohit from Mumbai gave positive feedback"
])

Tasks:



    Count total number of words.

    Find top 3 most common words.

    Remove stop words ( from , with , the , etc.).

    Create a dictionary of word → count.



In [4]:
# Count total number of words.
total_words = feedback.flatMap(lambda x: x.split()).count()
print("Total number of words:", total_words)

Total number of words: 35


In [5]:
# Find top 3 most common words.
from operator import add
word_counts = feedback.flatMap(lambda x: x.lower().split()) \
                      .map(lambda word: (word, 1)) \
                      .reduceByKey(add)

top_3 = word_counts.takeOrdered(3, key=lambda x: -x[1])
print("Top 3 words:", top_3)

Top 3 words: [('from', 5), ('the', 2), ('loved', 1)]


In [6]:
# Remove stop words ( from , with , the , etc.).
stop_words = {"from", "with", "the", "an", "had", "of"}
filtered_words = word_counts.filter(lambda x: x[0] not in stop_words)
print("Filtered word counts:", filtered_words.collect())

Filtered word counts: [('loved', 1), ('app', 1), ('poor', 1), ('response', 1), ('liked', 1), ('speed', 1), ('ananya', 1), ('issue', 1), ('rohit', 1), ('mumbai', 1), ('positive', 1), ('feedback', 1), ('ravi', 1), ('bangalore', 1), ('mobile', 1), ('meena', 1), ('delhi', 1), ('reported', 1), ('time', 1), ('ajay', 1), ('pune', 1), ('delivery', 1), ('hyderabad', 1), ('ui', 1), ('gave', 1)]


In [7]:
# Create a dictionary of word → count.
word_dict = dict(filtered_words.collect())
print("Word count dictionary:", word_dict)

Word count dictionary: {'loved': 1, 'app': 1, 'poor': 1, 'response': 1, 'liked': 1, 'speed': 1, 'ananya': 1, 'issue': 1, 'rohit': 1, 'mumbai': 1, 'positive': 1, 'feedback': 1, 'ravi': 1, 'bangalore': 1, 'mobile': 1, 'meena': 1, 'delhi': 1, 'reported': 1, 'time': 1, 'ajay': 1, 'pune': 1, 'delivery': 1, 'hyderabad': 1, 'ui': 1, 'gave': 1}


# 3. DataFrames – Transformations

Exercise 3.1 – Create exam_scores DataFrame:

In [8]:
scores = [
  ("Ravi", "Math", 88),
  ("Ananya", "Science", 92),
  ("Kavya", "English", 79),
  ("Ravi", "English", 67),
  ("Neha", "Math", 94),
  ("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)

Tasks:

    Add grade column ( >=90 → A, 80-89 → B, 70-79 → C, else D).

    Group by subject, find average score.

    Use when and otherwise to classify subject difficulty ( Math/Science =
    Difficult).

    Rank students per subject using Window function.

    Apply UDF to format names (e.g., make all uppercase).

In [9]:
# Add grade column ( >=90 → A, 80-89 → B, 70-79 → C, else D).
from pyspark.sql.functions import when

df_scores = df_scores.withColumn(
    "grade",
    when(df_scores.score >= 90, "A")
    .when(df_scores.score >= 80, "B")
    .when(df_scores.score >= 70, "C")
    .otherwise("D")
)
df_scores.show()

+------+-------+-----+-----+
|  name|subject|score|grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



In [10]:
# Group by subject, find average score.
df_scores.groupBy("subject").avg("score").show()

+-------+----------+
|subject|avg(score)|
+-------+----------+
|Science|      88.5|
|   Math|      91.0|
|English|      73.0|
+-------+----------+



In [11]:
# Use when and otherwise to classify subject difficulty ( Math/Science = Difficult).
df_scores = df_scores.withColumn(
    "difficulty",
    when(df_scores.subject.isin("Math", "Science"), "Difficult").otherwise("Easy")
)
df_scores.show()

+------+-------+-----+-----+----------+
|  name|subject|score|grade|difficulty|
+------+-------+-----+-----+----------+
|  Ravi|   Math|   88|    B| Difficult|
|Ananya|Science|   92|    A| Difficult|
| Kavya|English|   79|    C|      Easy|
|  Ravi|English|   67|    D|      Easy|
|  Neha|   Math|   94|    A| Difficult|
| Meena|Science|   85|    B| Difficult|
+------+-------+-----+-----+----------+



In [12]:
# Rank students per subject using Window function.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

windowSpec = Window.partitionBy("subject").orderBy(df_scores.score.desc())

df_scores = df_scores.withColumn("rank", rank().over(windowSpec))
df_scores.show()

+------+-------+-----+-----+----------+----+
|  name|subject|score|grade|difficulty|rank|
+------+-------+-----+-----+----------+----+
| Kavya|English|   79|    C|      Easy|   1|
|  Ravi|English|   67|    D|      Easy|   2|
|  Neha|   Math|   94|    A| Difficult|   1|
|  Ravi|   Math|   88|    B| Difficult|   2|
|Ananya|Science|   92|    A| Difficult|   1|
| Meena|Science|   85|    B| Difficult|   2|
+------+-------+-----+-----+----------+----+



In [13]:
# Apply UDF to format names (e.g., make all uppercase).
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def upper_case(name):
    return name.upper()

upper_udf = udf(upper_case, StringType())

df_scores = df_scores.withColumn("name_upper", upper_udf(df_scores.name))
df_scores.show()

+------+-------+-----+-----+----------+----+----------+
|  name|subject|score|grade|difficulty|rank|name_upper|
+------+-------+-----+-----+----------+----+----------+
| Kavya|English|   79|    C|      Easy|   1|     KAVYA|
|  Ravi|English|   67|    D|      Easy|   2|      RAVI|
|  Neha|   Math|   94|    A| Difficult|   1|      NEHA|
|  Ravi|   Math|   88|    B| Difficult|   2|      RAVI|
|Ananya|Science|   92|    A| Difficult|   1|    ANANYA|
| Meena|Science|   85|    B| Difficult|   2|     MEENA|
+------+-------+-----+-----+----------+----+----------+



# 4. Ingest CSV & JSON – Save to Parquet

Dataset 1: CSV file: students.csv
Dataset 2: JSON file employee_nested.json

Tasks:

    Load both datasets into PySpark.

    Print schema and infer nested structure.

    Flatten the JSON (use explode , select , alias ).

    Convert both to Parquet and write to /tmp/output .

In [15]:
from google.colab import files
uploaded = files.upload()

Saving employee_nested.json to employee_nested.json
Saving students.csv to students.csv


In [17]:
# Load both datasets into PySpark.
# Print schema and infer nested structure.

# Load CSV
students_df = spark.read.csv("students.csv", header=True, inferSchema=True)
students_df.printSchema()
students_df.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+



In [19]:
# Load JSON
# Print schema and infer nested structure.
emp_df = spark.read.json("employee_nested.json", multiLine=True)
emp_df.printSchema()
emp_df.show(truncate=False)

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------+---+-----+---------------+
|address         |id |name |skills         |
+----------------+---+-----+---------------+
|{Mumbai, 400001}|101|Sneha|[Python, Spark]|
+----------------+---+-----+---------------+



In [21]:
# Flatten the JSON (use explode , select , alias ).
from pyspark.sql.functions import explode, col

flattened = emp_df.select(
    "id",
    "name",
    col("address.city").alias("City"),
    col("address.pincode").alias("Pin-code"),
    explode("skills").alias("Skill")
)
flattened.show()

+---+-----+------+--------+------+
| id| name|  City|Pin-code| Skill|
+---+-----+------+--------+------+
|101|Sneha|Mumbai|  400001|Python|
|101|Sneha|Mumbai|  400001| Spark|
+---+-----+------+--------+------+



In [22]:
# Convert both to Parquet and write to /tmp/output .
students_df.write.mode("overwrite").parquet("/tmp/output/students")
flattened.write.mode("overwrite").parquet("/tmp/output/employees")

In [24]:
# Read students
students_parquet = spark.read.parquet("/tmp/output/students")
students_parquet.show()

# Read employees
employees_parquet = spark.read.parquet("/tmp/output/employees")
employees_parquet.show()

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+

+---+-----+------+--------+------+
| id| name|  City|Pin-code| Skill|
+---+-----+------+--------+------+
|101|Sneha|Mumbai|  400001|Python|
|101|Sneha|Mumbai|  400001| Spark|
+---+-----+------+--------+------+



# 5. Spark SQL – Temp Views & Queries

Exercise 5.1 Create view from exam scores and run:

    a) Top scorer per subject
    b) Count of students per grade
    c) Students with multiple subjects
    d) Subjects with average score above 85

In [25]:
# Create view from exam scores
df_scores.createOrReplaceTempView("exam_scores")

In [27]:
# a) Top scorer per subject
spark.sql("""
SELECT subject, name, MAX(score) as max_score
FROM exam_scores
GROUP BY subject, name
ORDER BY max_score DESC
""").show()

+-------+------+---------+
|subject|  name|max_score|
+-------+------+---------+
|   Math|  Neha|       94|
|Science|Ananya|       92|
|   Math|  Ravi|       88|
|Science| Meena|       85|
|English| Kavya|       79|
|English|  Ravi|       67|
+-------+------+---------+



In [28]:
# b) Count of students per grade
spark.sql("""
SELECT grade, COUNT(*) as student_count
FROM exam_scores
GROUP BY grade
""").show()

+-----+-------------+
|grade|student_count|
+-----+-------------+
|    B|            2|
|    C|            1|
|    A|            2|
|    D|            1|
+-----+-------------+



In [29]:
# c) Students with multiple subjects
spark.sql("""
SELECT name, COUNT(DISTINCT subject) as subject_count
FROM exam_scores
GROUP BY name
HAVING subject_count > 1
""").show()

+----+-------------+
|name|subject_count|
+----+-------------+
|Ravi|            2|
+----+-------------+



In [30]:
# d) Subjects with average score above 85
spark.sql("""
SELECT subject, AVG(score) as avg_score
FROM exam_scores
GROUP BY subject
HAVING avg_score > 85
""").show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
+-------+---------+



Exercise 5.2 Create another DataFrame attendance(name, days_present) and:

    Join with scores
    
    Calculate attendance-adjusted grade:
    If days_present < 20 → downgrade grade by one level

In [31]:
# Create another DataFrame attendance(name, days_present)
attendance = [("Ravi", 18), ("Ananya", 22), ("Kavya", 25), ("Neha", 15), ("Meena", 20)]
attendance_df = spark.createDataFrame(attendance, ["name", "days_present"])

In [32]:
# Join with scores
joined_df = df_scores.join(attendance_df, "name")

In [33]:
# Calculate attendance-adjusted grade:
# If days_present < 20 → downgrade grade by one level

# Downgrade logic
from pyspark.sql.functions import expr

joined_df = joined_df.withColumn(
    "adjusted_grade",
    when(col("days_present") < 20,
         expr("""CASE grade
                 WHEN 'A' THEN 'B'
                 WHEN 'B' THEN 'C'
                 WHEN 'C' THEN 'D'
                 ELSE 'D' END"""))
    .otherwise(col("grade"))
)

joined_df.select("name", "subject", "score", "grade", "days_present", "adjusted_grade").show()

+------+-------+-----+-----+------------+--------------+
|  name|subject|score|grade|days_present|adjusted_grade|
+------+-------+-----+-----+------------+--------------+
|Ananya|Science|   92|    A|          22|             A|
| Kavya|English|   79|    C|          25|             C|
| Meena|Science|   85|    B|          20|             B|
|  Neha|   Math|   94|    A|          15|             B|
|  Ravi|   Math|   88|    B|          18|             C|
|  Ravi|English|   67|    D|          18|             D|
+------+-------+-----+-----+------------+--------------+



# 6. Partitioned Load (Full + Incremental)

In [34]:
# Initial Load:
df_scores.write.partitionBy("subject").parquet("/tmp/scores/", mode="overwrite")

In [35]:
# Incremental Load:
incremental = [("Meena", "Math", 93)]
df_inc = spark.createDataFrame(incremental, columns)
df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")

Task:

    List all folders inside /tmp/scores/

    Read only Math partition and display all entries.

In [39]:
# List all folders inside /tmp/scores/
!ls /tmp/scores/

'subject=English'  'subject=Math'  'subject=Science'   _SUCCESS


In [40]:
# Read only Math partition and display all entries.
math_df = spark.read.parquet("/tmp/scores/subject=Math")
math_df.show()

+-----+-----+-----+----------+----+----------+
| name|score|grade|difficulty|rank|name_upper|
+-----+-----+-----+----------+----+----------+
| Neha|   94|    A| Difficult|   1|      NEHA|
| Ravi|   88|    B| Difficult|   2|      RAVI|
|Meena|   93| NULL|      NULL|NULL|      NULL|
+-----+-----+-----+----------+----+----------+



# 7. ETL: Clean, Transform, Load

In [42]:
from google.colab import files
uploaded = files.upload()

Saving employee_raw.csv to employee_raw.csv


Tasks:

    Load data with header.

    Fill missing bonus with 2000.

    Calculate total_ctc = salary + bonus .

    Filter where total_ctc > 60,000.

    Save final DataFrame to Parquet and JSON.

In [45]:
# Load data with header.
raw_df = spark.read.csv("employee_raw.csv", header=True, inferSchema=True)
raw_df.show()

+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| NULL|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [46]:
# Fill missing bonus with 2000.
from pyspark.sql.functions import coalesce, lit

clean_df = raw_df.withColumn("bonus", coalesce(col("bonus"), lit(2000)))
clean_df.show()

+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| 2000|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [47]:
# Calculate total_ctc = salary + bonus .
clean_df = clean_df.withColumn("total_ctc", col("salary") + col("bonus"))
clean_df.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 78000| 5000|    83000|
|     2|Kavya|     HR| 62000| 2000|    64000|
|     3|Sneha|Finance| 55000| 3000|    58000|
+------+-----+-------+------+-----+---------+



In [48]:
# Filter where total_ctc > 60,000.
filtered_df = clean_df.filter(col("total_ctc") > 60000)
filtered_df.show()

+------+-----+----+------+-----+---------+
|emp_id| name|dept|salary|bonus|total_ctc|
+------+-----+----+------+-----+---------+
|     1|Arjun|  IT| 78000| 5000|    83000|
|     2|Kavya|  HR| 62000| 2000|    64000|
+------+-----+----+------+-----+---------+



In [49]:
# Save final DataFrame to Parquet and JSON.
filtered_df.write.mode("overwrite").parquet("/tmp/clean_employees/parquet")
filtered_df.write.mode("overwrite").json("/tmp/clean_employees/json")

In [54]:
# Read and Show Parquet Files
parquet_df = spark.read.parquet("/tmp/clean_employees/parquet")
parquet_df.toPandas()

Unnamed: 0,emp_id,name,dept,salary,bonus,total_ctc
0,1,Arjun,IT,78000,5000,83000
1,2,Kavya,HR,62000,2000,64000


In [57]:
import os
os.listdir("/tmp/clean_employees/json")

['._SUCCESS.crc',
 '.part-00000-c66136e0-f048-4851-a269-044d3812f61e-c000.json.crc',
 '_SUCCESS',
 'part-00000-c66136e0-f048-4851-a269-044d3812f61e-c000.json']

In [59]:
# Download JSON file
import os
from google.colab import files

# Rename the file
os.rename("/tmp/clean_employees/json/part-00000-c66136e0-f048-4851-a269-044d3812f61e-c000.json",
          "/tmp/cleaned_employees.json")

# Download the renamed file
files.download("/tmp/cleaned_employees.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>