In [2]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("PySpark_Excercise1") \
    .master("local[*]") \
    .getOrCreate()


In [3]:
data = [
    ("Ananya", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25)
]
columns = ["name", "city", "age"]

df = spark.createDataFrame(data, columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



# RDDs & Transformations

In [4]:

feedback = spark.sparkContext.parallelize([
    "Ravi from Bangalore loved the mobile app",
    "Meena from Delhi reported poor response time",
    "Ajay from Pune liked the delivery speed",
    "Ananya from Hyderabad had an issue with UI",
    "Rohit from Mumbai gave positive feedback"
])

 Tasks:

 Count total number of words.

 Find top 3 most common words.

 Remove stop words (from, with, the,etc.).

 Create a dictionary of word → count.

In [6]:
# 2.1 Count total words
word_count = feedback.flatMap(lambda x: x.split(" ")).count()
print("Total Words:", word_count)

Total Words: 35


In [10]:
# 2.2 Find top 3 most common words
from collections import Counter

# Flatten and collect words into a Python list
words_list = feedback.flatMap(lambda x: x.lower().split()).collect()

# Count top 3
top3 = Counter(words_list).most_common(3)
print("Top 3 Words:", top3)


Top 3 Words: [('from', 5), ('the', 2), ('ravi', 1)]


In [8]:
# 2.3 Remove stop words
stop_words = {"the", "and", "with", "from", "an", "had", "was", "is", "to", "a", "of", "in"}
filtered_words = feedback.flatMap(lambda x: [w for w in x.lower().split() if w not in stop_words])
print("Filtered Words:", filtered_words.collect())

Filtered Words: ['ravi', 'bangalore', 'loved', 'mobile', 'app', 'meena', 'delhi', 'reported', 'poor', 'response', 'time', 'ajay', 'pune', 'liked', 'delivery', 'speed', 'ananya', 'hyderabad', 'issue', 'ui', 'rohit', 'mumbai', 'gave', 'positive', 'feedback']


In [9]:
# 2.4 Dictionary of word → count
word_dict = dict(Counter(filtered_words.collect()))
print("Word Dictionary:", word_dict)

Word Dictionary: {'ravi': 1, 'bangalore': 1, 'loved': 1, 'mobile': 1, 'app': 1, 'meena': 1, 'delhi': 1, 'reported': 1, 'poor': 1, 'response': 1, 'time': 1, 'ajay': 1, 'pune': 1, 'liked': 1, 'delivery': 1, 'speed': 1, 'ananya': 1, 'hyderabad': 1, 'issue': 1, 'ui': 1, 'rohit': 1, 'mumbai': 1, 'gave': 1, 'positive': 1, 'feedback': 1}


# DataFrames – Transformations

In [11]:
from pyspark.sql.functions import when, col, upper
from pyspark.sql import Window
import pyspark.sql.functions as F

scores = [
    ("Ravi", "Math", 88),
    ("Ananya", "Science", 92),
    ("Kavya", "English", 79),
    ("Ravi", "English", 67),
    ("Neha", "Math", 94),
    ("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)

Tasks:
 Add grade column (>=90 → A, 80-89 → B,
70-79 → C, else D).

 Group by subject, find average score.

 Use when and otherwise to classify subject difficulty (Difficult).

 Rank students per subject using Window function.

 Apply UDF to format names (e.g., make all uppercase).

In [12]:
# 3.1 Add Grade Column
df_scores = df_scores.withColumn(
    "grade",
    when(col("score") >= 90, "A")
    .when((col("score") >= 80) & (col("score") <= 89), "B")
    .when((col("score") >= 70) & (col("score") <= 79), "C")
    .otherwise("D")
)

In [13]:
# 3.2 Group by subject - average score
df_scores.groupBy("subject").agg(F.avg("score").alias("avg_score")).show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
|English|     73.0|
+-------+---------+



In [14]:
# 3.3 Classify difficulty
df_scores = df_scores.withColumn(
    "difficulty",
    when(col("subject").isin("Math", "Science"), "Difficult").otherwise("Easy")
)

In [15]:
# 3.4 Rank students per subject
windowSpec = Window.partitionBy("subject").orderBy(col("score").desc())
df_scores = df_scores.withColumn("rank", F.rank().over(windowSpec))

In [16]:
# 3.5 Apply UDF to format names
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

upper_udf = udf(lambda x: x.upper(), StringType())
df_scores = df_scores.withColumn("name_upper", upper_udf(col("name")))

df_scores.show()

+------+-------+-----+-----+----------+----+----------+
|  name|subject|score|grade|difficulty|rank|name_upper|
+------+-------+-----+-----+----------+----+----------+
| Kavya|English|   79|    C|      Easy|   1|     KAVYA|
|  Ravi|English|   67|    D|      Easy|   2|      RAVI|
|  Neha|   Math|   94|    A| Difficult|   1|      NEHA|
|  Ravi|   Math|   88|    B| Difficult|   2|      RAVI|
|Ananya|Science|   92|    A| Difficult|   1|    ANANYA|
| Meena|Science|   85|    B| Difficult|   2|     MEENA|
+------+-------+-----+-----+----------+----+----------+



# Ingest CSV & JSON – Save to Parquet

In [18]:
import pandas as pd


csv_data = """id,name,department,city,salary
1,Amit,IT,Bangalore,78000
2,Kavya,HR,Chennai,62000
3,Arjun,Finance,Hyderabad,55000
"""
with open("/content/students.csv", "w") as f:
    f.write(csv_data)


In [19]:
json_data = [
    {
        "id": 101,
        "name": "Sneha",
        "address": {"city": "Mumbai", "pincode": 400001},
        "skills": ["Python", "Spark"]
    }
]
import json
with open("/content/employee_nested.json", "w") as f:
    json.dump(json_data, f)

 Tasks:

 Load both datasets into PySpark.

 Print schema and infer nested structure.

 Flatten the JSON (use explode, select ,
alias ).

 Convert both to Parquet and write to
/tmp/output .

In [21]:
# Load CSV
df_csv = spark.read.csv("/content/students.csv", header=True, inferSchema=True)
df_csv.show()

# Load JSON
df_json = spark.read.json("/content/employee_nested.json", multiLine=True)
df_json.printSchema()

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [22]:
df_flat = df_json.select(
    "id",
    "name",
    col("address.city").alias("city"),
    col("address.pincode").alias("pincode"),
    F.explode("skills").alias("skill")
)
df_flat.show()

+---+-----+------+-------+------+
| id| name|  city|pincode| skill|
+---+-----+------+-------+------+
|101|Sneha|Mumbai| 400001|Python|
|101|Sneha|Mumbai| 400001| Spark|
+---+-----+------+-------+------+



In [23]:
df_csv.write.mode("overwrite").parquet("/tmp/output/students")
df_flat.write.mode("overwrite").parquet("/tmp/output/employees")

# Spark SQL – Temp Views & Queries

 Exercise 5.1 Create view from exam scores and run:

 -- a) Top scorer per subject

 --b) Count of students per grade

 -- c) Students with multiple subjects

 -- d) Subjects with average score above 85

In [24]:
df_scores.createOrReplaceTempView("exam_scores")


In [25]:
spark.sql("""
SELECT subject, name, MAX(score) AS top_score
FROM exam_scores
GROUP BY subject, name
ORDER BY subject, top_score DESC
""").show()


+-------+------+---------+
|subject|  name|top_score|
+-------+------+---------+
|English| Kavya|       79|
|English|  Ravi|       67|
|   Math|  Neha|       94|
|   Math|  Ravi|       88|
|Science|Ananya|       92|
|Science| Meena|       85|
+-------+------+---------+



In [26]:
spark.sql("""
SELECT grade, COUNT(*) AS student_count
FROM exam_scores
GROUP BY grade
""").show()


+-----+-------------+
|grade|student_count|
+-----+-------------+
|    B|            2|
|    C|            1|
|    A|            2|
|    D|            1|
+-----+-------------+



In [27]:
spark.sql("""
SELECT name, COUNT(subject) AS subject_count
FROM exam_scores
GROUP BY name
HAVING subject_count > 1
""").show()


+----+-------------+
|name|subject_count|
+----+-------------+
|Ravi|            2|
+----+-------------+



In [28]:
spark.sql("""
SELECT subject, AVG(score) AS avg_score
FROM exam_scores
GROUP BY subject
HAVING avg_score > 85
""").show()


+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
+-------+---------+



 Exercise 5.2 Create another DataFrame
attendance(name, days_present) and:

Join with scores

 Calculate attendance-adjusted grade:
 If days_present < 20 → downgrade grade by one level

In [29]:
# New DataFrame for attendance
attendance_data = [("Ravi", 22), ("Ananya", 18), ("Kavya", 25), ("Neha", 21), ("Meena", 19)]
columns = ["name", "days_present"]
df_attendance = spark.createDataFrame(attendance_data, columns)

In [35]:
df_joined = df_scores.join(df_attendance, on="name", how="inner")

df_joined.show()


+------+-------+-----+-----+----------+----+----------+------------+
|  name|subject|score|grade|difficulty|rank|name_upper|days_present|
+------+-------+-----+-----+----------+----+----------+------------+
|Ananya|Science|   92|    A| Difficult|   1|    ANANYA|          18|
|  Ravi|   Math|   88|    B| Difficult|   2|      RAVI|          22|
|  Ravi|English|   67|    D|      Easy|   2|      RAVI|          22|
|  Neha|   Math|   94|    A| Difficult|   1|      NEHA|          21|
| Meena|Science|   85|    B| Difficult|   2|     MEENA|          19|
| Kavya|English|   79|    C|      Easy|   1|     KAVYA|          25|
+------+-------+-----+-----+----------+----+----------+------------+



In [36]:
from pyspark.sql.functions import when, col, udf
from pyspark.sql.types import StringType

grade_map = {
    "A": "B",
    "B": "C",
    "C": "D",
    "D": "D"
}

def downgrade(grade):
    return grade_map.get(grade, grade)

downgrade_udf = udf(lambda g: downgrade(g), StringType())

df_adjusted = df_joined.withColumn(
    "adjusted_grade",
    when(col("days_present") < 20, downgrade_udf(col("grade"))).otherwise(col("grade"))
)

df_adjusted.show()


+------+-------+-----+-----+----------+----+----------+------------+--------------+
|  name|subject|score|grade|difficulty|rank|name_upper|days_present|adjusted_grade|
+------+-------+-----+-----+----------+----+----------+------------+--------------+
|Ananya|Science|   92|    A| Difficult|   1|    ANANYA|          18|             B|
|  Ravi|   Math|   88|    B| Difficult|   2|      RAVI|          22|             B|
|  Ravi|English|   67|    D|      Easy|   2|      RAVI|          22|             D|
|  Neha|   Math|   94|    A| Difficult|   1|      NEHA|          21|             A|
| Meena|Science|   85|    B| Difficult|   2|     MEENA|          19|             C|
| Kavya|English|   79|    C|      Easy|   1|     KAVYA|          25|             C|
+------+-------+-----+-----+----------+----+----------+------------+--------------+



 # Partitioned Load (Full + Incremental)

In [37]:
df_scores.write.partitionBy("subject").parquet("/tmp/scores/")

In [38]:
incremental = [("Meena", "Math", 93)]
columns = ["name", "subject", "score"]
df_inc = spark.createDataFrame(incremental, columns)

df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")


 Task:

 List all folders inside

Read only
/tmp/scores/
 Math partition and display all entries.

In [39]:
# Read only 'Math' partition
df_math = spark.read.parquet("/tmp/scores/subject=Math")
df_math.show()

# List folders/files (for Colab only)
import os
print("Folders under /tmp/scores/:")
print(os.listdir("/tmp/scores"))


+-----+-----+-----+----------+----+----------+
| name|score|grade|difficulty|rank|name_upper|
+-----+-----+-----+----------+----+----------+
| Neha|   94|    A| Difficult|   1|      NEHA|
| Ravi|   88|    B| Difficult|   2|      RAVI|
|Meena|   93| NULL|      NULL|NULL|      NULL|
+-----+-----+-----+----------+----+----------+

Folders under /tmp/scores/:
['._SUCCESS.crc', 'subject=Science', 'subject=English', '_SUCCESS', 'subject=Math']


# ETL: Clean, Transform, Load

Tasks:

 Load data with header.

 Fill missing bonus with 2000.

 Calculate total_ctc = salary + bonus .

 Filter where total_ctc > 60,000.

 Save final DataFrame to Parquet and JSON.

In [40]:
csv_data = """emp_id,name,dept,salary,bonus
1,Arjun,IT,78000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,55000,3000
"""
with open("/content/employees_raw.csv", "w") as f:
    f.write(csv_data)

# Load with header
df_emp = spark.read.csv("/content/employees_raw.csv", header=True, inferSchema=True)
df_emp.show()


+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| NULL|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [41]:
df_emp = df_emp.fillna({"bonus": 2000})


In [44]:
df_emp = df_emp.withColumn("total_ctc", col("salary") + col("bonus"))
df_emp.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 78000| 5000|    83000|
|     2|Kavya|     HR| 62000| 2000|    64000|
|     3|Sneha|Finance| 55000| 3000|    58000|
+------+-----+-------+------+-----+---------+



In [45]:
df_filtered = df_emp.filter(col("total_ctc") > 60000)
df_filtered.show()


+------+-----+----+------+-----+---------+
|emp_id| name|dept|salary|bonus|total_ctc|
+------+-----+----+------+-----+---------+
|     1|Arjun|  IT| 78000| 5000|    83000|
|     2|Kavya|  HR| 62000| 2000|    64000|
+------+-----+----+------+-----+---------+



In [46]:
df_filtered.write.mode("overwrite").parquet("/tmp/emp_final_parquet")
df_filtered.write.mode("overwrite").json("/tmp/emp_final_json")
