# Module 1: Setup & SparkSession Initialization

In [1]:
# Download Spark
!wget https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz


--2025-08-05 09:17:30--  https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400395283 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.0-bin-hadoop3.tgz’


2025-08-05 10:01:18 (149 KB/s) - ‘spark-3.5.0-bin-hadoop3.tgz’ saved [400395283/400395283]



In [74]:
# Extract Spark
!tar -xzf spark-3.5.0-bin-hadoop3.tgz

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("BotCampus PySpark Practice") \
    .master("local[*]") \
    .getOrCreate()

Create Dataframe

In [5]:
data = [
    ("Anjali", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
    ("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, columns)
df.show()


+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+



 Show schema, explain data types, and convert to RDD.

In [75]:
#Schema and datatypes
df.printSchema()

# conversion to RDD
rdd = df.rdd
print("RDD content:", rdd.collect())


root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- bonus: integer (nullable = false)
 |-- total_ctc: integer (nullable = true)

RDD content: [Row(emp_id=1, name='Arjun', dept='IT', salary=75000, bonus=5000, total_ctc=80000), Row(emp_id=2, name='Kavya', dept='HR', salary=62000, bonus=2000, total_ctc=64000), Row(emp_id=3, name='Sneha', dept='Finance', salary=68000, bonus=4000, total_ctc=72000), Row(emp_id=4, name='Ramesh', dept='Sales', salary=58000, bonus=2000, total_ctc=60000)]


Print .collect() and df.rdd.map() output.

In [7]:
print(df.rdd.collect())  # full data
print(df.rdd.map(lambda x: (x.name, x.city)).collect())  # mapped example


[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]
[('Anjali', 'Bangalore'), ('Ravi', 'Hyderabad'), ('Kavya', 'Delhi'), ('Meena', 'Chennai'), ('Arjun', 'Mumbai')]


# Module 2: RDDs & Transformations

In [8]:
feedback = spark.sparkContext.parallelize([
    "Ravi from Bangalore loved the delivery",
    "Meena from Hyderabad had a late order",
    "Ajay from Pune liked the service",
    "Anjali from Delhi faced UI issues",
    "Rohit from Mumbai gave positive feedback"
])

Task 1:  Split each line into words (
 flatMap ).

In [9]:
words = feedback.flatMap(lambda x: x.lower().split())

2.  Remove stop words (from , the ,etc.).

In [13]:
stopwords = {"From", "the", "a", "an", "had"}
filtered = words.filter(lambda x: x not in stopwords)

3. Count each word frequency using reduceByKey.

In [14]:
word_pairs = filtered.map(lambda word: (word,1))
word_counts = word_pairs.reduceByKey(lambda a, b: a+b)
word_counts.collect()

[('from', 5),
 ('loved', 1),
 ('liked', 1),
 ('service', 1),
 ('anjali', 1),
 ('faced', 1),
 ('issues', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('delivery', 1),
 ('meena', 1),
 ('hyderabad', 1),
 ('late', 1),
 ('order', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delhi', 1),
 ('ui', 1),
 ('gave', 1)]

4. Find top 3 most frequent non- stop words

In [16]:
top3 = word_counts.takeOrdered(3, key=lambda x: -x[-1])
print("Top 3 frequent words:", top3)

Top 3 frequent words: [('from', 5), ('loved', 1), ('liked', 1)]


# Module 3: DataFrames & Transformation (With Joins)


In [18]:
students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80)
]
attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19)
]
columns1= ["name", "section", "marks"]
columns2= ["name", "days_present"]

df_students = spark.createDataFrame(students, columns1)
df_attendance = spark.createDataFrame(attendance, columns2)

Tak 1: Join both DataFrames on
name .

In [73]:
df_joined = df_students.join(df_attendance, "name")
df_joined.show()


+------+-------+-----+-----+------------+
|  name|section|marks|grade|days_present|
+------+-------+-----+-----+------------+
|  Amit|   10-A|   89|    B|          24|
|Anjali|   10-A|   78|    C|          20|
| Kavya|   10-B|   92|    A|          22|
| Rohit|   10-B|   85|    B|          25|
| Sneha|   10-C|   80|    B|          19|
+------+-------+-----+-----+------------+



 2. Create a new column:
attendance_rate = days_present / 25 .

In [77]:
df_joined = df_joined.withColumn("attendance_rate", col("days_present")/25)
df_joined.select("name", "attendance_rate", "days_present").show()

+------+---------------+------------+
|  name|attendance_rate|days_present|
+------+---------------+------------+
|  Amit|           0.96|          24|
|Anjali|            0.8|          20|
| Kavya|           0.88|          22|
| Rohit|            1.0|          25|
| Sneha|           0.76|          19|
+------+---------------+------------+



3.  Grade students using
when :
 A: >90, B: 80–90, C: <80.

In [72]:
from pyspark.sql.functions import when, col

df_joined = df_joined.withColumn("grade",
            when(col("marks") > 90, "A")
            .when(col("marks") >=80, "B")
            .otherwise("C"))
df_joined.show()

+------+-------+-----+------------+-----+--------------+---------------+
|  name|section|marks|days_present|grade|attendace_rate|attendance_rate|
+------+-------+-----+------------+-----+--------------+---------------+
|  Amit|   10-A|   89|          24|    B|          0.96|           0.96|
|Anjali|   10-A|   78|          20|    C|           0.8|            0.8|
| Kavya|   10-B|   92|          22|    A|          0.88|           0.88|
| Rohit|   10-B|   85|          25|    B|           1.0|            1.0|
| Sneha|   10-C|   80|          19|    B|          0.76|           0.76|
+------+-------+-----+------------+-----+--------------+---------------+



4.  Filter students with good grades but poor attendance (<80%).

In [33]:
df_joined.filter((col("grade").isin("A", "B")) &(col("attendance_rate")<0.8)).show()

+-----+-------+-----+------------+-----+--------------+---------------+
| name|section|marks|days_present|grade|attendace_rate|attendance_rate|
+-----+-------+-----+------------+-----+--------------+---------------+
|Sneha|   10-C|   80|          19|    B|          0.76|           0.76|
+-----+-------+-----+------------+-----+--------------+---------------+



# Module 4: Ingest CSV & JSON, Save to Parquet

In [29]:
csv_content = """emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000"""
with open("employees.csv", "w") as f:
    f.write(csv_content)



In [30]:
json_content = """
{
  "id": 201,
  "name": "Nandini",
  "contact": {
    "email": "nandi@example.com",
    "city": "Hyderabad"
  },
  "skills": ["Python", "Spark", "SQL"]
}
"""
with open("employee.json", "w") as f:
    f.write(json_content)


Task 1: Read both formats into DataFrames.


In [36]:
df_csv = spark.read.csv("employees.csv", header=True, inferSchema=True)
df_csv.show()
df_json = spark.read.json("employee.json", multiLine=True)
df_json.show(truncate=False)

+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+

+------------------------------+---+-------+--------------------+
|contact                       |id |name   |skills              |
+------------------------------+---+-------+--------------------+
|{Hyderabad, nandi@example.com}|201|Nandini|[Python, Spark, SQL]|
+------------------------------+---+-------+--------------------+



2. Flatten nested JSON using
select ,
col ,
alias ,
explode

In [78]:
from pyspark.sql.functions import explode

df_json_flat = df_json.select(
    "id", "name",
    col("contact.email").alias("email"),
    col("contact.city").alias("city"),
    explode("skills").alias("skill"))
df_json_flat.show(truncate=False)

+---+-------+-----------------+---------+------+
|id |name   |email            |city     |skill |
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad|Spark |
|201|Nandini|nandi@example.com|Hyderabad|SQL   |
+---+-------+-----------------+---------+------+



3. Save both as Parquet files partitioned by city.

In [79]:
df_csv.write.mode("overwrite").partitionBy("city").parquet("/tmp/employees_csv")
df_json_flat.write.mode("overwrite").partitionBy("city").parquet("/tmp/employees_json")


# Module 5: Spark SQL with Temp Views

In [42]:
df_students.createOrReplaceTempView("students_view")

 Task 1: Average marks per section

In [43]:

spark.sql("select section, AVG(marks) as avg_marks from students_view group by section").show()

+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-A|     83.5|
|   10-B|     88.5|
|   10-C|     80.0|
+-------+---------+



 2. Top scorer in each section

In [48]:
spark.sql("""
      select section, name, marks from(
        select *, rank() over(partition by section order by marks desc) as rnk
        from students_view)
        where rnk = 1
        """).show()

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



3. Count of students in each grade category

In [49]:
df_students = df_students.withColumn("grade",
                when(col("marks") > 90, "A")
                .when(col("marks") >=80, "B")
                .otherwise("C"))
df_students.createOrReplaceTempView("graded_students")
spark.sql("select grade, count(*) as count from graded_students group by grade")

DataFrame[grade: string, count: bigint]

 4. Students with marks above class average

In [50]:
spark.sql("""
  select * from students_view
  where marks>(select avg(marks) from students_view)""").show()

+-----+-------+-----+
| name|section|marks|
+-----+-------+-----+
| Amit|   10-A|   89|
|Kavya|   10-B|   92|
|Rohit|   10-B|   85|
+-----+-------+-----+



 5. Attendance-adjusted performance

In [51]:
df_combined = df_students.join(df_attendance, "name")
df_combined = df_combined.withColumn("adjusted_grade",
    when(col("days_present") < 20, "D").otherwise(col("grade"))
)
df_combined.select("name", "section", "grade", "days_present", "adjusted_grade").show()


+------+-------+-----+------------+--------------+
|  name|section|grade|days_present|adjusted_grade|
+------+-------+-----+------------+--------------+
|  Amit|   10-A|    B|          24|             B|
|Anjali|   10-A|    C|          20|             C|
| Kavya|   10-B|    A|          22|             A|
| Rohit|   10-B|    B|          25|             B|
| Sneha|   10-C|    B|          19|             D|
+------+-------+-----+------------+--------------+



# Module 6: Partitioned Data & Incremental Loading

In [66]:
# Full Load
df_students.write.partitionBy("section").parquet("output/students/")


In [70]:
#Incremental Load
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")


Task 1: List files in
output/students/ using Python.

In [67]:
import os
print("Partitions:", os.listdir("output/students/"))



Partitions: ['._SUCCESS.crc', 'section=10-A', 'section=10-B', '_SUCCESS', 'section=10-C']


2.  Read only partition
10-A and list students.

In [68]:
df_10a = spark.read.parquet("output/students/section=10-A")
df_10a.show()
print("Total students in 10-A:", df_10a.count())


+------+-----+-----+
|  name|marks|grade|
+------+-----+-----+
|Anjali|   78|    C|
|  Amit|   89|    B|
+------+-----+-----+

Total students in 10-A: 2


3.  Compare before/after counts for section
10-A .

In [69]:
before_count = df_10a.count()
print("Before incremental load:", before_count)


Before incremental load: 2


In [71]:
# after incrementing the load
df_10a_updated = spark.read.parquet("output/students/section=10-A")
after_count = df_10a_updated.count()
print("After incremental load:", after_count)


After incremental load: 3


# Module 7: ETL Pipeline – End to End

In [55]:
csv_data = """emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,"""
with open("emp_data.csv", "w") as f:
    f.write(csv_data)


Task 1:  Load CSV with inferred schema.

In [56]:
df = spark.read.csv("emp_data.csv", header=True, inferSchema=True)

2. Fill null bonuses with
2000 .

In [57]:
df = df.fillna({"bonus": 2000})

3. Create
total_ctc = salary + bonus .

In [58]:
df = df.withColumn("total_ctc", col("salary") + col("bonus"))

4.  Filter employees with
total_ctc > 65000

In [59]:
df_final = df.filter(col("total_ctc") > 65000)

5.  Saving result in JSON and
 Parquet format partitioned by department

In [80]:
df_final.write.mode("overwrite").json("/tmp/final_emps_json")
df_final.write.mode("overwrite").partitionBy("dept").parquet("/tmp/final_emps_parquet")