# Module 1: Setup & SparkSession Initialization
 Tasks:

 Install and configure PySpark in your local system or Colab.

 Initialize Spark with:

 Create a DataFrame from:

 Show schema, explain data types, and convert to RDD.

 Print the `.collect()` and `df.rdd.map()` output.

In [1]:
from pyspark.sql import SparkSession
# Build the session for this notebook, or reuse one that already exists
# (getOrCreate). It runs in local mode with the "local[*]" master setting.
spark = (
    SparkSession.builder
    .appName("BotCampus PySpark Practice")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# Column names for the sample records below; each record is (name, city, age).
columns = ["name", "city", "age"]
data = [
    ("Anjali", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
    ("Arjun", "Mumbai", 30),
]

# Only column names are supplied, so Spark infers the column types itself.
df = spark.createDataFrame(data, schema=columns)

In [3]:
# Print the column names, inferred types and nullability of the DataFrame.
df.printSchema()

# The DataFrame's rows, viewed as an RDD of Row objects.
rdd = df.rdd
print("\nRDD:", rdd.collect())

# Sample RDD transformation: reduce each Row to just its "name" field.
print("\nName column from RDD:", rdd.map(lambda record: record["name"]).collect())

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)


RDD: [Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]

Name column from RDD: ['Anjali', 'Ravi', 'Kavya', 'Meena', 'Arjun']


 # Module 2: RDDs & Transformations

 Tasks:

 Split each line into words (`flatMap`).

 Remove stop words (`from`, `the`, etc.).

 Count each word's frequency using `reduceByKey`.

 Find the top 3 most frequent non-stop words.


In [4]:
# Raw customer-feedback sentences, one string per element.
feedback_lines = [
    "Ravi from Bangalore loved the delivery",
    "Meena from Hyderabad had a late order",
    "Ajay from Pune liked the service",
    "Anjali from Delhi faced UI issues",
    "Rohit from Mumbai gave positive feedback",
]

# Distribute the sentences as an RDD through the session's SparkContext.
feedback = spark.sparkContext.parallelize(feedback_lines)


In [16]:
# Lower-case words that should be excluded from the frequency analysis.
stop_words = {"from", "the", "and", "a", "to", "of", "in", "had"}

# Lower-case each sentence, split it on whitespace into individual words
# (flatMap flattens the per-line lists), then drop every stop word.
words_rdd = (
    feedback
    .flatMap(lambda sentence: sentence.lower().split())
    .filter(lambda word: word not in stop_words)
)


In [14]:
# Emit (word, 1) for every occurrence and add the ones up per word,
# giving an RDD of (word, frequency) pairs.
word_counts = (
    words_rdd
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [7]:
# Pick the three (word, count) pairs with the largest counts. takeOrdered
# sorts ascending, so the count is negated to get a descending order.
# NOTE(review): no secondary sort key is given, so which words win a tie on
# count is not pinned down by this code — confirm whether that matters.
top3 = word_counts.takeOrdered(3, key=lambda pair: -pair[1])

print("Top 3 frequent words:", top3)

Top 3 frequent words: [('loved', 1), ('liked', 1), ('service', 1)]


# Module 3: DataFrames & Transformation (With Joins)

In [18]:
from pyspark.sql.functions import col, when

# Student marks, one record per student: (name, section, marks).
columns1 = ["name", "section", "marks"]
students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80),
]

# Attendance, one record per student: (name, days_present).
columns2 = ["name", "days_present"]
attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19),
]

# Build one DataFrame per dataset; column types are inferred from the values.
df_students = spark.createDataFrame(students, schema=columns1)
df_attendance = spark.createDataFrame(attendance, schema=columns2)

Tasks:
 Join both DataFrames on `name`.

 Create a new column: `attendance_rate = days_present / 25`.

 Grade students using `when`:
 A: >90, B: 80–90, C: <80.
 Filter students with good grades but poor attendance (<80%).

In [21]:
# Combine marks and attendance with an inner join on the shared "name"
# column, keeping only students present in both DataFrames.
df_joined = df_students.join(df_attendance, "name", "inner")

df_joined.show()

+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



In [22]:
# Fraction of days attended; 25 is the denominator given in the task text.
# Re-running this cell simply recomputes the same column.
df_joined = df_joined.withColumn(
    "attendance_rate",
    df_joined["days_present"] / 25,
)

df_joined.show()

+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendance_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           0.96|
|Anjali|   10-A|   78|          20|            0.8|
| Kavya|   10-B|   92|          22|           0.88|
| Rohit|   10-B|   85|          25|            1.0|
| Sneha|   10-C|   80|          19|           0.76|
+------+-------+-----+------------+---------------+



In [23]:
# Assign a letter grade from marks: A for > 90, B for 80–90 inclusive,
# C for everything else. The when-branches are evaluated in order.
df_joined = df_joined.withColumn(
    "grade",
    when(col("marks") > 90, "A")
    .when((col("marks") >= 80) & (col("marks") <= 90), "B")
    .otherwise("C")
)
df_joined.show()

# Task: students with good grades but poor attendance (< 80%).
# NOTE(review): "good grades" is taken to mean grade A or B — confirm.
# Relies on the attendance_rate column added in the previous cell.
df_good_grade_poor_attendance = df_joined.filter(
    col("grade").isin("A", "B") & (col("attendance_rate") < 0.8)
)
df_good_grade_poor_attendance.show()

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           0.96|    B|
|Anjali|   10-A|   78|          20|            0.8|    C|
| Kavya|   10-B|   92|          22|           0.88|    A|
| Rohit|   10-B|   85|          25|            1.0|    B|
| Sneha|   10-C|   80|          19|           0.76|    B|
+------+-------+-----+------------+---------------+-----+



# Module 4: Ingest CSV & JSON, Save to Parquet

In [24]:
from pyspark.sql.functions import explode

import json

# Sample CSV content: a header row followed by three employee records.
csv_data = """emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000
"""
# Write with an explicit UTF-8 encoding so the file's bytes do not depend on
# the platform's locale default.
with open("/content/employees.csv", "w", encoding="utf-8") as f:
    f.write(csv_data)

# Sample JSON content: one employee object with a nested "contact" struct
# and a "skills" array (flattened in a later cell).
json_data = {
    "id": 201,
    "name": "Nandini",
    "contact": {"email": "nandi@example.com", "city": "Hyderabad"},
    "skills": ["Python", "Spark", "SQL"]
}
with open("/content/employee.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f)

Tasks:
 Read both formats into DataFrames.

 Flatten nested JSON using `select`, `col`, `alias`, `explode`.

 Save both as Parquet files partitioned by city.

In [25]:
# Load the employee CSV: the first row supplies the column names and the
# column types are inferred from the data.
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/employees.csv")
)

# Load the employee JSON file (a single JSON object) in multiLine mode.
df_json = spark.read.option("multiLine", True).json("/content/employee.json")

In [26]:
# Flatten the nested record: lift the fields of the "contact" struct to
# top-level columns and produce one output row per entry of "skills".
df_flat = df_json.select(
    col("id"),
    col("name"),
    col("contact").getField("email").alias("email"),
    col("contact").getField("city").alias("city"),
    explode(col("skills")).alias("skill"),
)

In [27]:
# Persist both DataFrames as Parquet, one sub-directory per city value.
# "overwrite" replaces any output left behind by an earlier run.
df_csv.write.parquet(
    "/content/output/employees_csv",
    mode="overwrite",
    partitionBy="city",
)
df_flat.write.parquet(
    "/content/output/employees_json",
    mode="overwrite",
    partitionBy="city",
)

# Module 5: Spark SQL with Temp Views
 Tasks:

 Register the `students` DataFrame as `students_view`.

 Write and run the following queries:
 -- a) Average marks per section

 -- b) Top scorer in each section

 -- c) Count of students in each grade category

 -- d) Students with marks above class average

 -- e) Attendance-adjusted performance

In [28]:
# Make the students DataFrame queryable from Spark SQL as "students_view"
# (replaces any existing temp view with that name).
df_students.createOrReplaceTempView("students_view")

# a) Mean marks for each section.
avg_marks_query = """
SELECT section, AVG(marks) AS avg_marks
FROM students_view
GROUP BY section
"""
spark.sql(avg_marks_query).show()


+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-A|     83.5|
|   10-B|     88.5|
|   10-C|     80.0|
+-------+---------+



In [29]:
# b) Highest-scoring student(s) per section. RANK() numbers students inside
# each section by descending marks; students tied for first all get rank 1.
top_scorer_query = """
SELECT section, name, marks
FROM (
    SELECT section, name, marks,
           RANK() OVER (PARTITION BY section ORDER BY marks DESC) as rank
    FROM students_view
) WHERE rank = 1
"""
spark.sql(top_scorer_query).show()

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



In [30]:
# c) Number of students per grade. The grade is derived inline with CASE
# (A: > 90, B: 80 to 90 inclusive, C: otherwise) and used as the group key.
grade_count_query = """
SELECT
    CASE
        WHEN marks > 90 THEN 'A'
        WHEN marks BETWEEN 80 AND 90 THEN 'B'
        ELSE 'C'
    END AS grade,
    COUNT(*) AS student_count
FROM students_view
GROUP BY grade
"""
spark.sql(grade_count_query).show()

+-----+-------------+
|grade|student_count|
+-----+-------------+
|    B|            3|
|    A|            1|
|    C|            1|
+-----+-------------+



In [31]:
# d) Students scoring strictly above the overall average; the average is
# computed over the whole view by a scalar subquery.
above_average_query = """
SELECT name, section, marks
FROM students_view
WHERE marks > (SELECT AVG(marks) FROM students_view)
"""
spark.sql(above_average_query).show()

+-----+-------+-----+
| name|section|marks|
+-----+-------+-----+
| Amit|   10-A|   89|
|Kavya|   10-B|   92|
|Rohit|   10-B|   85|
+-----+-------+-----+



 # Module 6: Partitioned Data & Incremental Loading

In [32]:
# Full load: rewrite the whole dataset, one sub-directory per section.
# Because this cell starts with "overwrite", re-running it does not
# accumulate duplicate incremental rows.
df_students.write.mode("overwrite").partitionBy("section").parquet("/content/output/students/")

# Count section 10-A before the incremental load. This has to happen here,
# between the two writes, to capture the "before" state.
count_10a_before = (
    spark.read.parquet("/content/output/students/")
    .where("section = '10-A'")
    .count()
)

# Incremental load: append one new student to the existing dataset.
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("/content/output/students/")

# Count section 10-A again to complete the task's before/after comparison.
count_10a_after = (
    spark.read.parquet("/content/output/students/")
    .where("section = '10-A'")
    .count()
)

print("Section 10-A count before incremental load:", count_10a_before)
print("Section 10-A count after incremental load:", count_10a_after)


 Tasks:
 List files in `output/students/` using Python.

 Read only partition `10-A` and list students.

 Compare before/after counts for section `10-A`.

In [33]:
import os

# Show the top-level entries of the partitioned output directory, in the
# order the operating system returns them.
students_dir_entries = os.listdir("/content/output/students/")
print("Files in output/students/:", students_dir_entries)


Files in output/students/: ['._SUCCESS.crc', 'section=10-A', 'section=10-B', '_SUCCESS', 'section=10-C']


In [34]:
# Load just the section=10-A partition by pointing the reader at that
# sub-directory. Read this way, the result has only the name and marks
# columns; the section value lives in the directory name.
df_10A = spark.read.format("parquet").load("/content/output/students/section=10-A")
df_10A.show()

+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+



 # Module 7: ETL Pipeline – End to End
  Tasks:
 Load CSV with inferred schema.

 Fill null bonuses with `2000`.

 Create `total_ctc = salary + bonus`.

 Filter employees with `total_ctc > 65000`.

 Save result in:
 JSON format.

 Parquet format partitioned by department.

In [35]:
# Raw employee CSV; the bonus field is left empty for two of the rows so the
# null-handling step has something to fill.
raw_csv = """emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,
"""
# Write with an explicit UTF-8 encoding so the file's bytes do not depend on
# the platform's locale default.
with open("/content/emp_raw.csv", "w", encoding="utf-8") as f:
    f.write(raw_csv)

# Load the CSV: the first row supplies the column names and the column types
# are inferred from the data.
df_emp = spark.read.csv("/content/emp_raw.csv", header=True, inferSchema=True)

In [36]:
# Replace missing bonus values with 2000; other columns are left untouched.
df_emp = df_emp.na.fill({"bonus": 2000})
df_emp.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| 2000|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| 2000|
+------+------+-------+------+-----+



In [37]:
# Total cost-to-company = salary + bonus (bonus nulls were filled earlier).
df_emp = df_emp.withColumn(
    "total_ctc",
    df_emp["salary"] + df_emp["bonus"],
)
df_emp.show()

+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+



In [38]:
# Keep only employees whose total CTC is strictly greater than 65000.
df_filtered = df_emp.where(df_emp["total_ctc"] > 65000)
df_filtered.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



In [39]:
# Save the filtered employees as JSON, replacing any earlier output.
df_filtered.write.json("/content/output/emp_json", mode="overwrite")

In [40]:
# Save the filtered employees as Parquet, one sub-directory per dept value,
# replacing any earlier output.
df_filtered.write.parquet(
    "/content/output/emp_parquet",
    mode="overwrite",
    partitionBy="dept",
)