# TASK-1


## 1. PySpark Setup & Initialization
### Exercise 1.1
Set up Spark:

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a session that runs locally on every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("BotCampus Intermediate Session")
    .getOrCreate()
)

### Exercise 1.2
Load starter data:

In [2]:
# Starter rows: one (name, city, age) tuple per person.
columns = ["name", "city", "age"]
data = [
    ("Ananya", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
]
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



## 2. RDDs & Transformations
### Exercise 2.1
Create RDD from feedback:

In [9]:
# One feedback sentence per RDD element.
feedback = spark.sparkContext.parallelize([
    "Ravi from Bangalore loved the mobile app",
    "Meena from Delhi reported poor response time",
    "Ajay from Pune liked the delivery speed",
    "Ananya from Hyderabad had an issue with UI",
    "Rohit from Mumbai gave positive feedback"
])
# Count total number of words.
# split() with no argument splits on any run of whitespace, so doubled,
# leading or trailing spaces never yield empty strings that would be counted
# as words (split(" ") does). This also matches the tokenisation used by the
# other word-level cells in this section.
num_words = feedback.flatMap(lambda line: line.split()).count()
print("Total number of words:", num_words)



Total number of words: 35


In [22]:
# Find top 3 most common words.
# Tokenise each line (lower-cased), pair every token with 1, then sum per token.
word_counts = (
    feedback
    .flatMap(lambda text: text.lower().split())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)
# takeOrdered sorts ascending, so negating the count puts the largest counts first.
topwords = word_counts.takeOrdered(3, key=lambda pair: -pair[1])
print(topwords)



[('from', 5), ('the', 2), ('loved', 1)]


In [16]:
# Remove stop words (from, with, the, etc.).
stop_words = {"a", "an", "and", "from", "had", "is", "the", "with"}
filtered_words = (
    feedback
    .flatMap(lambda text: text.lower().split())
    .filter(lambda token: token not in stop_words)
)
print(filtered_words.collect())



['ravi', 'bangalore', 'loved', 'mobile', 'app', 'meena', 'delhi', 'reported', 'poor', 'response', 'time', 'ajay', 'pune', 'liked', 'delivery', 'speed', 'ananya', 'hyderabad', 'issue', 'ui', 'rohit', 'mumbai', 'gave', 'positive', 'feedback']


In [20]:
# Create a dictionary of word → count
# Same per-word aggregation as the top-3 cell, but collected to the driver as a dict.
word_counts = (
    feedback
    .flatMap(lambda text: text.lower().split())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .collectAsMap()
)
print(word_counts)




{'from': 5, 'loved': 1, 'app': 1, 'poor': 1, 'response': 1, 'liked': 1, 'speed': 1, 'ananya': 1, 'an': 1, 'issue': 1, 'with': 1, 'rohit': 1, 'mumbai': 1, 'positive': 1, 'feedback': 1, 'ravi': 1, 'bangalore': 1, 'the': 2, 'mobile': 1, 'meena': 1, 'delhi': 1, 'reported': 1, 'time': 1, 'ajay': 1, 'pune': 1, 'delivery': 1, 'hyderabad': 1, 'had': 1, 'ui': 1, 'gave': 1}


## 3. DataFrames – Transformations
### Exercise 3.1
Create exam_scores DataFrame:

In [23]:
from pyspark.sql import SparkSession
# Reuse the `spark` session created in Exercise 1.1. Calling
# SparkSession.builder...getOrCreate() again only hands back that same running
# session, so the second builder (and its "sample" app name) had no effect.

# One (name, subject, score) tuple per exam result.
scores = [
    ("Ravi", "Math", 88),
    ("Ananya", "Science", 92),
    ("Kavya", "English", 79),
    ("Ravi", "English", 67),
    ("Neha", "Math", 94),
    ("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)
df_scores.show()


+------+-------+-----+
|  name|subject|score|
+------+-------+-----+
|  Ravi|   Math|   88|
|Ananya|Science|   92|
| Kavya|English|   79|
|  Ravi|English|   67|
|  Neha|   Math|   94|
| Meena|Science|   85|
+------+-------+-----+



In [25]:
# Add grade column ( >=90 → A, 80-89 → B, 70-79 → C, else D).
from pyspark.sql.functions import when, col

# when() branches are tested top to bottom and the first match wins, so each
# branch only needs its lower bound. Closed integer ranges such as
# "80 <= score <= 89" leave gaps: a fractional score like 89.5 would match no
# band and fall through to "D".
df_grades = df_scores.withColumn(
    "grade",
    when(col("score") >= 90, "A")
    .when(col("score") >= 80, "B")
    .when(col("score") >= 70, "C")
    .otherwise("D"),
)
df_grades.show()



+------+-------+-----+-----+
|  name|subject|score|grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



In [26]:
# Group by subject, find average score.
# The dict form of agg() produces the same "avg(score)" column as .avg("score").
df_avg = df_grades.groupBy("subject").agg({"score": "avg"})
df_avg.show()


+-------+----------+
|subject|avg(score)|
+-------+----------+
|Science|      88.5|
|   Math|      91.0|
|English|      73.0|
+-------+----------+



In [27]:
# Use when and otherwise to classify subject difficulty (Math/Science = Difficult).
# The rule is defined on the subject itself, so test the subject name.
# Classifying by "average score >= 80" only matched by coincidence for this
# data set and would flip as soon as the averages moved.
df_difficulty = df_avg.withColumn(
    "difficulty",
    when(col("subject").isin("Math", "Science"), "Difficult").otherwise("Easy"),
)
df_difficulty.show()


+-------+----------+----------+
|subject|avg(score)|difficulty|
+-------+----------+----------+
|Science|      88.5| Difficult|
|   Math|      91.0| Difficult|
|English|      73.0|      Easy|
+-------+----------+----------+



In [30]:
# Rank students per subject using Window function.
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# One window per subject, ordered from the highest score to the lowest.
windowSpec = (
    Window
    .partitionBy("subject")
    .orderBy(col("score").desc())
)
# row_number() numbers the rows 1, 2, ... inside each subject window.
df_ranked = df_grades.select("*", row_number().over(windowSpec).alias("rank"))
df_ranked.show()


+------+-------+-----+-----+----+
|  name|subject|score|grade|rank|
+------+-------+-----+-----+----+
| Kavya|English|   79|    C|   1|
|  Ravi|English|   67|    D|   2|
|  Neha|   Math|   94|    A|   1|
|  Ravi|   Math|   88|    B|   2|
|Ananya|Science|   92|    A|   1|
| Meena|Science|   85|    B|   2|
+------+-------+-----+-----+----+



In [31]:
#Apply UDF to format names (e.g., make all uppercase).
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def to_upper(name):
    """Return ``name`` converted to upper case.

    ``None`` is passed through unchanged so that a missing (NULL) name does
    not raise ``AttributeError`` when this function is applied row by row.

    Args:
        name: The string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``name`` is ``None``.
    """
    if name is None:
        return None
    return name.upper()
# Wrap the plain Python function as a Spark UDF that returns a string column.
upper_udf = udf(to_upper, returnType=StringType())
# Keep every ranked column and append the upper-cased name.
df_final = df_ranked.select("*", upper_udf(col("name")).alias("name_upper"))
df_final.show()


+------+-------+-----+-----+----+----------+
|  name|subject|score|grade|rank|name_upper|
+------+-------+-----+-----+----+----------+
| Kavya|English|   79|    C|   1|     KAVYA|
|  Ravi|English|   67|    D|   2|      RAVI|
|  Neha|   Math|   94|    A|   1|      NEHA|
|  Ravi|   Math|   88|    B|   2|      RAVI|
|Ananya|Science|   92|    A|   1|    ANANYA|
| Meena|Science|   85|    B|   2|     MEENA|
+------+-------+-----+-----+----+----------+



## 4. Ingest CSV & JSON – Save to Parquet

In [33]:
from google.colab import files
# Upload a file through Colab; the recorded output shows students.csv being
# saved to the working directory, which is where the load cell reads it from.
uploaded = files.upload()

Saving students.csv to students.csv


In [35]:
# Second upload; the recorded output shows employee_nested.json being saved.
# Note that this rebinds `uploaded`, replacing the result of the previous upload.
uploaded = files.upload()

Saving employee_nested.json to employee_nested.json


In [36]:
# Load both datasets into PySpark.
# Print schema and infer nested structure.

# CSV: the first line is the header; column types are inferred from the data.
students_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("students.csv")
)
students_df.printSchema()
students_df.show()

# JSON: multiLine lets a single record span several lines of the file.
employee_df = spark.read.option("multiLine", True).json("employee_nested.json")
employee_df.printSchema()
employee_df.show(truncate=False)




root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------+---+-----+---------------+
|address         |id |name |skills         |
+----------------+---+-----+---------------+
|{Mumbai, 400001}|101|Sneha|[Python, Spark]|
+----------------+---+-----+---------------+



In [40]:

# Flatten the JSON (use explode, select, alias).
from pyspark.sql.functions import col, explode, explode_outer

# Struct fields are pulled up with dotted paths; the skills array becomes one
# row per skill. explode_outer is used instead of explode so that an employee
# whose skills array is null or empty is kept (with skill = NULL) rather than
# silently dropped from the flattened result.
flattened_employee = employee_df.select(
    col("id"),
    col("name"),
    col("address.city").alias("city"),
    col("address.pincode").alias("pincode"),
    explode_outer(col("skills")).alias("skill"),
)
flattened_employee.show()



+---+-----+------+-------+------+
| id| name|  city|pincode| skill|
+---+-----+------+-------+------+
|101|Sneha|Mumbai| 400001|Python|
|101|Sneha|Mumbai| 400001| Spark|
+---+-----+------+-------+------+



In [41]:
# Convert both to Parquet and write to /tmp/output .
# "overwrite" replaces any earlier output so the cell can be re-run.
students_df.write.parquet("/tmp/output/students.parquet", mode="overwrite")
flattened_employee.write.parquet("/tmp/output/employee.parquet", mode="overwrite")

## 5. Spark SQL – Temp Views & Queries
### Exercise 5.1
Create view from exam scores and run:

In [49]:
# Expose df_scores to Spark SQL under the view name "exam_scores"; the queries
# in the following cells select from this view.
df_scores.createOrReplaceTempView("exam_scores")

In [52]:
# a) Top scorer per subject
# RANK() restarts at 1 inside every subject; keeping rank = 1 keeps the best score(s).
top_scorer_sql = "SELECT name, subject, score FROM (SELECT *, RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rank FROM exam_scores) WHERE rank = 1"
spark.sql(top_scorer_sql).show()

# c) Students with multiple subjects
multi_subject_sql = "SELECT name, COUNT(DISTINCT subject) as subject_count FROM exam_scores GROUP BY name HAVING COUNT(DISTINCT subject) > 1"
spark.sql(multi_subject_sql).show()

# d) Subjects with average score above 85
high_average_sql = "SELECT subject, AVG(score) as average_score FROM exam_scores GROUP BY subject HAVING AVG(score) > 85"
spark.sql(high_average_sql).show()

+------+-------+-----+
|  name|subject|score|
+------+-------+-----+
| Kavya|English|   79|
|  Neha|   Math|   94|
|Ananya|Science|   92|
+------+-------+-----+

+----+-------------+
|name|subject_count|
+----+-------------+
|Ravi|            2|
+----+-------------+

+-------+-------------+
|subject|average_score|
+-------+-------------+
|Science|         88.5|
|   Math|         91.0|
+-------+-------------+



In [55]:
# b) Count of students per grade
# The grade bands follow the rule stated for the grade column in Exercise 3.1
# (>=90 A, >=80 B, >=70 C, else D) so the SQL and DataFrame results agree.
# The grade is computed in a subquery and grouped in the outer query, which
# avoids depending on GROUP BY resolving a select-list alias.
grade_count_sql = """
    SELECT grade, COUNT(*) AS student_count
    FROM (
        SELECT CASE
                   WHEN score >= 90 THEN 'A'
                   WHEN score >= 80 THEN 'B'
                   WHEN score >= 70 THEN 'C'
                   ELSE 'D'
               END AS grade
        FROM exam_scores
    )
    GROUP BY grade
"""
spark.sql(grade_count_sql).show()


+-----+-------------+
|grade|student_count|
+-----+-------------+
|    B|            2|
|    C|            1|
|    A|            2|
|    D|            1|
+-----+-------------+



### Exercise 5.2
Create another DataFrame attendance(name, days_present)

In [56]:
# One (name, days_present) pair per student.
attendance_data = [("Ravi", 18), ("Ananya", 22), ("Kavya", 19), ("Neha", 25), ("Meena", 15)]
df_attendance = spark.createDataFrame(attendance_data, schema=["name", "days_present"])


In [62]:
# Left join on name: every graded row is kept, with days_present attached where a match exists.
df_joined = df_grades.join(df_attendance, "name", "left")

In [63]:
from pyspark.sql.functions import when, col,expr

# Rows with fewer than 20 days present drop one letter grade; all other rows
# (including rows where the comparison is NULL) keep their grade.
low_attendance = col("days_present") < 20
one_grade_lower = (
    when(col("grade") == "A", "B")
    .when(col("grade") == "B", "C")
    .when(col("grade") == "C", "D")
    .when(col("grade") == "D", "F")
    .otherwise("F")
)
df_final = df_joined.withColumn(
    "adjusted_grade",
    when(low_attendance, one_grade_lower).otherwise(col("grade")),
)
df_final.show()

+------+-------+-----+-----+------------+--------------+
|  name|subject|score|grade|days_present|adjusted_grade|
+------+-------+-----+-----+------------+--------------+
|Ananya|Science|   92|    A|          22|             A|
|  Ravi|   Math|   88|    B|          18|             C|
| Kavya|English|   79|    C|          19|             D|
|  Ravi|English|   67|    D|          18|             F|
|  Neha|   Math|   94|    A|          25|             A|
| Meena|Science|   85|    B|          15|             C|
+------+-------+-----+-----+------------+--------------+




## 6. Partitioned Load (Full + Incremental)

In [64]:
# Full load: one sub-folder per subject under /tmp/scores/.
# Without an explicit mode the writer raises an error when the target path
# already exists, so the cell could only ever be run once. "overwrite" makes
# the full load repeatable and resets the folder before the incremental append.
df_scores.write.mode("overwrite").partitionBy("subject").parquet("/tmp/scores/")

In [65]:
# Incremental load: append one new Math result to the partitioned data set.
incremental = [("Meena", "Math", 93)]
# Build the increment with df_scores' own schema rather than the global
# `columns` list: that name is reassigned in more than one cell, and reusing
# the schema guarantees the appended files match the full load's column names
# and types.
df_inc = spark.createDataFrame(incremental, schema=df_scores.schema)
df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")

In [67]:
# List all folders inside /tmp/scores/
import os
scores_dir_entries = os.listdir("/tmp/scores/")
print(scores_dir_entries)
# Read only Math partition and display all entries.
# The partition folder is loaded directly; as the recorded output shows, the
# result carries only the name and score columns.
math_df = spark.read.format("parquet").load("/tmp/scores/subject=Math")
math_df.show()


['._SUCCESS.crc', 'subject=Science', 'subject=English', '_SUCCESS', 'subject=Math']
+-----+-----+
| name|score|
+-----+-----+
|Meena|   93|
| Neha|   94|
| Ravi|   88|
+-----+-----+



## 7. ETL: Clean, Transform, Load

In [68]:
from google.colab import files
# Upload the ETL input; the recorded output shows employees.csv being saved.
uploaded=files.upload()

Saving employees.csv to employees.csv


In [76]:
# Load data with header.
# Column types are inferred from the file contents.
df_employee = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("employees.csv")
)
df_employee.show()


+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| NULL|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [77]:
# Fill missing bonus with 2000.
# Only the bonus column is touched; nulls in other columns are left as they are.
df_employee = df_employee.na.fill(value=2000, subset=["bonus"])
df_employee.show()


+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| 2000|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



In [78]:
# Calculate total_ctc = salary + bonus .
total_ctc_expr = df_employee["salary"] + df_employee["bonus"]
df_employee = df_employee.withColumn("total_ctc", total_ctc_expr)
df_employee.show()


+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 78000| 5000|    83000|
|     2|Kavya|     HR| 62000| 2000|    64000|
|     3|Sneha|Finance| 55000| 3000|    58000|
+------+-----+-------+------+-----+---------+



In [79]:
# Filter where total_ctc > 60,000.
# Strictly greater than: a total of exactly 60000 is excluded.
df_filtered = df_employee.where(df_employee["total_ctc"] > 60000)
df_filtered.show()


+------+-----+----+------+-----+---------+
|emp_id| name|dept|salary|bonus|total_ctc|
+------+-----+----+------+-----+---------+
|     1|Arjun|  IT| 78000| 5000|    83000|
|     2|Kavya|  HR| 62000| 2000|    64000|
+------+-----+----+------+-----+---------+



In [80]:
# Save final DataFrame to Parquet and JSON.
# Both targets are overwritten so the cell can be re-run.
df_filtered.write.format("parquet").mode("overwrite").save("/tmp/output/employees.parquet")
df_filtered.write.format("json").mode("overwrite").save("/tmp/output/employees.json")