In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Student-and-courses').getOrCreate()

# Student data
students_data = [
    (1, "Rahul Sharma", 20, "Bangalore"),
    (2, "Priya Singh", 21, "Delhi"),
    (3, "Aman Kumar", 19, "Hyderabad"),
    (4, "Sneha Reddy", 22, "Chennai"),
    (5, "Arjun Mehta", 23, "Mumbai"),
    (6, "Divya Nair", 20, None)   # Student without city
]

students_cols = ["student_id", "name", "age", "city"]
students_df = spark.createDataFrame(students_data, students_cols)

# Course data
courses_data = [
    (101, "Python", "Programming"),
    (102, "Data Science", "Analytics"),
    (103, "Databases", "Technology"),
    (104, "Business Studies", "Management")
]

courses_cols = ["course_id", "course_name", "category"]
courses_df = spark.createDataFrame(courses_data, courses_cols)

# Enrollment data
enrollment_data = [
    (1, 101, "A"),
    (2, 101, "B"),
    (3, 102, "A"),
    (4, 103, "C"),
    (5, 102, "B"),
    (7, 104, "A")   # Enrollment with non-existent student
]

enrollment_cols = ["student_id", "course_id", "grade"]
enrollment_df = spark.createDataFrame(enrollment_data, enrollment_cols)

# Show all DataFrames
students_df.show()
courses_df.show()
enrollment_df.show()


+----------+------------+---+---------+
|student_id|        name|age|     city|
+----------+------------+---+---------+
|         1|Rahul Sharma| 20|Bangalore|
|         2| Priya Singh| 21|    Delhi|
|         3|  Aman Kumar| 19|Hyderabad|
|         4| Sneha Reddy| 22|  Chennai|
|         5| Arjun Mehta| 23|   Mumbai|
|         6|  Divya Nair| 20|     NULL|
+----------+------------+---+---------+

+---------+----------------+-----------+
|course_id|     course_name|   category|
+---------+----------------+-----------+
|      101|          Python|Programming|
|      102|    Data Science|  Analytics|
|      103|       Databases| Technology|
|      104|Business Studies| Management|
+---------+----------------+-----------+

+----------+---------+-----+
|student_id|course_id|grade|
+----------+---------+-----+
|         1|      101|    A|
|         2|      101|    B|
|         3|      102|    A|
|         4|      103|    C|
|         5|      102|    B|
|         7|      104|    A|
+--------

In [3]:
# 1. Select all student names and their cities
students_df.select("name", "city").show()

# 2. Find students who are older than 20
students_df.filter(students_df.age > 20).show()

# 3. List all courses under the "Analytics" category
courses_df.filter(courses_df.category == "Analytics").show()


+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|     NULL|
+------------+---------+

+----------+-----------+---+-------+
|student_id|       name|age|   city|
+----------+-----------+---+-------+
|         2|Priya Singh| 21|  Delhi|
|         4|Sneha Reddy| 22|Chennai|
|         5|Arjun Mehta| 23| Mumbai|
+----------+-----------+---+-------+

+---------+------------+---------+
|course_id| course_name| category|
+---------+------------+---------+
|      102|Data Science|Analytics|
+---------+------------+---------+



In [4]:
# 1. Count how many students are enrolled in each course
enrollment_df.groupBy("course_id").count().show()

# 2. Find the average age of students per city
students_df.groupBy("city").avg("age").show()

# 3. Get the maximum and minimum age of students
students_df.agg({"age": "max", "age": "min"}).show()


+---------+-----+
|course_id|count|
+---------+-----+
|      101|    2|
|      102|    2|
|      103|    1|
|      104|    1|
+---------+-----+

+---------+--------+
|     city|avg(age)|
+---------+--------+
|Bangalore|    20.0|
|    Delhi|    21.0|
|Hyderabad|    19.0|
|  Chennai|    22.0|
|     NULL|    20.0|
|   Mumbai|    23.0|
+---------+--------+

+--------+
|min(age)|
+--------+
|      19|
+--------+



In [5]:
# 1. Join students with enrollments to see which student took which course
students_df.join(enrollment_df, "student_id", "inner").show()

# 2. Left join enrollments with courses to get course details
enrollment_df.join(courses_df, "course_id", "left").show()

# 3. Find students who are not enrolled in any course
students_df.join(enrollment_df, "student_id", "left_anti").show()

# 4. Find courses with no students enrolled
courses_df.join(enrollment_df, "course_id", "left_anti").show()


+----------+------------+---+---------+---------+-----+
|student_id|        name|age|     city|course_id|grade|
+----------+------------+---+---------+---------+-----+
|         1|Rahul Sharma| 20|Bangalore|      101|    A|
|         2| Priya Singh| 21|    Delhi|      101|    B|
|         3|  Aman Kumar| 19|Hyderabad|      102|    A|
|         4| Sneha Reddy| 22|  Chennai|      103|    C|
|         5| Arjun Mehta| 23|   Mumbai|      102|    B|
+----------+------------+---+---------+---------+-----+

+---------+----------+-----+----------------+-----------+
|course_id|student_id|grade|     course_name|   category|
+---------+----------+-----+----------------+-----------+
|      101|         1|    A|          Python|Programming|
|      101|         2|    B|          Python|Programming|
|      102|         3|    A|    Data Science|  Analytics|
|      103|         4|    C|       Databases| Technology|
|      104|         7|    A|Business Studies| Management|
|      102|         5|    B|   

In [6]:
students_df.createOrReplaceTempView("students")
courses_df.createOrReplaceTempView("courses")
enrollment_df.createOrReplaceTempView("enrollments")


In [7]:
# 1. Get all students with their course names and grades
spark.sql("""
SELECT s.name, c.course_name, e.grade
FROM students s
JOIN enrollments e ON s.student_id = e.student_id
JOIN courses c ON e.course_id = c.course_id
""").show()

# 2. Find the number of students who got grade "A" in each course
spark.sql("""
SELECT c.course_name, COUNT(*) as num_A_students
FROM enrollments e
JOIN courses c ON e.course_id = c.course_id
WHERE e.grade = 'A'
GROUP BY c.course_name
""").show()

# 3. Find the top city with the most students enrolled in courses
spark.sql("""
SELECT s.city, COUNT(*) as total_enrolled
FROM students s
JOIN enrollments e ON s.student_id = e.student_id
GROUP BY s.city
ORDER BY total_enrolled DESC
LIMIT 1
""").show()


+------------+------------+-----+
|        name| course_name|grade|
+------------+------------+-----+
| Priya Singh|      Python|    B|
|Rahul Sharma|      Python|    A|
| Arjun Mehta|Data Science|    B|
|  Aman Kumar|Data Science|    A|
| Sneha Reddy|   Databases|    C|
+------------+------------+-----+

+----------------+--------------+
|     course_name|num_A_students|
+----------------+--------------+
|Business Studies|             1|
|          Python|             1|
|    Data Science|             1|
+----------------+--------------+

+---------+--------------+
|     city|total_enrolled|
+---------+--------------+
|Bangalore|             1|
+---------+--------------+

