In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Obtain the notebook's Spark session: getOrCreate() returns the already
# active session when there is one, otherwise it builds a new one named "Read".
spark = (
    SparkSession.builder
    .appName("Read")
    .getOrCreate()
)

In [0]:

# Explicit schema for the student rows; every column is declared nullable.
schema_student = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('score', IntegerType(), True)
    .add('courseId', IntegerType(), True)
)

# Literal sample rows, in schema order: (id, name, score, courseId).
Student = [
    (1, 'Alice', 95, 1),
    (2, 'Bob', 85, 1),
    (3, 'Charlie', 90, 2),
    (4, 'Dave', 70, 1),
    (5, 'Eve', 88, 2),
    (6, 'Frank', 80, 2),
    (7, 'Grace', 92, 2),
]

# Build the DataFrame from the literal rows and render it in the notebook.
df_student = spark.createDataFrame(Student, schema_student)
df_student.display()

# Explicit schema for the course lookup rows; both columns are declared nullable.
schema_course = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
)

# Literal sample rows, in schema order: (id, name).
Course = [
    (1, 'Math'),
    (2, 'Science'),
]

# Build the DataFrame from the literal rows and render it in the notebook.
df_course = spark.createDataFrame(Course, schema_course)
df_course.display()


id,name,score,courseId
1,Alice,95,1
2,Bob,85,1
3,Charlie,90,2
4,Dave,70,1
5,Eve,88,2
6,Frank,80,2
7,Grace,92,2


id,name
1,Math
2,Science


In [0]:
# Expose both DataFrames to Spark SQL under the names 'student' and 'course'.
# createOrReplaceTempView is used instead of createTempView so this cell can be
# re-run in the same session: createTempView raises an error when a temp view
# with the same name is already registered.
df_student.createOrReplaceTempView('student')
df_course.createOrReplaceTempView('course')

We need to rank the students within each course by score in descending order (the highest score gets rank 1).
We will then fetch the top 2 students in each course.

In [0]:
# Spark SQL version: rank students within each course by score (highest first)
# and keep only ranks 1 and 2.
#
# - DENSE_RANK assigns tied scores the same rank with no gaps, so a tie inside
#   the top two ranks returns more than two rows for that course.
# - The rank alias is `score_rank` rather than `rank` so it does not shadow the
#   built-in rank() window function; it is internal to the CTE and not selected.
# - The final ORDER BY makes the displayed row order deterministic; without it
#   Spark does not guarantee any particular order for the result rows.
df_result = spark.sql("""
    WITH ranked_student AS (
        SELECT s.name AS student_name,
               c.name AS course_name,
               s.score,
               DENSE_RANK() OVER (PARTITION BY s.courseId ORDER BY s.score DESC) AS score_rank
        FROM student s
        JOIN course c ON s.courseId = c.id
    )
    SELECT student_name, course_name, score
    FROM ranked_student
    WHERE score_rank <= 2
    ORDER BY course_name, score DESC, student_name
""")
df_result.display()

student_name,course_name,score
Alice,Math,95
Bob,Math,85
Grace,Science,92
Charlie,Science,90


In [0]:
# DataFrame-API version of the same query.
# Step 1: pair each student with their course (inner join on courseId == id)
# and keep only the three reporting columns under descriptive names.
df_joined = (
    df_student
    .join(df_course, on=df_student['courseId'] == df_course['id'], how="inner")
    .select(
        df_student['name'].alias('student_name'),
        df_course['name'].alias('course_name'),
        df_student['score'],
    )
)

# Step 2: one window per course, highest score first.
# NOTE(review): this partitions by course_name, while the SQL cell partitions
# by courseId. The two agree only while course names are unique - confirm.
window_spec = Window.partitionBy('course_name').orderBy(col('score').desc())

# Step 3: dense_rank gives tied scores the same rank and leaves no gaps.
df_ranked = df_joined.withColumn('rank', dense_rank().over(window_spec))

# Step 4: keep ranks 1 and 2 in every course and render the result.
df_top_2 = df_ranked.where(col('rank') <= 2)

df_top_2.display()

student_name,course_name,score,rank
Alice,Math,95,1
Bob,Math,85,2
Grace,Science,92,1
Charlie,Science,90,2
