In [17]:
""" Hogwarts House Points Analysis 
Problem Description:
At Hogwarts School of Witchcraft and Wizardry, the four houses (Gryffindor, Hufflepuff, Ravenclaw, Slytherin) 
earn points for their achievements throughout the year. You’ve been tasked by Professor Dumbledore to analyze the
house points data to determine which house is leading and summarize points by house and year. Select the relevant 
columns (house, year, points), filter for houses with more than 50 points in a given year, group by house and year
to sum the points, and order the results by year and, within each year, by total points in descending order to see
who’s dominating the House Cup race. The Great Hall is buzzing with anticipation for these results.
"""
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not spawn a second Spark application.
spark = (
    SparkSession.builder
    .appName("HogwartsAnalysis")
    .getOrCreate()
)

# Column names, in the same order as the fields of each tuple in `data`.
columns = ["house", "year", "points", "student"]

# One row per points award: (house, year, points, student).
data = [
    # Year 1
    ("Gryffindor", 1, 80, "Harry Potter"),
    ("Slytherin", 1, 60, "Draco Malfoy"),
    ("Ravenclaw", 1, 45, "Luna Lovegood"),
    ("Hufflepuff", 1, 30, "Cedric Diggory"),
    # Year 2
    ("Gryffindor", 2, 90, "Hermione Granger"),
    ("Slytherin", 2, 70, "Pansy Parkinson"),
    ("Ravenclaw", 2, 55, "Cho Chang"),
    ("Hufflepuff", 2, 65, "Hannah Abbott"),
    # Year 3
    ("Gryffindor", 3, 20, "Ron Weasley"),
    ("Slytherin", 3, 85, "Blaise Zabini"),
]

# Only column names are supplied; Spark infers the column types from the tuples.
df = spark.createDataFrame(data, schema=columns)
df.show()


+----------+----+------+----------------+
|     house|year|points|         student|
+----------+----+------+----------------+
|Gryffindor|   1|    80|    Harry Potter|
| Slytherin|   1|    60|    Draco Malfoy|
| Ravenclaw|   1|    45|   Luna Lovegood|
|Hufflepuff|   1|    30|  Cedric Diggory|
|Gryffindor|   2|    90|Hermione Granger|
| Slytherin|   2|    70| Pansy Parkinson|
| Ravenclaw|   2|    55|       Cho Chang|
|Hufflepuff|   2|    65|   Hannah Abbott|
|Gryffindor|   3|    20|     Ron Weasley|
| Slytherin|   3|    85|   Blaise Zabini|
+----------+----+------+----------------+



In [19]:

# Import the functions module under an alias instead of
# `from pyspark.sql.functions import sum`, which would replace Python's
# built-in sum() for every cell that runs afterwards in this kernel.
from pyspark.sql import functions as F

# House Cup standings:
#   1. keep only the columns the analysis needs,
#   2. keep awards worth more than 50 points,
#   3. total the points per (house, year),
#   4. sort by year ascending, then by total points descending.
res = (
    df.select("house", "year", "points")
    # Reference the column by name so the predicate applies to the projected
    # frame rather than to whatever the global `df` happens to be bound to.
    .filter(F.col("points") > 50)
    .groupBy("house", "year")
    .agg(F.sum("points").alias("total_points"))
    .orderBy(F.col("year").asc(), F.col("total_points").desc())
)
res.show()

+----------+----+------------+
|     house|year|total_points|
+----------+----+------------+
|Gryffindor|   1|          80|
| Slytherin|   1|          60|
|Gryffindor|   2|          90|
| Slytherin|   2|          70|
|Hufflepuff|   2|          65|
| Ravenclaw|   2|          55|
| Slytherin|   3|          85|
+----------+----+------------+



In [9]:
"""Problem 2: Fellowship of the Ring Battle Stats 
Problem Description:
The Fellowship of the Ring is battling across Middle-earth, and Gandalf needs your 
help to analyze their combat performance. The data tracks each member’s battles, enemies defeated,
and injuries sustained. To prepare for the next council in Rivendell, select the member’s name, race, and enemies defeated, 
filter for battles where more than 5 enemies were defeated, group by race to calculate the average enemies defeated, 
and order by the average in descending order.
This will help the council decide which races are most effective in combat and plan their strategy against Sauron’s forces.
"""
# NOTE: this cell rebinds `data`, `columns` and `df`, which the Hogwarts
# problem also uses; re-run the Hogwarts setup cell before re-running its query.

# Column names, in the same order as the fields of each tuple in `data`.
columns = ["name", "race", "enemies_defeated", "injuries", "battle"]

# One row per member per battle: (name, race, enemies_defeated, injuries, battle).
data = [
    # Helms Deep
    ("Aragorn", "Human", 10, 2, "Helms Deep"),
    ("Legolas", "Elf", 15, 0, "Helms Deep"),
    ("Gimli", "Dwarf", 8, 3, "Helms Deep"),
    # Moria
    ("Frodo", "Hobbit", 2, 1, "Moria"),
    ("Sam", "Hobbit", 4, 2, "Moria"),
    ("Gandalf", "Wizard", 12, 1, "Moria"),
    # Amon Hen
    ("Boromir", "Human", 7, 4, "Amon Hen"),
    ("Legolas", "Elf", 20, 0, "Amon Hen"),
    ("Aragorn", "Human", 9, 2, "Amon Hen"),
]

# Only column names are supplied; Spark infers the column types from the tuples.
df = spark.createDataFrame(data, schema=columns)
df.show()


+-------+------+----------------+--------+----------+
|   name|  race|enemies_defeated|injuries|    battle|
+-------+------+----------------+--------+----------+
|Aragorn| Human|              10|       2|Helms Deep|
|Legolas|   Elf|              15|       0|Helms Deep|
|  Gimli| Dwarf|               8|       3|Helms Deep|
|  Frodo|Hobbit|               2|       1|     Moria|
|    Sam|Hobbit|               4|       2|     Moria|
|Gandalf|Wizard|              12|       1|     Moria|
|Boromir| Human|               7|       4|  Amon Hen|
|Legolas|   Elf|              20|       0|  Amon Hen|
|Aragorn| Human|               9|       2|  Amon Hen|
+-------+------+----------------+--------+----------+



In [13]:
from pyspark.sql.functions import avg, col

# Combat effectiveness by race:
#   1. keep the member's name, race and enemies defeated (the problem statement
#      does not ask for `injuries`, and nothing below uses it),
#   2. keep battle records with more than 5 enemies defeated,
#   3. average the enemies defeated per race,
#   4. list the most effective race first.
res = (
    df.select("name", "race", "enemies_defeated")
    # Reference the column by name so the predicate applies to the projected
    # frame rather than to whatever the global `df` happens to be bound to.
    .filter(col("enemies_defeated") > 5)
    .groupBy("race")
    .agg(avg("enemies_defeated").alias("Average_enemies_defeated"))
    .orderBy(col("Average_enemies_defeated").desc())
)
res.show()

+------+------------------------+
|  race|Average_enemies_defeated|
+------+------------------------+
|   Elf|                    17.5|
|Wizard|                    12.0|
| Human|       8.666666666666666|
| Dwarf|                     8.0|
+------+------------------------+

