In [11]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell in this notebook
# (application name "problems").
spark = SparkSession.builder.appName("problems").getOrCreate()

# Column names, in the same positional order as the fields of each tuple below.
columns = ["house", "year", "points", "student"]

data = [
    # year 1
    ("Gryffindor", 1, 80, "Harry Potter"),
    ("Slytherin", 1, 60, "Draco Malfoy"),
    ("Ravenclaw", 1, 45, "Luna Lovegood"),
    ("Hufflepuff", 1, 30, "Cedric Diggory"),
    # year 2
    ("Gryffindor", 2, 90, "Hermione Granger"),
    ("Slytherin", 2, 70, "Pansy Parkinson"),
    ("Ravenclaw", 2, 55, "Cho Chang"),
    ("Hufflepuff", 2, 65, "Hannah Abbott"),
    # year 3
    ("Gryffindor", 3, 20, "Ron Weasley"),
    ("Slytherin", 3, 85, "Blaise Zabini"),
]

# Build the house-points DataFrame from the in-memory rows and display it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+----------+----+------+----------------+
|     house|year|points|         student|
+----------+----+------+----------------+
|Gryffindor|   1|    80|    Harry Potter|
| Slytherin|   1|    60|    Draco Malfoy|
| Ravenclaw|   1|    45|   Luna Lovegood|
|Hufflepuff|   1|    30|  Cedric Diggory|
|Gryffindor|   2|    90|Hermione Granger|
| Slytherin|   2|    70| Pansy Parkinson|
| Ravenclaw|   2|    55|       Cho Chang|
|Hufflepuff|   2|    65|   Hannah Abbott|
|Gryffindor|   3|    20|     Ron Weasley|
| Slytherin|   3|    85|   Blaise Zabini|
+----------+----+------+----------------+



In [23]:
from pyspark.sql import functions as F

# Keep only the columns the aggregation needs (the student name is dropped).
selected_df = df.select(F.col("house"), F.col("year"), F.col("points"))

# Retain rows with strictly more than 50 points; this is applied per row,
# before any totals are computed.
filtered_df = selected_df.where(F.col("points") > 50)

# Sum the surviving points for each (house, year) pair.
grouped_df = (
    filtered_df
    .groupBy("house", "year")
    .agg(F.sum("points").alias("total_points"))
)

# Year ascending; within a year, the highest total first.
result_df = grouped_df.sort(F.col("year").asc(), F.col("total_points").desc())

result_df.show()


+----------+----+------------+
|     house|year|total_points|
+----------+----+------------+
|Gryffindor|   1|          80|
| Slytherin|   1|          60|
|Gryffindor|   2|          90|
| Slytherin|   2|          70|
|Hufflepuff|   2|          65|
| Ravenclaw|   2|          55|
| Slytherin|   3|          85|
+----------+----+------------+



In [15]:
# Column names, in the same positional order as the fields of each tuple below.
columns = ["name", "race", "enemies_defeated", "injuries", "battle"]

# One row per (character, battle); a name can appear in more than one battle.
data = [
    # Helms Deep
    ("Aragorn", "Human", 10, 2, "Helms Deep"),
    ("Legolas", "Elf", 15, 0, "Helms Deep"),
    ("Gimli", "Dwarf", 8, 3, "Helms Deep"),
    # Moria
    ("Frodo", "Hobbit", 2, 1, "Moria"),
    ("Sam", "Hobbit", 4, 2, "Moria"),
    ("Gandalf", "Wizard", 12, 1, "Moria"),
    # Amon Hen
    ("Boromir", "Human", 7, 4, "Amon Hen"),
    ("Legolas", "Elf", 20, 0, "Amon Hen"),
    ("Aragorn", "Human", 9, 2, "Amon Hen"),
]

# Build the battle-statistics DataFrame from the in-memory rows and display it.
df1 = spark.createDataFrame(data, schema=columns)
df1.show()

+-------+------+----------------+--------+----------+
|   name|  race|enemies_defeated|injuries|    battle|
+-------+------+----------------+--------+----------+
|Aragorn| Human|              10|       2|Helms Deep|
|Legolas|   Elf|              15|       0|Helms Deep|
|  Gimli| Dwarf|               8|       3|Helms Deep|
|  Frodo|Hobbit|               2|       1|     Moria|
|    Sam|Hobbit|               4|       2|     Moria|
|Gandalf|Wizard|              12|       1|     Moria|
|Boromir| Human|               7|       4|  Amon Hen|
|Legolas|   Elf|              20|       0|  Amon Hen|
|Aragorn| Human|               9|       2|  Amon Hen|
+-------+------+----------------+--------+----------+



In [29]:
# `F` is pyspark.sql.functions, imported in an earlier cell of this notebook.

# Keep only the columns relevant to the per-race average.
selected_df = df1.select(F.col("name"), F.col("race"), F.col("enemies_defeated"))

# Retain rows with strictly more than 5 enemies defeated. The filter runs per
# row before averaging, so the averages below cover qualifying rows only.
filtered_df = selected_df.where(F.col("enemies_defeated") > 5)

# Average enemies defeated per race over the qualifying rows.
avg_df = (
    filtered_df
    .groupBy("race")
    .agg(F.avg("enemies_defeated").alias("avg_enemies_defeated"))
)

# Highest average first.
result_df = avg_df.sort(F.col("avg_enemies_defeated").desc())

result_df.show()


+------+--------------------+
|  race|avg_enemies_defeated|
+------+--------------------+
|   Elf|                17.5|
|Wizard|                12.0|
| Human|   8.666666666666666|
| Dwarf|                 8.0|
+------+--------------------+



In [17]:
# Column names, in the same positional order as the fields of each tuple below.
columns = ["submarine_id", "mission_name", "warheads_launched", "status"]

# One row per (submarine, mission); status is either "Success" or "Failure".
data = [
    ("SUB-01", "Pacific Strike", 5, "Success"),
    ("SUB-02", "Atlantic Surge", 2, "Failure"),
    ("SUB-01", "Arctic Blitz", 4, "Success"),
    ("SUB-03", "Indian Ocean", 6, "Success"),
    ("SUB-02", "Pacific Strike", 3, "Success"),
    ("SUB-01", "Coral Sea", 7, "Success"),
    ("SUB-03", "Arctic Blitz", 1, "Failure"),
    ("SUB-02", "Bering Strait", 5, "Success"),
]

# Build the mission-log DataFrame from the in-memory rows and display it.
df2 = spark.createDataFrame(data, schema=columns)
df2.show()

+------------+--------------+-----------------+-------+
|submarine_id|  mission_name|warheads_launched| status|
+------------+--------------+-----------------+-------+
|      SUB-01|Pacific Strike|                5|Success|
|      SUB-02|Atlantic Surge|                2|Failure|
|      SUB-01|  Arctic Blitz|                4|Success|
|      SUB-03|  Indian Ocean|                6|Success|
|      SUB-02|Pacific Strike|                3|Success|
|      SUB-01|     Coral Sea|                7|Success|
|      SUB-03|  Arctic Blitz|                1|Failure|
|      SUB-02| Bering Strait|                5|Success|
+------------+--------------+-----------------+-------+



In [28]:
# `F` is pyspark.sql.functions, imported in an earlier cell of this notebook.

# Project the columns this cell works with. `status` is kept in the projection
# because the filter below reads it; a predicate should only reference columns
# that are actually present in the frame it is applied to.
selected_df = df2.select(
    "submarine_id", "mission_name", "warheads_launched", "status"
)

# Keep successful missions that launched strictly more than 3 warheads.
filtered_df = selected_df.filter(
    (F.col("warheads_launched") > 3) & (F.col("status") == "Success")
)

# Per submarine: number of qualifying missions (non-null mission names) and
# the total warheads launched across them.
grouped_df = filtered_df.groupBy("submarine_id").agg(
    F.count("mission_name").alias("total_missions"),
    F.sum("warheads_launched").alias("total_warheads"),
)

# Highest warhead total first.
result_df = grouped_df.orderBy(F.desc("total_warheads"))
result_df.show()

+------------+--------------+--------------+
|submarine_id|total_missions|total_warheads|
+------------+--------------+--------------+
|      SUB-01|             3|            16|
|      SUB-03|             1|             6|
|      SUB-02|             1|             5|
+------------+--------------+--------------+

