In [0]:
%spark.pyspark

from pyspark.sql.functions import *
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (created under the app name
# "Task2Spark" if one does not already exist).
spark = (
    SparkSession.builder
    .appName("Task2Spark")
    .getOrCreate()
)

# Both inputs are CSV files read with a header row and Spark-inferred column types.
locations = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://namenode:8020/user/data/location/processed_location_data.csv")
)
weather = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://namenode:8020/user/data/weather/processed_weather_data.csv")
)

# Convert the "date" column with to_date so the date functions used in later
# cells (month, year, weekofyear) operate on a date value.
weather = weather.withColumn("date", to_date("date"))


In [1]:
%spark.pyspark

# Extend the weather data with a "month" column derived from each row's date.
weather_month = weather.withColumn("month", month("date"))

# Preview the first two rows to confirm the new column is present.
weather_month.show(n=2)


In [2]:
%spark.pyspark
# Percentage of rows per month whose shortwave_radiation_sum exceeds 15.

# Total rows per month
monthly_total = weather_month.groupBy("month").agg(count("*").alias("total_count"))

# Rows above 15 (a month with no such rows does not appear in this frame)
monthly_above = (
    weather_month
    .filter(col("shortwave_radiation_sum") > 15)
    .groupBy("month")
    .agg(count("*").alias("above_15_count"))
)

# The left join keeps every month present in monthly_total. A month that is
# missing from monthly_above comes out of the join with a null count, which
# would make its percentage null; replace that null with 0 so the month is
# reported as 0 percent instead.
radiation_percentage = (
    monthly_total
    .join(monthly_above, "month", "left")
    .withColumn("above_15_count", coalesce(col("above_15_count"), lit(0)))
    .withColumn(
        "percentage_above_15",
        (col("above_15_count") / col("total_count")) * 100,
    )
    .orderBy("month")
)
radiation_percentage.show()  # Show percentage of radiation above 15

In [3]:
%spark.pyspark
# Split each date into year, month and week-of-year columns so later cells can
# group and join on them.
weather2 = (
    weather
    .withColumn("year", year("date"))
    .withColumn("month", month("date"))
    .withColumn("week", weekofyear("date"))
)

# Average of temperature_2m_mean for every (year, month) pair.
monthly_avg = (
    weather2
    .groupBy("year", "month")
    .agg(avg("temperature_2m_mean").alias("avg_temp"))
)
monthly_avg.show(10)

In [4]:
%spark.pyspark
from pyspark.sql.window import Window
# Within each year, order the monthly averages from highest to lowest.
w = Window.partitionBy("year").orderBy(desc("avg_temp"))

# Rank 1 marks the month with the highest average temperature in its year.
# rank() assigns tied months the same rank, so a year may yield several rows.
hottest_months = (
    monthly_avg
    .withColumn("rank", rank().over(w))
    .where(col("rank") == 1)
    .select("year", "month")
)

hottest_months.show()

In [5]:
%spark.pyspark
# Keep only the rows that fall in a (year, month) listed in hottest_months.
filtered_weather = weather2.join(hottest_months, on=["year", "month"])

# Highest temperature_2m_max per week within those months.
# `max` here must be the Spark aggregate brought in by the wildcard import of
# pyspark.sql.functions, not Python's builtin max.
weekly_max_temp = (
    filtered_weather
    .groupBy("year", "month", "week")
    .agg(max("temperature_2m_max").alias("weekly_max_temp"))
    .orderBy("year", "week")
)
weekly_max_temp.show()


In [6]:
%spark.pyspark
