In [3]:
! pip install pyspark



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg,sum,col,pow

spark=SparkSession.builder.appName("weather_data").getOrCreate()

weather_df=spark.read.csv("/content/weather_data.csv", header=True, inferSchema=True)

In [6]:
# 1. Find the Average Temperature for Each City
avg_temperature=weather_df.groupBy("city").agg(avg("temperature_c").alias("avg_temperature"))
print("Average temperature for each city: ")
avg_temperature.show()

Average temperature for each city: 
+-----------+-------------------+
|       city|    avg_temperature|
+-----------+-------------------+
|Los Angeles| 17.666666666666668|
|    Chicago|-2.6666666666666665|
|   New York|  4.666666666666667|
+-----------+-------------------+



In [7]:
# 2.Filter Days with Temperature Below Freezing
freezing_days=weather_df.filter(col("temperature_c")<0)
print("Days with temperature below freezing: ")
freezing_days.show()

Days with temperature below freezing: 
+----------+-------+-------------+--------+--------------+---------+
|      date|   city|temperature_c|humidity|wind_speed_kph|condition|
+----------+-------+-------------+--------+--------------+---------+
|2023-01-01|Chicago|           -2|      75|            25|     Snow|
|2023-01-02|Chicago|           -5|      80|            30|     Snow|
|2023-01-03|Chicago|           -1|      70|            18|   Cloudy|
+----------+-------+-------------+--------+--------------+---------+



In [8]:
# 3.Find the City with the Highest Wind Speed on a Specific Day
highest_wind_speed=weather_df.orderBy(col("wind_speed_kph").desc()).limit(1)
print("City with the highest wind speed on a specific day: ")
highest_wind_speed.show()

City with the highest wind speed on a specific day: 
+----------+-------+-------------+--------+--------------+---------+
|      date|   city|temperature_c|humidity|wind_speed_kph|condition|
+----------+-------+-------------+--------+--------------+---------+
|2023-01-02|Chicago|           -5|      80|            30|     Snow|
+----------+-------+-------------+--------+--------------+---------+



In [9]:
# 4. Calculate the Total Number of Days with Rainy Weather
rainy_days_count = weather_df.filter(weather_df["condition"] == "Rain").count()
print(f"Total number of rainy days: {rainy_days_count}")

Total number of rainy days: 1


In [11]:
# 5. Calculate the Average Humidity for Each Weather Condition
avg_humidity=weather_df.groupBy("condition").agg(avg("humidity").alias("avg_humidity"))
print("Average humidity for each weather condition: ")
avg_humidity.show()

Average humidity for each weather condition: 
+---------+------------+
|condition|avg_humidity|
+---------+------------+
|   Cloudy|        65.0|
|    Sunny|        42.0|
|     Snow|        77.5|
|     Rain|        65.0|
+---------+------------+



In [10]:
# 6. Find the Hottest Day in Each City
hottest_day=weather_df.orderBy(col("temperature_c").desc()).limit(1)
print("Hottest day in each city: ")
hottest_day.show()

Hottest day in each city: 
+----------+-----------+-------------+--------+--------------+---------+
|      date|       city|temperature_c|humidity|wind_speed_kph|condition|
+----------+-----------+-------------+--------+--------------+---------+
|2023-01-03|Los Angeles|           20|      38|            12|    Sunny|
+----------+-----------+-------------+--------+--------------+---------+



In [15]:
# 7. Identify Cities That Experienced Snow
snow_cities=weather_df.filter(weather_df["condition"]=="Snow").select("city").distinct()
print("Cities that experienced snow: ")
snow_cities.show()

Cities that experienced snow: 
+-------+
|   city|
+-------+
|Chicago|
+-------+



In [14]:
# 8. Calculate the Average Wind Speed for Days When the Condition was Sunny
avg_wind_speed=weather_df.filter(weather_df["condition"]=="Sunny").agg(avg("wind_speed_kph").alias("avg_wind_speed"))
print("Average wind speed for days when the condition was sunny: ")
avg_wind_speed.show()

Average wind speed for days when the condition was sunny: 
+--------------+
|avg_wind_speed|
+--------------+
|          13.0|
+--------------+



In [13]:
# 9. Find the Coldest Day Across All Cities
coldest_day=weather_df.orderBy(col("temperature_c").asc()).limit(1)
print("Coldest day across all cities: ")
coldest_day.show()

Coldest day across all cities: 
+----------+-------+-------------+--------+--------------+---------+
|      date|   city|temperature_c|humidity|wind_speed_kph|condition|
+----------+-------+-------------+--------+--------------+---------+
|2023-01-02|Chicago|           -5|      80|            30|     Snow|
+----------+-------+-------------+--------+--------------+---------+



In [12]:
# 10. Create a New Column for Wind Chill
data_with_wind_chill = weather_df.withColumn("wind_chill",
    13.12 + 0.6215 * col("temperature_c") - 11.37 * pow(col("wind_speed_kph"), 0.16) +
    0.3965 * col("temperature_c") * pow(col("wind_speed_kph"), 0.16))
print("New dataset after adding column")
data_with_wind_chill.show()

New dataset after adding column
+----------+-----------+-------------+--------+--------------+---------+-------------------+
|      date|       city|temperature_c|humidity|wind_speed_kph|condition|         wind_chill|
+----------+-----------+-------------+--------+--------------+---------+-------------------+
|2023-01-01|   New York|            5|      60|            20|   Cloudy| 1.0669572525115663|
|2023-01-01|Los Angeles|           15|      40|            10|    Sunny| 14.604602843130213|
|2023-01-01|    Chicago|           -2|      75|            25|     Snow| -8.479874917414646|
|2023-01-02|   New York|            3|      65|            15|     Rain|-0.7170927775232809|
|2023-01-02|Los Angeles|           18|      35|             8|    Sunny| 18.403050060338735|
|2023-01-02|    Chicago|           -5|      80|            30|     Snow|-12.996724811921073|
|2023-01-03|   New York|            6|      55|            22|    Sunny| 2.1056686374397366|
|2023-01-03|Los Angeles|           20|