## Data Preprocessing

In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, to_date,mean, when

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Weather_Data")
    .getOrCreate()
)

In [3]:
# Read the raw weather CSV, taking column names from the first line.
# No schema is supplied or inferred, so every column is loaded as a string.
weather_data = spark.read.option("header", True).csv("data/weather.csv")

In [4]:
# Preview the first 20 rows of the raw data (long cell values are truncated)
weather_data.show(n=20, truncate=True)

+-----------+--------------+----------+----+----+----+----+
|    STATION|          NAME|      DATE|PRCP|TAVG|TMAX|TMIN|
+-----------+--------------+----------+----+----+----+----+
|CEM00043441|KURUNEGALA, CE|2019-01-02|0.00|  78|NULL|  72|
|CEM00043441|KURUNEGALA, CE|2019-01-03|0.00|  81|  87|  71|
|CEM00043441|KURUNEGALA, CE|2019-01-04|0.00|  83|  88|  69|
|CEM00043441|KURUNEGALA, CE|2019-01-05|0.00|  84|  90|  65|
|CEM00043441|KURUNEGALA, CE|2019-01-06|0.00|  82|  88|  71|
|CEM00043441|KURUNEGALA, CE|2019-01-07|0.00|  80|  89|  68|
|CEM00043441|KURUNEGALA, CE|2019-01-08|0.00|  82|  90|  71|
|CEM00043441|KURUNEGALA, CE|2019-01-09|0.00|  83|  89|  71|
|CEM00043441|KURUNEGALA, CE|2019-01-10|0.00|  81|  88|  72|
|CEM00043441|KURUNEGALA, CE|2019-01-11|0.00|  85|  90|  73|
|CEM00043441|KURUNEGALA, CE|2019-01-12|0.00|  84|  90|  74|
|CEM00043441|KURUNEGALA, CE|2019-01-13|0.00|  82|  87|  72|
|CEM00043441|KURUNEGALA, CE|2019-01-14|0.00|  80|  86|  74|
|CEM00043441|KURUNEGALA, CE|2019-01-15|0

In [5]:
# Total number of rows loaded from the CSV
weather_data.count()

5230

#### Handle Missing Values

In [6]:
# Report how many null entries each column of the raw data contains
for name in weather_data.columns:
    null_rows = weather_data.where(col(name).isNull())
    print(f"Missing values in column '{name}': {null_rows.count()}")

Missing values in column 'STATION': 0
Missing values in column 'NAME': 0
Missing values in column 'DATE': 0
Missing values in column 'PRCP': 17
Missing values in column 'TAVG': 0
Missing values in column 'TMAX': 583
Missing values in column 'TMIN': 926


In [7]:
# Compute the three column means in a single aggregation so the data is
# scanned once instead of once per column. Nulls are skipped by the average.
# NOTE(review): the means are taken over all rows (no grouping), so gaps are
# filled with a cross-station average — confirm this is intended.
prcp_mean, tmax_mean, tmin_mean = weather_data.agg(
    mean('PRCP'), mean('TMAX'), mean('TMIN')
).collect()[0]

# Impute missing values with the corresponding column mean
weather_data = weather_data.fillna({'PRCP': prcp_mean, 'TMAX': tmax_mean, 'TMIN': tmin_mean})

In [8]:
# Re-check the per-column null counts now that the gaps have been imputed
for name in weather_data.columns:
    remaining_nulls = weather_data.where(col(name).isNull()).count()
    print(f"Missing values in column '{name}': {remaining_nulls}")

Missing values in column 'STATION': 0
Missing values in column 'NAME': 0
Missing values in column 'DATE': 0
Missing values in column 'PRCP': 0
Missing values in column 'TAVG': 0
Missing values in column 'TMAX': 0
Missing values in column 'TMIN': 0


In [9]:
# Inspect the column types before conversion
weather_data.printSchema()

root
 |-- STATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- PRCP: string (nullable = false)
 |-- TAVG: string (nullable = true)
 |-- TMAX: string (nullable = false)
 |-- TMIN: string (nullable = false)



#### Convert Data Types

In [10]:
# Parse the DATE strings (yyyy-MM-dd) into DateType
weather_data = weather_data.withColumn("DATE", to_date(col("DATE"), 'yyyy-MM-dd'))

# Cast each measurement column to FloatType
for measurement in ("PRCP", "TAVG", "TMAX", "TMIN"):
    weather_data = weather_data.withColumn(measurement, col(measurement).cast(FloatType()))

In [11]:
# Confirm the column types after conversion
weather_data.printSchema()

root
 |-- STATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- DATE: date (nullable = true)
 |-- PRCP: float (nullable = true)
 |-- TAVG: float (nullable = true)
 |-- TMAX: float (nullable = true)
 |-- TMIN: float (nullable = true)



#### Drop Unwanted Columns

In [12]:
# Drop the STATION column
weather_data = weather_data.drop("STATION")

In [13]:
# Preview the first 20 rows after dropping the STATION column
weather_data.show(n=20, truncate=True)

+--------------+----------+----+----+--------+----+
|          NAME|      DATE|PRCP|TAVG|    TMAX|TMIN|
+--------------+----------+----+----+--------+----+
|KURUNEGALA, CE|2019-01-02| 0.0|78.0|82.25931|72.0|
|KURUNEGALA, CE|2019-01-03| 0.0|81.0|    87.0|71.0|
|KURUNEGALA, CE|2019-01-04| 0.0|83.0|    88.0|69.0|
|KURUNEGALA, CE|2019-01-05| 0.0|84.0|    90.0|65.0|
|KURUNEGALA, CE|2019-01-06| 0.0|82.0|    88.0|71.0|
|KURUNEGALA, CE|2019-01-07| 0.0|80.0|    89.0|68.0|
|KURUNEGALA, CE|2019-01-08| 0.0|82.0|    90.0|71.0|
|KURUNEGALA, CE|2019-01-09| 0.0|83.0|    89.0|71.0|
|KURUNEGALA, CE|2019-01-10| 0.0|81.0|    88.0|72.0|
|KURUNEGALA, CE|2019-01-11| 0.0|85.0|    90.0|73.0|
|KURUNEGALA, CE|2019-01-12| 0.0|84.0|    90.0|74.0|
|KURUNEGALA, CE|2019-01-13| 0.0|82.0|    87.0|72.0|
|KURUNEGALA, CE|2019-01-14| 0.0|80.0|    86.0|74.0|
|KURUNEGALA, CE|2019-01-15| 0.0|81.0|82.25931|72.0|
|KURUNEGALA, CE|2019-01-16| 0.0|82.0|    89.0|74.0|
|KURUNEGALA, CE|2019-01-17| 0.0|80.0|82.25931|72.0|
|KURUNEGALA,

#### Replace Station Names 

In [14]:
# Replace the raw station names in the NAME column with readable labels
station_labels = [
    ('COLOMBO, CE', 'Colombo Proper'),
    ('KURUNEGALA, CE', 'Kurunegala Proper'),
    ('NUWARA ELIYA, CE', 'Nuwara Eliya Proper'),
]

for raw_name, label in station_labels:
    weather_data = weather_data.na.replace([raw_name], [label], 'NAME')

In [15]:
# Preview the first 20 rows with the relabelled station names
weather_data.show(n=20, truncate=True)

+-----------------+----------+----+----+--------+----+
|             NAME|      DATE|PRCP|TAVG|    TMAX|TMIN|
+-----------------+----------+----+----+--------+----+
|Kurunegala Proper|2019-01-02| 0.0|78.0|82.25931|72.0|
|Kurunegala Proper|2019-01-03| 0.0|81.0|    87.0|71.0|
|Kurunegala Proper|2019-01-04| 0.0|83.0|    88.0|69.0|
|Kurunegala Proper|2019-01-05| 0.0|84.0|    90.0|65.0|
|Kurunegala Proper|2019-01-06| 0.0|82.0|    88.0|71.0|
|Kurunegala Proper|2019-01-07| 0.0|80.0|    89.0|68.0|
|Kurunegala Proper|2019-01-08| 0.0|82.0|    90.0|71.0|
|Kurunegala Proper|2019-01-09| 0.0|83.0|    89.0|71.0|
|Kurunegala Proper|2019-01-10| 0.0|81.0|    88.0|72.0|
|Kurunegala Proper|2019-01-11| 0.0|85.0|    90.0|73.0|
|Kurunegala Proper|2019-01-12| 0.0|84.0|    90.0|74.0|
|Kurunegala Proper|2019-01-13| 0.0|82.0|    87.0|72.0|
|Kurunegala Proper|2019-01-14| 0.0|80.0|    86.0|74.0|
|Kurunegala Proper|2019-01-15| 0.0|81.0|82.25931|72.0|
|Kurunegala Proper|2019-01-16| 0.0|82.0|    89.0|74.0|
|Kurunegal

In [16]:
# Count the rows available for each station.
# The grouping key is spelled "Name", which is also the header shown in the output.
rows_by_station = weather_data.groupBy("Name")
location_counts = rows_by_station.count()
location_counts.show()

+-------------------+-----+
|               Name|count|
+-------------------+-----+
|  Kurunegala Proper| 1704|
|     Colombo Proper| 1765|
|Nuwara Eliya Proper| 1761|
+-------------------+-----+



In [17]:
# Collect the cleaned data to the driver as a pandas DataFrame, then write it to CSV.
# NOTE(review): to_csv is called without index=False, so the pandas row index is
# written as an extra unnamed first column — confirm downstream readers expect it.
weather_pdf = weather_data.toPandas()
weather_pdf.to_csv('data/weather_new.csv')