In [1]:
import pyspark.sql as pyspark_sql
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType

In [2]:
import os
import sys

# Set both PySpark interpreter variables to the Python executable running this notebook
os.environ.update({
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

In [3]:
# Get the existing Spark session, or create one, under the app name "Model_Training"
spark = (
    pyspark_sql.SparkSession.builder
    .appName("Model_Training")
    .getOrCreate()
)

In [4]:
# Location name -> CSV file name (resolved under ../data/processed/ by get_dataframe).
# NOTE: this variable shadows Python's built-in map() for the rest of the notebook.
map = dict([
    ("Kandy Proper", "kandy_proper.csv"),
    ("Colombo Proper", "colombo_proper.csv"),
    ("Deniyaya, Matara", "deniyaya_matara.csv"),
    ("Nuwara Eliya Proper", "nuwaraeliya_proper.csv"),
    ("Kurunegala Proper", "kurunegala_proper.csv"),
    ("Bibile, Monaragala", "bibile_monaragala.csv"),
    ("Jaffna Proper", "jaffna_proper.csv"),
])

In [5]:
def get_dataframe(location):
    """Load one location's CSV as both a Spark and a pandas DataFrame.

    Parameters
    ----------
    location : str
        A key of the module-level ``map`` dict; its value is the CSV file
        name read from ``../data/processed/``.

    Returns
    -------
    tuple
        ``(df, df_pandas)``: ``df`` is the Spark DataFrame restricted to the
        ``current_date`` and ``hcho_reading`` columns; ``df_pandas`` is its
        pandas conversion with ``current_date`` parsed to datetime and used
        as the index.

    Raises
    ------
    KeyError
        If ``location`` is not a key of ``map``.
    """
    csv_path = r"../data/processed/" + map[location]

    raw = spark.read.csv(csv_path, header=True, inferSchema=True)
    df = raw.select("current_date", "hcho_reading")

    df_pandas = df.toPandas()

    # Parse the dates, then move the column into the index (it is dropped as a column)
    df_pandas["current_date"] = pd.to_datetime(df_pandas["current_date"])
    df_pandas = df_pandas.set_index("current_date", drop=True)

    return df, df_pandas

## Checking for Stationarity (ADF Test)

In [6]:
# Check one location's series for stationarity using the Augmented Dickey-Fuller test
def check_stationarity(df, significance_level=0.05):
    """Run the Augmented Dickey-Fuller test on the ``hcho_reading`` column of ``df``.

    Parameters
    ----------
    df : Spark DataFrame
        Must expose an ``hcho_reading`` column. Its values are collected to
        the driver in the order Spark returns them.
    significance_level : float, default 0.05
        The series is reported as stationary when the ADF p-value is less
        than or equal to this threshold.

    Returns
    -------
    tuple of (float, float, bool)
        ADF test statistic, p-value and stationarity flag, all as built-in
        Python values.

    Raises
    ------
    ValueError
        If ``df`` contains no rows.
    """
    readings = [row[0] for row in df.select("hcho_reading").collect()]
    if not readings:
        raise ValueError("hcho_reading is empty; cannot run the ADF test")

    adf_stat, p_value = adfuller(readings)[:2]

    # Normalise to built-in float/bool: the comparison result in particular may
    # not be a real bool, which forces callers to stringify and cast it before
    # they can store it in a BooleanType column.
    return float(adf_stat), float(p_value), bool(p_value <= significance_level)


In [7]:
# Typed layout of the stationarity summary: every column is nullable
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Location", StringType()),
        ("ADF Statistic", DoubleType()),
        ("p-value", DoubleType()),
        ("Stationary", BooleanType()),
    )
])

# Empty DataFrame carrying the stationarity-summary schema
stationarity_results = spark.createDataFrame([], schema)

In [8]:
# Typed schema for the summary table: one row of ADF results per location
schema = StructType([
    StructField("Location", StringType(), True),
    StructField("ADF Statistic", DoubleType(), True),
    StructField("p-value", DoubleType(), True),
    StructField("Stationary", BooleanType(), True)
])

# Gather every row first and build the DataFrame once. Rebuilding the table from
# scratch keeps this cell idempotent (re-running it does not append duplicate
# rows) and avoids one single-row DataFrame plus one union per location.
adf_rows = []
for location in map.keys():
    df = get_dataframe(location)[0]
    adf_stat, p_value, stationary = check_stationarity(df)

    # Convert to built-in float/bool so the values fit DoubleType/BooleanType
    # directly, with no string round trip and cast
    adf_rows.append((location, float(adf_stat), float(p_value), bool(stationary)))

stationarity_results = spark.createDataFrame(adf_rows, schema)

# Show the results
stationarity_results.show(truncate=False)

+-------------------+-------------------+----------------------+----------+
|Location           |ADF Statistic      |p-value               |Stationary|
+-------------------+-------------------+----------------------+----------+
|Kandy Proper       |-15.538050510258087|2.177265418618071E-28 |true      |
|Colombo Proper     |-4.151523717889624 |7.950844999938813E-4  |true      |
|Deniyaya, Matara   |-17.603554820494328|3.9106918872215016E-30|true      |
|Nuwara Eliya Proper|-18.03552768185273 |2.6678826366100292E-30|true      |
|Kurunegala Proper  |-8.01730519039967  |2.1218755389068202E-12|true      |
|Bibile, Monaragala |-6.137559727998532 |8.117549701904656E-8  |true      |
|Jaffna Proper      |-12.284818459878359|8.070475658305322E-23 |true      |
+-------------------+-------------------+----------------------+----------+



As the table shows, the series for all 7 locations are stationary: every ADF p-value is below the 0.05 significance level.

## Model Training