In [None]:
import sys
from pathlib import Path

# Put the parent directory (resolved to an absolute path) on the import path so
# that the `src.*` modules imported below can be found from this notebook.
sys.path.append(str(Path("..").resolve()))

from src.data_ingestion import *
from src.data_preprocessing import *
from src.descriptive_analytics import *

from pyspark.sql import DataFrame
from pyspark.sql.types import NumericType, StringType
from pyspark.sql import functions as F

import seaborn as sns

import numpy as np

from itertools import combinations

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd



In [2]:
# init_spark / load_data are project helpers pulled in by the star-imports from
# src.*; their implementation is not visible in this notebook.
spark = init_spark()
df = load_data(spark)
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()
# df.show(5)  # uncomment to preview the first rows

root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: integer (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- End_Lat: double (nullable = true)
 |-- End_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- V

In [3]:
# missing_values = check_missing_values(df)

## Cleaning Data

In [4]:
# Run the project's preprocessing helper and rebind `df` to its result.
# NOTE(review): preprocess_data arrives via a star-import; judging by the printed
# message and the schema shown afterwards it reports single-valued features and
# returns a frame with fewer columns -- confirm the exact steps in src.
df = preprocess_data(df)

Features with one unique value: ['Country', 'Turning_Loop']


## Feature Engineering 


In [6]:
# Inspect the schema of `df` as it stands after preprocessing.
df.printSchema()

root
 |-- Severity: integer (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = false)
 |-- Start_Lng: double (nullable = false)
 |-- Distance(mi): double (nullable = false)
 |-- State: string (nullable = false)
 |-- Timezone: string (nullable = false)
 |-- Temperature(F): double (nullable = false)
 |-- Humidity(%): double (nullable = false)
 |-- Pressure(in): double (nullable = false)
 |-- Visibility(mi): double (nullable = false)
 |-- Wind_Speed(mph): double (nullable = false)
 |-- Weather_Condition: string (nullable = false)
 |-- Amenity: boolean (nullable = true)
 |-- Bump: boolean (nullable = true)
 |-- Crossing: boolean (nullable = true)
 |-- Give_Way: boolean (nullable = true)
 |-- Junction: boolean (nullable = true)
 |-- No_Exit: boolean (nullable = true)
 |-- Railway: boolean (nullable = true)
 |-- Roundabout: boolean (nullable = true)
 |-- Station: boolean (nullable = true)
 |-- Stop: boo

In [None]:
def convert_temp_to_celsius(df, temp_col="Temperature(F)", new_col="Temperature(C)"):
    """Return ``df`` with an extra column holding the temperature in Celsius.

    Args:
        df: Spark DataFrame containing the Fahrenheit column.
        temp_col: Name of the existing column with values in degrees Fahrenheit.
        new_col: Name of the column to create (or overwrite) with the converted values.

    Returns:
        A new DataFrame; ``new_col`` is computed as ``(temp_col - 32) * 5 / 9``.
    """
    fahrenheit = F.col(temp_col)
    # Same evaluation order as the textbook formula: shift by 32, then scale by 5/9.
    celsius = (fahrenheit - 32) * 5 / 9
    return df.withColumn(new_col, celsius)

# Add "Temperature(C)" derived from "Temperature(F)" (the function's default column names).
df = convert_temp_to_celsius(df)

In [None]:
# Flag rows located at complex road infrastructure: 1 when at least one of
# Junction / Railway / Crossing is true, otherwise 0.
#
# The three indicators are combined with boolean OR rather than by summing
# integer casts: a sum turns NULL as soon as any operand is NULL, which would
# force the flag to 0 even if another indicator is true. With Spark's
# three-valued logic `true OR NULL` is still true, and a NULL/false result
# falls through to `otherwise(0)`.
df = df.withColumn(
    "Is_Complex_Road",
    F.when(F.col("Junction") | F.col("Railway") | F.col("Crossing"), 1).otherwise(0),
)

In [None]:
# Derive calendar features from the start timestamp, the accident duration,
# and a weekend flag, in a single chained transformation.
df = (
    df
    .withColumn("Hour", F.hour("Start_Time"))
    # dayofweek numbering: 1 = Sunday ... 7 = Saturday
    .withColumn("DayOfWeek", F.dayofweek("Start_Time"))
    .withColumn("Month", F.month("Start_Time"))
    # Elapsed time between start and end, converted from seconds to minutes
    .withColumn(
        "Duration",
        (F.unix_timestamp("End_Time") - F.unix_timestamp("Start_Time")) / 60,
    )
    # Weekend = Sunday (1) or Saturday (7); relies on DayOfWeek added above
    .withColumn(
        "Is_Weekend",
        F.when(F.col("DayOfWeek").isin(1, 7), 1).otherwise(0),
    )
)