In [None]:
import sys
from pathlib import Path

sys.path.append(str(Path("..").resolve()))

from src.data_ingestion import *
from src.data_preprocessing import *
from src.feature_engineering import *
from src.visualization import *

In [2]:
spark = init_spark()
df = load_data(spark)
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: integer (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- End_Lat: double (nullable = true)
 |-- End_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- V

In [3]:
get_summary_statistics(df)

Summary statistics for numeric columns:
+-------+--------+-------+------------------+-----------------+------------------+-----------------+------------------+------------------+--------------------+------------------+----------+---------+-------+------------------+-------+----------+------------+------------------+-----------------+------------------+------------------+-----------------+--------------+-----------------+--------------------+------------------+--------------+--------------+-----------------+---------------------+
|summary|      ID| Source|          Severity|        Start_Lat|         Start_Lng|          End_Lat|           End_Lng|      Distance(mi)|         Description|            Street|      City|   County|  State|           Zipcode|Country|  Timezone|Airport_Code|    Temperature(F)|    Wind_Chill(F)|       Humidity(%)|      Pressure(in)|   Visibility(mi)|Wind_Direction|  Wind_Speed(mph)|   Precipitation(in)| Weather_Condition|Sunrise_Sunset|Civil_Twilight|Nautical_Tw

(DataFrame[summary: string, ID: string, Source: string, Severity: string, Start_Lat: string, Start_Lng: string, End_Lat: string, End_Lng: string, Distance(mi): string, Description: string, Street: string, City: string, County: string, State: string, Zipcode: string, Country: string, Timezone: string, Airport_Code: string, Temperature(F): string, Wind_Chill(F): string, Humidity(%): string, Pressure(in): string, Visibility(mi): string, Wind_Direction: string, Wind_Speed(mph): string, Precipitation(in): string, Weather_Condition: string, Sunrise_Sunset: string, Civil_Twilight: string, Nautical_Twilight: string, Astronomical_Twilight: string],
 DataFrame[ID: bigint, Source: bigint, Severity: bigint, Start_Time: bigint, End_Time: bigint, Start_Lat: bigint, Start_Lng: bigint, End_Lat: bigint, End_Lng: bigint, Distance(mi): bigint, Description: bigint, Street: bigint, City: bigint, County: bigint, State: bigint, Zipcode: bigint, Country: bigint, Timezone: bigint, Airport_Code: bigint, Weather

## Cleaning Data

In [4]:
df = preprocess_data(df)

In [5]:
df = preprocess_features(df)

In [6]:
df.printSchema()

root
 |-- Severity: integer (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = false)
 |-- Start_Lng: double (nullable = false)
 |-- Distance: double (nullable = false)
 |-- State: string (nullable = false)
 |-- Timezone: string (nullable = false)
 |-- Temperature: double (nullable = true)
 |-- Humidity: double (nullable = false)
 |-- Pressure: double (nullable = false)
 |-- Visibility: double (nullable = false)
 |-- Wind_Speed: double (nullable = false)
 |-- Weather_Condition: string (nullable = false)
 |-- Amenity: boolean (nullable = true)
 |-- Bump: boolean (nullable = true)
 |-- Crossing: boolean (nullable = true)
 |-- Give_Way: boolean (nullable = true)
 |-- Junction: boolean (nullable = true)
 |-- No_Exit: boolean (nullable = true)
 |-- Railway: boolean (nullable = true)
 |-- Roundabout: boolean (nullable = true)
 |-- Station: boolean (nullable = true)
 |-- Stop: boolean (nullable = true)
 

In [7]:
get_summary_statistics(df)

Summary statistics for numeric columns:
+-------+-------------------+-----------------+-----------------+------------------+-------+----------+------------------+------------------+------------------+-----------------+------------------+-----------------+--------------+-------------------+------------------+------------------+----------------+------------------+-------+-----------------+------------------+-------------------+-------------------+
|summary|           Severity|        Start_Lat|        Start_Lng|          Distance|  State|  Timezone|       Temperature|          Humidity|          Pressure|       Visibility|        Wind_Speed|Weather_Condition|Sunrise_Sunset|    Is_Complex_Road|              Hour|       Day_of_Week|           Month|              Year| Season|        DayOfWeek|          Duration|         Is_Weekend|          Is_Severe|
+-------+-------------------+-----------------+-----------------+------------------+-------+----------+------------------+------------------

(DataFrame[summary: string, Severity: string, Start_Lat: string, Start_Lng: string, Distance: string, State: string, Timezone: string, Temperature: string, Humidity: string, Pressure: string, Visibility: string, Wind_Speed: string, Weather_Condition: string, Sunrise_Sunset: string, Is_Complex_Road: string, Hour: string, Day_of_Week: string, Month: string, Year: string, Season: string, DayOfWeek: string, Duration: string, Is_Weekend: string, Is_Severe: string],
 DataFrame[Severity: bigint, Start_Time: bigint, End_Time: bigint, Start_Lat: bigint, Start_Lng: bigint, Distance: bigint, State: bigint, Timezone: bigint, Temperature: bigint, Humidity: bigint, Pressure: bigint, Visibility: bigint, Wind_Speed: bigint, Weather_Condition: bigint, Amenity: bigint, Bump: bigint, Crossing: bigint, Give_Way: bigint, Junction: bigint, No_Exit: bigint, Railway: bigint, Roundabout: bigint, Station: bigint, Stop: bigint, Traffic_Calming: bigint, Traffic_Signal: bigint, Sunrise_Sunset: bigint, Is_Complex_R