In [1]:
import sys
from pathlib import Path

sys.path.append(str(Path("..").resolve()))

from src.data_ingestion import *
from src.data_preprocessing import *
from src.descriptive_analytics import *

from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import NumericType, StringType
from pyspark.sql import functions as F

import seaborn as sns

import numpy as np

from itertools import combinations

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd



In [6]:
spark = init_spark()
df = load_data(spark)
df.printSchema()
# df.show(5)  

root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: integer (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- End_Lat: double (nullable = true)
 |-- End_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- V

## Cleaning Data

In [3]:
df = preprocess_data(df)

Features with one unique value: ['Country', 'Turning_Loop']


In [5]:
# Sample first in PySpark (small fraction of data)
sample_df = df.sample(fraction=0.05, seed=123)

# Convert the sample to Pandas
sample_pandas_df = sample_df.toPandas()

# Save the sample
sample_pandas_df.to_csv("us_accidents_sample.csv", index=False)

# Now if you still want to save the full cleaned data:
full_pandas_df = df.toPandas()
full_pandas_df.to_csv("us_accidents_cleaned.csv", index=False)


Py4JJavaError: An error occurred while calling o421.collectToPython.
: java.lang.OutOfMemoryError: GC overhead limit exceeded


## Feature Engineering 


In [None]:
from pyspark.sql.functions import col

def convert_temp_to_celsius(df, temp_col="Temperature(F)", new_col="Temperature(C)"):
    return df.withColumn(new_col, (col(temp_col) - 32) * 5 / 9)

df = convert_temp_to_celsius(df)

## Decriptive Analysis

In [None]:
from folium.plugins import HeatMap
import folium

# Sample the data for mapping (adjust fraction as needed)
sample_map_df = df.select("Start_Lat", "Start_Lng").dropna().sample(fraction=0.01).toPandas()

# Create heatmap
heat_data = sample_map_df[["Start_Lat", "Start_Lng"]].values.tolist()
center = [sample_map_df["Start_Lat"].mean(), sample_map_df["Start_Lng"].mean()]
map_all = folium.Map(location=center, zoom_start=5)
HeatMap(heat_data).add_to(map_all)

# Save and display
map_all.save("accidents_heatmap_all_states.html")

from IPython.display import IFrame
IFrame("accidents_heatmap_all_states.html", width="100%", height="500px")


### Severity Analysis

In [None]:
# Count accidents by severity
severity_counts = df.groupBy("Severity").count().orderBy("Severity")

# Convert to pandas for plotting
severity_pd = severity_counts.toPandas().sort_values("Severity")

# Plot as pie chart
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
plt.pie(severity_pd['count'], labels=severity_pd['Severity'], autopct='%1.1f%%', startangle=90)
plt.title("Accidents by Severity (All States)")
plt.axis('equal')
plt.show()
