In [1]:
# Necessary imports
import findspark
findspark.init() # Find Spark installation

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType, StringType, BooleanType, TimestampType, StructType, StructField

# For ML tasks (even if demonstrating MapReduce concepts, preprocessing often uses MLlib)
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, Bucketizer
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix # For potential matrix operations

import math
import heapq
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd 

import sys
from pathlib import Path

# Make the project's `src` package importable by adding the directory two
# levels above the current working directory to the module search path.
# The membership check keeps re-runs of this cell from appending the same
# entry to sys.path over and over.
# NOTE(review): "../.." is resolved against the kernel's working directory —
# confirm the notebook is always launched from its own folder.
PROJECT_ROOT = str(Path("../..").resolve())
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data_ingestion import *
from src.data_preprocessing import *
from src.descriptive_analytics import *

from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql import functions as F

import seaborn as sns

import numpy as np

from itertools import combinations

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd
from pyspark.sql.window import Window

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors




In [2]:
# Create the Spark session and read the accidents CSV.
# `init_spark` and `load_data` are not defined in this notebook; they arrive
# through the star imports from the project's `src` modules (presumably
# src.data_ingestion — confirm).
ACCIDENTS_CSV = "../../data/US_Accidents_March23.csv"

spark = init_spark()
df = load_data(spark, ACCIDENTS_CSV)


In [3]:
from pyspark.sql.functions import hour, dayofweek, month, year, when

# --- Calendar features -------------------------------------------------------
# Each new column is produced by applying one Spark date function to Start_Time.
# The dict preserves insertion order, so columns are added in the listed order.
time_extractors = {
    "hour_of_day": hour,
    "day_of_week": dayofweek,
    "month": month,
    "year": year,
}
for feature_name, extract in time_extractors.items():
    df = df.withColumn(feature_name, extract(df["Start_Time"]))

# --- Weather condition code --------------------------------------------------
# Clear -> 0, Rain -> 1, Snow -> 2; every row that matches none of these
# three exact strings falls through to 3.
weather = df["Weather_Condition"]
weather_code = (
    when(weather == "Clear", 0)
    .when(weather == "Rain", 1)
    .when(weather == "Snow", 2)
    .otherwise(3)
)
df = df.withColumn("weather_condition_cat", weather_code)

# --- Night flag --------------------------------------------------------------
# 1 when the accident started at 18:00 or later, or before 06:00; else 0.
accident_hour = df["hour_of_day"]
in_night_hours = (accident_hour >= 18) | (accident_hour < 6)
df = df.withColumn("is_night", when(in_night_hours, 1).otherwise(0))

# --- Severe-accident flag ----------------------------------------------------
# 1 when Severity is 3 or higher; else 0.
is_severe = df["Severity"] >= 3
df = df.withColumn("severe_accident", when(is_severe, 1).otherwise(0))

# Preview the first rows, including the newly derived columns.
df.show(5)


+---+-------+--------+-------------------+-------------------+-----------------+------------------+-------+-------+------------+--------------------+--------------------+------------+----------+-----+----------+-------+----------+------------+-------------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+-----------+-----------+-----+----+---------------------+--------+---------------+
| ID| Source|Severity|         Start_Time|           End_Time|        Start_Lat|         Start_Lng|End_Lat|End_Lng|Distance(mi)|         Description|              Street|        City|    County|State|   Zipcode|Country|  Timezone|Airport_Code|  Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visib

In [4]:
from pyspark.ml.feature import Imputer

# Fill gaps in three numeric weather columns. Each source column keeps its
# original values; the filled copy is written to the mapped "*_imputed" name.
# No strategy is passed, so the Imputer's default applies (mean, per the
# PySpark documentation).
# NOTE(review): the imputer is fit on the full frame, before the train/test
# split performed in a later cell — confirm that is intended.
imputed_name_for = {
    "Temperature(F)": "Temperature_imputed",
    "Wind_Speed(mph)": "Wind_Speed_imputed",
    "Humidity(%)": "Humidity_imputed",
}
imputer = Imputer(
    inputCols=list(imputed_name_for.keys()),
    outputCols=list(imputed_name_for.values()),
)
imputer_model = imputer.fit(df)
df = imputer_model.transform(df)

# Rows without a Severity value cannot serve as labelled examples; remove them.
df = df.na.drop(subset=["Severity"])


In [5]:
# Randomly partition the rows into a training set (weight 0.8) and a test set
# (weight 0.2). The fixed seed keeps the partition repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = df.randomSplit(split_weights, 42)


In [6]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Make the cell safe to re-run: VectorAssembler and KMeansModel.transform both
# fail when their output column already exists, so remove any columns left
# over from a previous execution. Dropping a column that is absent is a no-op.
df = df.drop("geo_features", "cluster")

# Assemble the geographical features (latitude and longitude) into a single
# vector column, in that order: index 0 = latitude, index 1 = longitude.
geo_assembler = VectorAssembler(inputCols=["Start_Lat", "Start_Lng"], outputCol="geo_features")
df = geo_assembler.transform(df)

# Train a KMeans model with 5 clusters. Spark's default initialisation is
# "k-means||" (a parallel variant of k-means++). The seed is set explicitly,
# matching the seed used for the train/test split, so cluster ids and centres
# are reproducible across kernel restarts.
# NOTE(review): distances are plain Euclidean on raw degrees — acceptable for
# a coarse regional grouping, but not a true geographic distance.
kmeans = KMeans(k=5, seed=42, featuresCol="geo_features", predictionCol="cluster")
kmeans_model = kmeans.fit(df)

# Assign every row to its nearest cluster centre (adds the "cluster" column).
df = kmeans_model.transform(df)

# Show the cluster centres as [latitude, longitude] pairs.
print("Cluster Centers: ")
for center in kmeans_model.clusterCenters():
    print(center)

# Show a few rows with their assigned clusters.
df.select("Start_Lat", "Start_Lng", "cluster").show(5)


Cluster Centers: 
[ 31.27814441 -82.32015619]
[  40.54451899 -121.54860482]
[ 40.02010468 -77.31986112]
[ 36.17015069 -94.43016175]
[  34.51816798 -116.51956581]
+-----------------+------------------+-------+
|        Start_Lat|         Start_Lng|cluster|
+-----------------+------------------+-------+
|        39.865147|        -84.058723|      2|
|39.92805900000001|        -82.831184|      2|
|        39.063148|        -84.032608|      2|
|        39.747753|-84.20558199999998|      2|
|        39.627781|        -84.188354|      2|
+-----------------+------------------+-------+
only showing top 5 rows



In [None]:

# Visualise the KMeans cluster assignments and cluster centres.
# `plt` and `np` are already imported in the notebook's first cell.

# Fraction of rows pulled to the driver for plotting. Collecting every row
# (and doing so three separate times, once per column) is slow and can exhaust
# driver memory; a seeded sample gives the same picture at a fraction of the
# cost. Set the fraction to 1.0 to plot every row.
PLOT_SAMPLE_FRACTION = 0.01
PLOT_SAMPLE_SEED = 42

# Cluster centres as an (n_clusters, 2) array: column 0 = latitude,
# column 1 = longitude (the order the features were assembled in).
cluster_centers = np.array(kmeans_model.clusterCenters())

# Select the three needed columns together and collect them in ONE job, so
# coordinates and cluster labels are guaranteed to come from the same rows.
points_pd = (
    df.select("Start_Lat", "Start_Lng", "cluster")
      .sample(withReplacement=False, fraction=PLOT_SAMPLE_FRACTION, seed=PLOT_SAMPLE_SEED)
      .toPandas()
)

# Longitude goes on the x-axis and latitude on the y-axis so the scatter is
# oriented like a conventional map.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    points_pd["Start_Lng"],
    points_pd["Start_Lat"],
    c=points_pd["cluster"],
    cmap="viridis",
    s=2,
)
ax.scatter(
    cluster_centers[:, 1],
    cluster_centers[:, 0],
    marker="x",
    color="red",
    s=100,
    label="Cluster Centers",
)
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("Accident Hotspots and Cluster Centers")
ax.legend()
plt.show()
