In [1]:
import sys
from pathlib import Path

sys.path.append(str(Path("../..").resolve()))

from src.data_ingestion import *
from src.data_preprocessing import *

from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import NumericType, StringType
from pyspark.sql import functions as F

import seaborn as sns

import numpy as np

from itertools import combinations

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd
from pyspark.sql.window import Window

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier




In [2]:
spark = init_spark()
df = load_data(spark, "../../data/US_Accidents_March23.csv")

In [18]:
df = preprocess_data(df)

Features with one unique value: []


In [4]:
df = df.withColumn("Hour", F.hour("Start_Time"))
df = df.withColumn("DayOfWeek", F.dayofweek("Start_Time"))  # Sunday = 1
df = df.withColumn("Month", F.month("Start_Time"))

In [17]:
df = df.drop("Start_Lat", "Start_Lng", "Start_Time", "End_Time")


In [6]:
# 4. Define categorical columns for encoding
categorical_cols = ["State", "Timezone", "Weather_Condition", "Sunrise_Sunset"]
indexers = [StringIndexer(inputCol=col, outputCol=col + "_Index", handleInvalid="keep") for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=col + "_Index", outputCol=col + "_Vec") for col in categorical_cols]

# 5. Define numeric + boolean + derived time columns
numeric_cols = [
    "Distance(mi)", "Temperature(F)", "Humidity(%)", "Pressure(in)",
    "Visibility(mi)", "Wind_Speed(mph)", "Hour", "DayOfWeek", "Month"
]

boolean_cols = [
    "Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit", "Railway",
    "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal"
]

# Fill null boolean columns with False (assumption)
for col in boolean_cols:
    df = df.withColumn(col, F.when(F.col(col).isNull(), F.lit(False)).otherwise(F.col(col).cast("boolean")))

# 6. Assemble all features into a vector
assembler_inputs = numeric_cols + boolean_cols + [col + "_Vec" for col in categorical_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# 7. Define the full pipeline
pipeline = Pipeline(stages=indexers + encoders + [assembler])

# 8. Fit and transform the data
model_ready_df = pipeline.fit(df).transform(df)

# 9. Select final dataset for modeling
final_df = model_ready_df.select("features", "Severity")

In [None]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

rf = RandomForestClassifier(labelCol="Severity", featuresCol="features")
rf_model = rf.fit(train_df)
predictions = rf_model.transform(test_df)

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy
accuracy_eval = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="prediction", metricName="accuracy"
)
accuracy = accuracy_eval.evaluate(predictions)

# F1 Score
f1_eval = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="prediction", metricName="f1"
)
f1 = f1_eval.evaluate(predictions)

# Precision
precision_eval = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="prediction", metricName="weightedPrecision"
)
precision = precision_eval.evaluate(predictions)

# Recall
recall_eval = MulticlassClassificationEvaluator(
    labelCol="Severity", predictionCol="prediction", metricName="weightedRecall"
)
recall = recall_eval.evaluate(predictions)

print(f"Accuracy:  {accuracy:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")


Accuracy:  0.7965
F1 Score:  0.7063
Precision: 0.6345
Recall:    0.7965


In [24]:
from pyspark.ml.classification import LogisticRegression, OneVsRest

# Define base classifier
lr = LogisticRegression(labelCol="Severity", featuresCol="features", maxIter=10)

# Wrap in One-vs-Rest
ovr = OneVsRest(classifier=lr, labelCol="Severity", featuresCol="features")
ovr_model = ovr.fit(train_df)

# Predictions
ovr_predictions = ovr_model.transform(test_df)


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\Ahmed Osama\AppData\Local\Programs\Python\Python313\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\Ahmed Osama\AppData\Local\Programs\Python\Python313\Lib\socket.py", line 719, in readinto
    return self._sock.recv_into(b)
           ~~~~~~~~~~~~~~~~~~~~^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\Ahmed Osama\AppData\Local\Programs\Python\Python313\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "c:\Users\Ahmed Osama\AppData\Local\Programs\Python\Python313\Lib\site-packages\py4j\clientserver.py", lin

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Loop over each class to compute AUC individually
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# Store AUCs
auc_scores = []

for i in range(4):  # assuming classes are [1, 2, 3, 4]
    bin_df = test_df.withColumn("label", F.when(F.col("Severity") == i, 1.0).otherwise(0.0))
    bin_model = LogisticRegression(labelCol="label", featuresCol="features").fit(train_df.withColumn("label", F.when(F.col("Severity") == i, 1.0).otherwise(0.0)))
    bin_pred = bin_model.transform(bin_df)
    auc = evaluator.evaluate(bin_pred)
    auc_scores.append((f"Class {i}", auc))

# Show AUC for each class
for cls, auc in auc_scores:
    print(f"{cls} AUC: {auc:.4f}")


# Binary Classification for Severity 

In [10]:
from pyspark.sql.functions import when

binary_df = final_df.withColumn(
    "label", when(F.col("Severity").isin(3, 4), 1).otherwise(0)
)
small_df = binary_df.sample(fraction=0.2, seed=42)
train_df, test_df = small_df.randomSplit([0.8, 0.2], seed=42)

In [11]:
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier,
    GBTClassifier, DecisionTreeClassifier
)

models = {
    "Logistic Regression": LogisticRegression(labelCol="label", featuresCol="features", maxIter=10),
    "Random Forest": RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20),
    "Gradient-Boosted Trees": GBTClassifier(labelCol="label", featuresCol="features", maxIter=20),
    "Decision Tree": DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=5),
}

# Train all models
fitted_models = {name: model.fit(train_df) for name, model in models.items()}
# Predict with each
predictions = {name: model.transform(test_df) for name, model in fitted_models.items()}


In [12]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

def evaluate_model(name, pred_df):
    evaluator_auc = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
    evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")
    evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")

    auc = evaluator_auc.evaluate(pred_df)
    f1 = evaluator_f1.evaluate(pred_df)
    acc = evaluator_acc.evaluate(pred_df)

    return {"Model": name, "AUC": auc, "F1 Score": f1, "Accuracy": acc}

results = [evaluate_model(name, pred_df) for name, pred_df in predictions.items()]
for r in results:
    print(f"{r['Model']}: AUC={r['AUC']:.4f}, F1={r['F1 Score']:.4f}, Accuracy={r['Accuracy']:.4f}")


Logistic Regression: AUC=0.7297, F1=0.7492, Accuracy=0.8077
Random Forest: AUC=0.7509, F1=0.7184, Accuracy=0.8053
Gradient-Boosted Trees: AUC=0.8089, F1=0.7660, Accuracy=0.8192
Decision Tree: AUC=0.6452, F1=0.7438, Accuracy=0.8112


In [15]:
model_save_paths = {
    "Logistic Regression": "./logistic_regression_model",
    "Random Forest": "./random_forest_model",
    "Gradient-Boosted Trees": "./gbt_model",
    "Decision Tree": "./decision_tree_model",
}

for name, model in fitted_models.items():
    model.save(model_save_paths[name])


Py4JJavaError: An error occurred while calling o1981.save.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.hadoop.mapred.FileOutputCommitter.setupJob(FileOutputCommitter.java:131)
	at org.apache.hadoop.mapred.OutputCommitter.setupJob(OutputCommitter.java:265)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:79)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopDataset$1(PairRDDFunctions.scala:1091)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1089)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$4(PairRDDFunctions.scala:1062)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1027)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$3(PairRDDFunctions.scala:1009)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:1008)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$saveAsHadoopFile$2(PairRDDFunctions.scala:965)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:963)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$2(RDD.scala:1623)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1623)
	at org.apache.spark.rdd.RDD.$anonfun$saveAsTextFile$1(RDD.scala:1609)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1609)
	at org.apache.spark.ml.util.DefaultParamsWriter$.saveMetadata(ReadWrite.scala:413)
	at org.apache.spark.ml.classification.LogisticRegressionModel$LogisticRegressionModelWriter.saveImpl(LogisticRegression.scala:1311)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:168)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:242)
	at org.apache.spark.util.SparkFileUtils.createTempDir(SparkFileUtils.scala:103)
	at org.apache.spark.util.SparkFileUtils.createTempDir$(SparkFileUtils.scala:102)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:94)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:377)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:969)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:199)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:222)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1125)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1134)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 25 more
