In [None]:
import sys
from pathlib import Path

sys.path.append(str(Path("../..").resolve()))

from src.data_ingestion import *
from src.data_preprocessing import *


from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import NumericType, StringType
from pyspark.sql import functions as F

import seaborn as sns

import numpy as np

from itertools import combinations

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd
from pyspark.sql.window import Window

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier




In [13]:
# Session + raw data. `init_spark` and `load_data` are not defined in this
# notebook; they arrive through the star imports in the first cell
# (presumably from src.data_ingestion -- confirm).
accidents_csv = "../../data/US_Accidents_March23.csv"

spark = init_spark()
df = load_data(spark, accidents_csv)

In [14]:
# df = preprocess_data(df)

In [None]:
# State-level "high risk" classifier.
#
# Aggregates the accident records in `df` to one row per State, labels the
# states whose Risk_Score is above the 75th percentile, fits a logistic
# regression on a handful of state-level averages, and reports test metrics
# plus coefficient-based feature rankings.
#
# All imports this cell needs are grouped here so the cell does not depend on
# an import appearing half-way through it.
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler


# --- Per-state aggregates ---------------------------------------------------
state_features = df.groupBy("State").agg(
    F.avg("Visibility(mi)").alias("Avg_Visibility"),
    # Share of accidents flagged as "Night" (0/1 indicator averaged per state).
    F.avg(F.when(F.col("Sunrise_Sunset") == "Night", 1).otherwise(0)).alias("Prop_Night_Accidents"),
    F.avg("Precipitation(in)").alias("Avg_Precipitation"),
    F.avg("Temperature(F)").alias("Avg_Temperature"),
    F.avg("Distance(mi)").alias("Avg_Accident_Distance"),
    F.countDistinct("City").alias("Num_Unique_Cities"),
    F.avg(F.unix_timestamp("End_Time") - F.unix_timestamp("Start_Time")).alias("Avg_Accident_Duration_Seconds"),
    F.count("*").alias("Total_Accidents"),
    F.avg("Severity").alias("Avg_Severity")
)

# Risk_Score = accident count x mean severity. The two inputs are dropped
# afterwards so they cannot be used as model features.
#
# The result is cached: it is reused by the quantile job, the fit and every
# evaluator below, and without the cache each of those actions would re-run
# the whole groupBy over the raw accident data.
state_features = (
    state_features
    .withColumn("Risk_Score", F.col("Total_Accidents") * F.col("Avg_Severity"))
    .drop("Total_Accidents", "Avg_Severity")
    .cache()
)

# relativeError=0.0 requests the exact quantile.
quantile_75 = state_features.approxQuantile("Risk_Score", [0.75], 0.0)[0]

# Create binary label: 1 if Risk_Score > 75th percentile, 0 otherwise
state_features = state_features.withColumn(
    "Is_High_Risk",
    F.when(F.col("Risk_Score") > quantile_75, 1).otherwise(0)
)


# --- Model ------------------------------------------------------------------
# NOTE(review): Prop_Night_Accidents and Avg_Accident_Duration_Seconds are
# computed above but are not model inputs -- confirm the omission is intended.
feature_cols = [
    "Avg_Visibility", "Avg_Precipitation", "Avg_Temperature",
    "Avg_Accident_Distance", "Num_Unique_Cities"
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip"  # Skip rows with nulls in feature columns
)

# The unfitted estimator gets its own name; `model` is reserved for the fitted
# pipeline so one name never refers to two different kinds of object.
log_reg = LogisticRegression(
    featuresCol="features",
    labelCol="Is_High_Risk",
    maxIter=100,
    regParam=0.0,
    elasticNetParam=0.0
)

pipeline = Pipeline(stages=[assembler, log_reg])

# The frame holds one row per state, so the 20% test split is only a handful
# of rows; treat the metrics below as indicative rather than precise.
train_df, test_df = state_features.randomSplit([0.8, 0.2], seed=42)

model = pipeline.fit(train_df)

predictions = model.transform(test_df)


# --- Evaluation -------------------------------------------------------------
auc_evaluator = BinaryClassificationEvaluator(
    labelCol="Is_High_Risk",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC"
)

metrics = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
evaluators = {
    metric: MulticlassClassificationEvaluator(
        labelCol="Is_High_Risk",
        predictionCol="prediction",
        metricName=metric
    ) for metric in metrics
}

print("Classification Metrics:")
auc = auc_evaluator.evaluate(predictions)
print(f"AUC-ROC: {auc:.4f}")
for metric, evaluator in evaluators.items():
    value = evaluator.evaluate(predictions)
    print(f"{metric.capitalize()}: {value:.4f}")


# --- Coefficients -----------------------------------------------------------
# Raw coefficients of the fitted logistic regression (last pipeline stage),
# ordered by absolute value. Spark reports them on the original feature scale,
# so their magnitudes depend on each feature's units and are NOT directly
# comparable with one another.
coefficients = model.stages[-1].coefficients
importance_pairs = sorted(
    zip(feature_cols, coefficients.toArray()),
    key=lambda pair: abs(pair[1]),
    reverse=True
)

print("\nFeature Importance:")
for feature, importance in importance_pairs:
    print(f"{feature}: {importance:.4f}")

# Scale-adjusted view: coefficient x standard deviation of the feature over
# the training rows the assembler kept (rows with null/NaN features are
# dropped, mirroring handleInvalid="skip"). This expresses every coefficient
# per one standard deviation of its feature, which makes the ranking
# comparable across features.
train_std = (
    train_df
    .dropna(subset=feature_cols)
    .agg(*[F.stddev(name).alias(name) for name in feature_cols])
    .first()
)
standardized_pairs = sorted(
    (
        # stddev is null when fewer than two rows are available; treat as 0.
        (name, float(coef) * (train_std[name] if train_std[name] is not None else 0.0))
        for name, coef in zip(feature_cols, coefficients.toArray())
    ),
    key=lambda pair: abs(pair[1]),
    reverse=True
)

print("\nStandardized Coefficients (coefficient x training std dev):")
for feature, weight in standardized_pairs:
    print(f"{feature}: {weight:.4f}")


Classification Metrics:
AUC-ROC: 0.7778
Accuracy: 0.8182
Weightedprecision: 0.8182
Weightedrecall: 0.8182
F1: 0.8182

Feature Importance:
Avg_Precipitation: -505.8624
Avg_Accident_Distance: -1.0414
Avg_Visibility: -0.7750
Avg_Temperature: 0.2671
Num_Unique_Cities: 0.0154
