In [None]:
!pip install findspark
!pip install pyspark
!apt-get install -qq openjdk-17-jdk-headless
from google.colab import drive

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Selecting previously unselected package openjdk-17-jre-headless:amd64.
(Reading database ... 126101 files and directories currently installed.)
Preparing to unpack .../openjdk-17-jre-headless_17.0.14+7-1~22.04.1_amd64.deb ...
Unpacking openjdk-17-jre-headless:amd64 (17.0.14+7-1~22.04.1) ...
Selecting previously unselected package openjdk-17-jdk-headless:amd64.
Preparing to unpack .../openjdk-17-jdk-headless_17.0.14+7-1~22.04.1_amd64.deb ...
Unpacking openjdk-17-jdk-headless:amd64 (17.0.14+7-1~22.04.1) ...
Setting up openjdk-17-jre-headless:amd64 (17.0.14+7-1~22.04.1) ...
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin/java to provide /usr/bin/java (java) in auto mode
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin/jpackage 

In [None]:
# Start from a clean state: flush and unmount any existing Drive mount,
# then mount Google Drive at the location the notebook reads from.
drive_mount_point = '/content/drive'
drive.flush_and_unmount()
drive.mount(drive_mount_point)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
import findspark, os

# findspark.init() is called before pyspark is imported, as in the original cell order.
findspark.init()
from pyspark.sql import SparkSession

# Session settings, applied to the builder in this order.
# NOTE(review): no master is configured here; if the session runs in local mode
# the executor-memory setting may have no effect - confirm.
spark_settings = (
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "4g"),
    ("spark.sql.shuffle.partitions", "200"),
)

spark_builder = SparkSession.builder.appName("XGBoost")
for setting_name, setting_value in spark_settings:
    spark_builder = spark_builder.config(setting_name, setting_value)

# Reuses an already running session if there is one, otherwise creates it.
spark = spark_builder.getOrCreate()

In [None]:
# Load the pre-split training and test sets from Google Drive.
# Drive is mounted at the absolute path /content/drive, so the dataset directory
# is addressed absolutely too; the previous "./drive/..." form only resolved
# while the working directory happened to be /content.
DATASET_DIR = "/content/drive/MyDrive/dataset"

train = spark.read.parquet(f"{DATASET_DIR}/train_selected_tree.parquet")
test = spark.read.parquet(f"{DATASET_DIR}/test_selected_tree.parquet")

In [None]:
# Print the column names, types and nullability of the training DataFrame.
train.printSchema()

root
 |-- Casualty_Severity_ind: double (nullable = true)
 |-- Casualty_Type_ind: double (nullable = true)
 |-- Vehicle_Manoeuvre_ind: double (nullable = true)
 |-- Number_of_Casualties_ind: double (nullable = true)
 |-- Speed_limit_ind: double (nullable = true)
 |-- Urban_or_Rural_Area_ind: double (nullable = true)
 |-- Did_Police_Officer_Attend_Scene_of_Accident_ind: double (nullable = true)
 |-- Junction_Detail_ind: double (nullable = true)
 |-- Vehicle_Leaving_Carriageway_ind: double (nullable = true)
 |-- Junction_Location_ind: double (nullable = true)
 |-- Vehicle_Type_ind: double (nullable = true)
 |-- Junction_Control_ind: double (nullable = true)
 |-- 1st_Point_of_Impact_ind: double (nullable = true)
 |-- Number_of_Vehicles_ind: double (nullable = true)
 |-- Light_Conditions_ind: double (nullable = true)
 |-- Hit_Object_off_Carriageway_ind: double (nullable = true)
 |-- Accident_Severity_ind: double (nullable = true)
 |-- classWeight: double (nullable = true)



In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
import xgboost as xgb
from xgboost.spark import SparkXGBClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Every column is a model input except the label and the per-row sample weight.
# NOTE(review): Casualty_Severity_ind stays among the features; it may be closely
# tied to the Accident_Severity_ind label - confirm it does not leak the target.
non_feature_cols = ("Accident_Severity_ind", "classWeight")
feature_cols = [name for name in train.columns if name not in non_feature_cols]

In [None]:
# Pack the selected feature columns into the single vector column "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

In [None]:
# Keyword arguments for the distributed XGBoost classifier, grouped by purpose.
# num_workers is not set (it was commented out in the original cell); the
# recorded training log reports a run on 1 worker.
xgb_params = {
    # Column wiring: assembled features, label, and per-row weights.
    "features_col": "features",
    "label_col": "Accident_Severity_ind",
    "weight_col": "classWeight",
    # Tree shape and boosting step.
    "max_depth": 6,
    "learning_rate": 0.1,
    # Row / column subsampling.
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # Split constraints and regularisation.
    "min_child_weight": 3,
    "gamma": 0.1,
    "reg_alpha": 0,
    "reg_lambda": 1,
    # Three-class problem, fixed seed, multiclass log-loss as the evaluation metric.
    "num_class": 3,
    "random_state": 42,
    "eval_metric": "mlogloss",
}

xgb_classifier = SparkXGBClassifier(**xgb_params)

In [None]:
# Two-stage pipeline: assemble the feature vector, then train the classifier on it.
pipeline_stages = [assembler, xgb_classifier]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Fit both pipeline stages on the training set; the result is the fitted
# pipeline (assembler + trained classifier), not just the XGBoost model.
xgb_model = pipeline.fit(train)

INFO:XGBoost-PySpark:Running xgboost-2.1.4 on 1 workers with
	booster params: {'objective': 'multi:softprob', 'colsample_bytree': 0.8, 'device': 'cpu', 'eval_metric': 'mlogloss', 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 3, 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8, 'num_class': 3, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!


In [None]:
# Score the test set with the fitted pipeline.
# The result is cached because later cells run several separate actions on it
# (one evaluation per metric, plus the crosstab for the confusion matrix);
# without caching each of those would re-run the whole scoring step.
predictions = xgb_model.transform(test).cache()

In [None]:
# Evaluator comparing the true label column with the model's "prediction" column.
# The metric to compute is chosen later, per evaluation.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="Accident_Severity_ind")

In [None]:
# Display name -> metric name understood by the evaluator, in reporting order.
metrics = dict(
    [
        ("F1-score", "f1"),
        ("Accuracy", "accuracy"),
        ("Weighted Precision", "weightedPrecision"),
        ("Weighted Recall", "weightedRecall"),
    ]
)

In [None]:
# Report each metric on the test-set predictions.
# The metric name is passed as a per-call parameter override instead of calling
# setMetricName(), so the shared evaluator is not mutated and re-running this
# or any other cell does not depend on which metric was evaluated last.
print("metrics:")
for name, metric in metrics.items():
    score = evaluator.evaluate(predictions, {evaluator.metricName: metric})
    print(f"{name}: {score:.4f}")

metrics:
F1-score: 0.9002
Accuracy: 0.8805
Weighted Precision: 0.9293
Weighted Recall: 0.8805


In [None]:
# Confusion matrix: one row per true label, one column per predicted label.
conf_matrix = predictions.crosstab("Accident_Severity_ind", "prediction")
# crosstab returns its rows in no particular order; sort them by the label
# column (the first column) so the rows line up with the predicted-class
# columns and the diagonal is easy to read.
conf_matrix = conf_matrix.orderBy(conf_matrix.columns[0])
print("Confusion Matrix:")
conf_matrix.show()

Confusion Matrix:
+--------------------------------+------+-----+----+
|Accident_Severity_ind_prediction|   0.0|  1.0| 2.0|
+--------------------------------+------+-----+----+
|                             1.0|  3353|15736|3713|
|                             0.0|126164| 4363|7562|
|                             2.0|   228|  366|2455|
+--------------------------------+------+-----+----+



In [None]:
# Matthews Correlation Coefficient (MCC); for the description, see the decision-tree notebook

In [None]:
from pyspark.sql.functions import col
import math

# Flatten the crosstab into (true_label, pred_label, count) triples.
# The first crosstab column holds the true label; every remaining column is
# named after one predicted class.
columns = conf_matrix.columns
label_column = columns[0]
prediction_columns = columns[1:]
class_labels = [float(c) for c in prediction_columns]  # e.g. 0.0, 1.0, 2.0

# rows: (true_label, pred_label, count)
matrix_entries = []
for row in conf_matrix.collect():
    true_label = float(row[label_column])
    # Look each count up by the original column name. Rebuilding the name with
    # str(float(...)) only works when the name is exactly the float repr
    # (a column called "0" would be looked up as "0.0" and fail).
    for pred_column, pred_label in zip(prediction_columns, class_labels):
        count = int(row[pred_column])
        matrix_entries.append((true_label, pred_label, count))


In [None]:
# Running totals for the multiclass MCC, accumulated from the
# (true_label, pred_label, count) triples in matrix_entries.
#
#   c   - total number of samples
#   s   - number of correctly classified samples (true label == predicted label)
#   p_k - number of samples predicted as class k
#   t_k - number of samples whose true class is k
#
# NOTE: c and s are used the opposite way round from the scikit-learn notation
# (where c is the correct count and s the total). The names are kept as they
# are because the cell that evaluates the formula reads c and s with exactly
# this meaning.

from collections import defaultdict

c = 0
s = 0
p_k = defaultdict(int)
t_k = defaultdict(int)

for true_label, pred_label, count in matrix_entries:
    c += count
    p_k[pred_label] += count
    t_k[true_label] += count
    # Diagonal of the confusion matrix: prediction matches the true label.
    if true_label == pred_label:
        s += count


In [None]:
# Multiclass Matthews Correlation Coefficient from the running totals.
# In this notebook c holds the total sample count and s the number of correct
# predictions (that is how the accumulation loop fills them).
sum_pk2 = sum(n_predicted ** 2 for n_predicted in p_k.values())
sum_tk2 = sum(n_actual ** 2 for n_actual in t_k.values())

# Sum over classes of (predicted as k) * (truly k).
cross_terms = sum(p_k[label] * t_k[label] for label in class_labels)
numerator = c * s - cross_terms
denominator = math.sqrt((c**2 - sum_pk2) * (c**2 - sum_tk2))

# A zero denominator is reported as an MCC of 0.0 instead of dividing by zero.
if denominator != 0:
    mcc = numerator / denominator
else:
    mcc = 0.0
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")


Matthews Correlation Coefficient (MCC): 0.6324
