# Aggregated methods:
Logistic Regression (LR), Supprt Vector Machine (SVM), XGBoost, and LSTM

In [71]:
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, Input, Model
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam

# Disable TensorFlow v2 behavior
tf.compat.v1.disable_v2_behavior()

# Load and preprocess data
tfrdh_train_path = '../data/Traffic_flow_and_risky_driving_behavior_train.csv'
tfrdh_test_path = '../data/Traffic_flow_and_risky_driving_behavior_test.csv'
tfrdh_train_data = pd.read_csv(tfrdh_train_path)
tfrdh_test_data = pd.read_csv(tfrdh_test_path)

train_X = tfrdh_train_data.drop(['Crash','event_id'], axis=1)
train_y = tfrdh_train_data['Crash']
test_X = tfrdh_test_data.drop(['Crash','event_id'], axis=1)
test_y = tfrdh_test_data['Crash']

# Compute class weights
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(train_y), y=train_y)
class_weights_dict = dict(enumerate(class_weights))

# Train traditional ML models
logistic_model = LogisticRegression(class_weight=class_weights_dict)
svm_model = SVC(class_weight=class_weights_dict, probability=True)
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=class_weights[1])

logistic_model.fit(train_X, train_y)
svm_model.fit(train_X, train_y)
xgb_model.fit(train_X, train_y)

def find_threshold_for_far(y_true, y_scores, target_far):
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    closest_far_index = np.argmin(np.abs(fpr - target_far))
    return thresholds[closest_far_index]

def calculate_metrics_with_threshold(model, X, y_true, target_far, is_lstm=False):
    if is_lstm:
        y_scores = model.predict(X).flatten()
    else:
        y_scores = model.predict_proba(X)[:, 1]
        
    auc = roc_auc_score(y_true, y_scores)
    threshold = find_threshold_for_far(y_true, y_scores, target_far)
    y_pred = (y_scores >= threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    actual_far = fp / (fp + tn)

    return {
        "Threshold": threshold,
        "Accuracy": accuracy,
        "Recall": recall,
        "Actual FAR": actual_far,
        "AUC": auc
    }

# Evaluate traditional ML models
metrics_logistic = calculate_metrics_with_threshold(logistic_model, test_X, test_y, 0.2)
metrics_svm = calculate_metrics_with_threshold(svm_model, test_X, test_y, 0.2)
metrics_xgb = calculate_metrics_with_threshold(xgb_model, test_X, test_y, 0.2)

# Data processing for LSTM model
n_hours = 5
n_features = 18 + 24  # 18 traffic flow variables and 24 risky driving behavior variables
n_obs = n_hours * n_features

train_X = train_X.to_numpy()
test_X = test_X.to_numpy()

train_X = train_X.reshape((-1, n_hours, n_features))
test_X = test_X.reshape((-1, n_hours, n_features))

train_y = train_y.to_numpy()
test_y = test_y.to_numpy()

# Compute class weights for LSTM model
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_y), y=train_y)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

def mortality_loss(y_true, y_pred):
    sample_weights = (1 - y_true) * class_weights_dict[0] + y_true * class_weights_dict[1]
    bce = K.binary_crossentropy(y_true, y_pred)
    return K.mean(sample_weights * bce, axis=-1)

# LSTM model
lstm_input = Input(shape=(train_X.shape[1], train_X.shape[2]))
lstm_out = layers.LSTM(256, return_sequences=False)(lstm_input)
dropout = layers.Dropout(0.5)(lstm_out)
lstm_final = layers.Dense(1, activation='sigmoid')(dropout)
lstm_model = Model(lstm_input, lstm_final)

optimizer = Adam(learning_rate=0.0001)
lstm_model.compile(optimizer=optimizer, loss=mortality_loss, metrics=[tf.keras.metrics.AUC(name='auc')])

lstm_model.fit(train_X, train_y, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate LSTM model with custom threshold
metrics_lstm = calculate_metrics_with_threshold(lstm_model, test_X, test_y, 0.2, is_lstm=True)

# Combine results into a DataFrame
results = pd.DataFrame({
    "Model": ["Logistic Regression", "SVM", "XGBoost", "LSTM"],
    "Threshold": [metrics_logistic["Threshold"], metrics_svm["Threshold"], metrics_xgb["Threshold"], metrics_lstm["Threshold"]],
    "Accuracy": [metrics_logistic["Accuracy"], metrics_svm["Accuracy"],  metrics_xgb["Accuracy"], metrics_lstm["Accuracy"]],
    "Recall": [metrics_logistic["Recall"], metrics_svm["Recall"],  metrics_xgb["Recall"], metrics_lstm["Recall"]],
    "Actual FAR": [metrics_logistic["Actual FAR"], metrics_svm["Actual FAR"],  metrics_xgb["Actual FAR"], metrics_lstm["Actual FAR"]],
    "AUC": [metrics_logistic["AUC"], metrics_svm["AUC"], metrics_xgb["AUC"], metrics_lstm["AUC"]]
})

# Print the results
print(results)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train on 686 samples, validate on 172 samples
Epoch 1/20

  updates = self.state_updates


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  updates=self.state_updates,


                 Model  Threshold  Accuracy    Recall  Actual FAR       AUC
0  Logistic Regression   0.571272  0.720109  0.421053    0.202055  0.642394
1                  SVM   0.202664  0.752717  0.565789    0.198630  0.741348
2              XGBoost   0.053165  0.747283  0.605263    0.215753  0.764014
3                 LSTM   0.619676  0.744565  0.539474    0.202055  0.742340
