In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import matthews_corrcoef, make_scorer
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset that every model cell below reads its features and target from.
df = pd.read_csv(filepath_or_buffer="transformed_dataset.csv")

In [None]:
# Preview the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

In [None]:
# k-NN, SMOTE, mcc scorer

import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import matthews_corrcoef, make_scorer



# Features are every column except the identifier/date/coordinate columns and
# the target itself; the target is the severity class.
X = df.drop(columns=['Name', 'Date', 'Latitude', 'Longitude', 'Severity_Class'])
y = df['Severity_Class']

# Stratified 60/40 split so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# SMOTE interpolates between a sample and its same-class neighbours, so the
# neighbour count is limited to (rarest class size - 1) and capped at 5.
k_neighbors = min(int(y_train.value_counts().min()) - 1, 5)

# Oversample the training partition only; the test partition stays untouched.
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

class_counts = y_train_smote.value_counts()
print("Class counts after SMOTE: ", class_counts)

k_classifier = KNeighborsClassifier()

# Grid search parameters
n_neighbors = np.arange(1,20)
weights = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']

param_grid = {'n_neighbors': n_neighbors, 'weights' : weights, 'algorithm' : algorithm}

# NOTE(review): resampling happens before the CV split, so validation folds
# contain synthetic samples and the CV score may be optimistic compared with
# the held-out test score. Consider moving SMOTE into an imblearn Pipeline.
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(k_classifier, param_grid, cv=cv, scoring=make_scorer(matthews_corrcoef), verbose=1, n_jobs=-1)
grid_search.fit(X_train_smote, y_train_smote)

best_model = grid_search.best_estimator_

print("Best model parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best parameter setting,
# not a score computed on the full training set.
print("Best cross-validated MCC:", grid_search.best_score_)

# Evaluate the refitted best model on the untouched test partition.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

# Pass the model's class order explicitly so the probability columns are
# matched to the right labels even if y_test happens to lack a class.
y_prob = best_model.predict_proba(X_test)
auc_roc = roc_auc_score(y_test, y_prob, average='macro', multi_class='ovo', labels=best_model.classes_)

print("Test MCC:", matthews_corrcoef(y_test, y_pred))
print("Test Accuracy:", accuracy)
print("Macro averaged precision:", macro_precision)
print("Macro averaged recall:", macro_recall)
print("Macro averaged F1-score:", macro_f1)
print("AUC-ROC score:", auc_roc)

clf_report = classification_report(y_test, y_pred)
print("Classification report:\n", clf_report)

In [None]:
# Plotting the confusion matrix using Seaborn

# Sorted class labels, used both to order the matrix and to label the axes.
class_labels = np.sort(y.unique())

# Build the matrix from the test predictions of the model cell above; fixing
# `labels` keeps rows/columns aligned with the tick labels even if a class is
# absent from y_test or y_pred.
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4)
sns.heatmap(
    conf_mat,
    annot=True,
    annot_kws={"size": 16},
    fmt='g',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)

plt.xlabel('Predicted Label', fontsize=18)
plt.ylabel('True Label', fontsize=18)
plt.title('Confusion Matrix for k-NN Model', fontsize=20)
plt.show()

In [None]:
# Random Forest, SMOTE, mcc scorer

import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import matthews_corrcoef, make_scorer


# Features are every column except the identifier/date/coordinate columns and
# the target itself; the target is the severity class.
X = df.drop(columns=['Name', 'Date', 'Latitude', 'Longitude', 'Severity_Class'])
y = df['Severity_Class']

# Stratified 60/40 split so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# SMOTE interpolates between a sample and its same-class neighbours, so the
# neighbour count is limited to (rarest class size - 1) and capped at 5.
k_neighbors = min(int(y_train.value_counts().min()) - 1, 5)

# Oversample the training partition only; the test partition stays untouched.
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

rf_classifier = RandomForestClassifier(random_state=42)

# 'log_loss' is omitted on purpose: scikit-learn treats it as the same Shannon
# information-gain criterion as 'entropy', so searching both only repeats fits.
criterion = ['gini', 'entropy']
n_estimators = [100,200,300,500,1000]
max_depth = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]
min_samples_leaf = [1, 2, 4]
bootstrap = [True]

param_grid = {'criterion': criterion, 'n_estimators' : n_estimators, 'min_samples_leaf' : min_samples_leaf,
              'max_depth': max_depth, 'bootstrap': bootstrap}

# NOTE(review): resampling happens before the CV split, so validation folds
# contain synthetic samples and the CV score may be optimistic compared with
# the held-out test score. Consider moving SMOTE into an imblearn Pipeline.
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=cv, scoring=make_scorer(matthews_corrcoef), verbose=1, n_jobs=-1)
grid_search.fit(X_train_smote, y_train_smote)

best_model = grid_search.best_estimator_

print("Best model parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best parameter setting,
# not a score computed on the full training set.
print("Best cross-validated MCC:", grid_search.best_score_)

# Evaluate the refitted best model on the untouched test partition.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

# Pass the model's class order explicitly so the probability columns are
# matched to the right labels even if y_test happens to lack a class.
y_prob = best_model.predict_proba(X_test)
auc_roc = roc_auc_score(y_test, y_prob, average='macro', multi_class='ovo', labels=best_model.classes_)

print("Test MCC:", matthews_corrcoef(y_test, y_pred))
print("Test Accuracy:", accuracy)
print("Macro averaged precision:", macro_precision)
print("Macro averaged recall:", macro_recall)
print("Macro averaged F1-score:", macro_f1)
print("AUC-ROC score:", auc_roc)

clf_report = classification_report(y_test, y_pred)
print("Classification report:\n", clf_report)

In [None]:
# Plotting the confusion matrix using Seaborn

# Sorted class labels, used both to order the matrix and to label the axes.
class_labels = np.sort(y.unique())

# Build the matrix from the test predictions of the model cell above; fixing
# `labels` keeps rows/columns aligned with the tick labels even if a class is
# absent from y_test or y_pred.
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4)
sns.heatmap(
    conf_mat,
    annot=True,
    annot_kws={"size": 16},
    fmt='g',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)

plt.xlabel('Predicted Label', fontsize=18)
plt.ylabel('True Label', fontsize=18)
plt.title('Confusion Matrix for Random Forest Model', fontsize=20)
plt.show()

In [None]:
# Multinomial Logistic Regression, SMOTE, mcc scorer

import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import matthews_corrcoef, make_scorer


# Features are every column except the identifier/date/coordinate columns and
# the target itself; the target is the severity class.
X = df.drop(columns=['Name', 'Date', 'Latitude', 'Longitude', 'Severity_Class'])
y = df['Severity_Class']

# Stratified 60/40 split so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# SMOTE interpolates between a sample and its same-class neighbours, so the
# neighbour count is limited to (rarest class size - 1) and capped at 5.
k_neighbors = min(int(y_train.value_counts().min()) - 1, 5)

# Oversample the training partition only; the test partition stays untouched.
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# `multi_class` is deprecated since scikit-learn 1.5; with the solvers searched
# below the default already fits a multinomial model for 3+ classes, so the
# argument is left at its default to stay compatible with newer releases.
logreg_model = LogisticRegression(max_iter=5000, random_state=42)

# Regularisation strength over nine decades (1e-4 ... 1e4) for each solver.
param_grid = {'C': np.logspace(-4, 4, num=9),
              'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']}

# NOTE(review): resampling happens before the CV split, so validation folds
# contain synthetic samples and the CV score may be optimistic compared with
# the held-out test score. Consider moving SMOTE into an imblearn Pipeline.
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(logreg_model, param_grid, cv=cv, scoring=make_scorer(matthews_corrcoef), verbose=1, n_jobs=-1)
grid_search.fit(X_train_smote, y_train_smote)

best_model = grid_search.best_estimator_

print("Best model parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best parameter setting,
# not a score computed on the full training set.
print("Best cross-validated MCC:", grid_search.best_score_)

# Evaluate the refitted best model on the untouched test partition.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

# Pass the model's class order explicitly so the probability columns are
# matched to the right labels even if y_test happens to lack a class.
y_prob = best_model.predict_proba(X_test)
auc_roc = roc_auc_score(y_test, y_prob, average='macro', multi_class='ovo', labels=best_model.classes_)

print("Test MCC:", matthews_corrcoef(y_test, y_pred))
print("Test Accuracy:", accuracy)
print("Macro averaged precision:", macro_precision)
print("Macro averaged recall:", macro_recall)
print("Macro averaged F1-score:", macro_f1)
print("AUC-ROC score:", auc_roc)

clf_report = classification_report(y_test, y_pred)
print("Classification report:\n", clf_report)

In [None]:
# Plotting the confusion matrix using Seaborn

# Sorted class labels, used both to order the matrix and to label the axes.
class_labels = np.sort(y.unique())

# Build the matrix from the test predictions of the model cell above; fixing
# `labels` keeps rows/columns aligned with the tick labels even if a class is
# absent from y_test or y_pred.
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4)
sns.heatmap(
    conf_mat,
    annot=True,
    annot_kws={"size": 16},
    fmt='g',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)

plt.xlabel('Predicted Label', fontsize=18)
plt.ylabel('True Label', fontsize=18)
plt.title('Confusion Matrix for Multinomial Logistic Regression Model', fontsize=20)
plt.show()

In [None]:
# SVM with One-Vs-One, SMOTE, mcc scorer

import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_recall_fscore_support, confusion_matrix

# Features are every column except the identifier/date/coordinate columns and
# the target itself; the target is the severity class.
X = df.drop(columns=['Name', 'Date', 'Latitude', 'Longitude', 'Severity_Class'])
y = df['Severity_Class']

# Stratified 60/40 split so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# SMOTE interpolates between a sample and its same-class neighbours, so the
# neighbour count is limited to (rarest class size - 1) and capped at 5.
k_neighbors = min(int(y_train.value_counts().min()) - 1, 5)

# Oversample the training partition only; the test partition stays untouched.
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# SVM model with a One vs One scheme.
# probability=True calibrates probabilities with an internal shuffled
# cross-validation, so the seed is fixed to keep predict_proba (and therefore
# the AUC below) reproducible, matching the seeded split and SMOTE above.
svc = SVC(decision_function_shape='ovo', probability=True, random_state=42)

# Grid search parameters
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
}

# NOTE(review): resampling happens before the CV split, so validation folds
# contain synthetic samples and the CV score may be optimistic compared with
# the held-out test score. Consider moving SMOTE into an imblearn Pipeline.
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(svc, param_grid, cv=cv, scoring=make_scorer(matthews_corrcoef), verbose=1, n_jobs=-1)
grid_search.fit(X_train_smote, y_train_smote)

best_model = grid_search.best_estimator_

print("Best model parameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated score of the best parameter setting,
# not a score computed on the full training set.
print("Best cross-validated MCC:", grid_search.best_score_)

# Evaluate the refitted best model on the untouched test partition.
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')

# Pass the model's class order explicitly so the probability columns are
# matched to the right labels even if y_test happens to lack a class.
y_prob = best_model.predict_proba(X_test)
auc_roc = roc_auc_score(y_test, y_prob, average='macro', multi_class='ovo', labels=best_model.classes_)

print("Test MCC:", matthews_corrcoef(y_test, y_pred))
print("Test Accuracy:", accuracy)
print("Macro averaged precision:", macro_precision)
print("Macro averaged recall:", macro_recall)
print("Macro averaged F1-score:", macro_f1)
print("AUC-ROC score:", auc_roc)

clf_report = classification_report(y_test, y_pred)
print("Classification report:\n", clf_report)

In [None]:
# Plotting the confusion matrix using Seaborn

# Sorted class labels, used both to order the matrix and to label the axes.
class_labels = np.sort(y.unique())

# Build the matrix from the test predictions of the model cell above; fixing
# `labels` keeps rows/columns aligned with the tick labels even if a class is
# absent from y_test or y_pred.
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4)
sns.heatmap(
    conf_mat,
    annot=True,
    annot_kws={"size": 16},
    fmt='g',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)

plt.xlabel('Predicted Label', fontsize=18)
plt.ylabel('True Label', fontsize=18)
plt.title('Confusion Matrix for SVM Model', fontsize=20)
plt.show()