In [1]:
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

In [2]:
# Loading the encoded train and test data
# Load the data into memory (no mmap_mode)
train_data = joblib.load('encoded_train_data1.joblib')
test_data = joblib.load('encoded_test_data1.joblib')


# Separateing features (X) and target variable (y)
X = train_data.drop(columns=['IncidentGrade'])
y = train_data['IncidentGrade']

# Spliting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

evaluating best model for training data

In [4]:
X_train_subsample = X_train.sample(frac=0.1, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
}

for model_name, model in models.items():
    print(f'Model: {model_name}')
    
    model.fit(X_train_subsample, y_train_subsample)
    
    y_pred = model.predict(X_val)
    
    # Evaluateing the models
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    
    # Displaying the results of the modles
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('Confusion Matrix:')
    print(cm)
    print('-' * 50)

Model: Logistic Regression
Accuracy: 0.5605342599219222
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.86      0.65    775107
           1       0.45      0.07      0.12    390976
           2       0.68      0.49      0.57    628025

    accuracy                           0.56   1794108
   macro avg       0.55      0.48      0.45   1794108
weighted avg       0.56      0.56      0.51   1794108

Confusion Matrix:
[[668844  18253  88010]
 [309003  26986  54987]
 [303518  14678 309829]]
--------------------------------------------------


In [5]:
X_train_subsample = X_train.sample(frac=0.1, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]

models = {
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=42),
}

for model_name, model in models.items():
    print(f'Model: {model_name}')
    
    model.fit(X_train_subsample, y_train_subsample)
    
    y_pred = model.predict(X_val)
    
    # Evaluateing the models
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    
    # Displaying the results of the modles
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('Confusion Matrix:')
    print(cm)
    print('-' * 50)

Model: Random Forest
Accuracy: 0.9480332287688367
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95    775107
           1       0.96      0.91      0.93    390976
           2       0.97      0.94      0.96    628025

    accuracy                           0.95   1794108
   macro avg       0.95      0.94      0.95   1794108
weighted avg       0.95      0.95      0.95   1794108

Confusion Matrix:
[[755777   9399   9931]
 [ 28858 353838   8280]
 [ 29758   7008 591259]]
--------------------------------------------------


In [6]:
X_train_subsample = X_train.sample(frac=0.1, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]

models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

for model_name, model in models.items():
    print(f'Model: {model_name}')
    
    model.fit(X_train_subsample, y_train_subsample)
    
    y_pred = model.predict(X_val)
    
    # Evaluateing the models
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    
    # Displaying the results of the modles
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('Confusion Matrix:')
    print(cm)
    print('-' * 50)

Model: Decision Tree
Accuracy: 0.9666413616125673
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    775107
           1       0.95      0.95      0.95    390976
           2       0.97      0.97      0.97    628025

    accuracy                           0.97   1794108
   macro avg       0.96      0.96      0.96   1794108
weighted avg       0.97      0.97      0.97   1794108

Confusion Matrix:
[[753103  11861  10143]
 [ 11159 371545   8272]
 [ 10184   8230 609611]]
--------------------------------------------------


In [8]:
X_train_subsample = X_train.sample(frac=0.1, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]

models = {
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

for model_name, model in models.items():
    print(f'Model: {model_name}')
    
    model.fit(X_train_subsample, y_train_subsample)
    
    y_pred = model.predict(X_val)
    
    # Evaluateing the models
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    
    # Displaying the results of the modles
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('Confusion Matrix:')
    print(cm)
    print('-' * 50)

Model: Gradient Boosting
Accuracy: 0.8067908955313727
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.96      0.82    775107
           1       0.89      0.61      0.72    390976
           2       0.93      0.75      0.83    628025

    accuracy                           0.81   1794108
   macro avg       0.85      0.77      0.79   1794108
weighted avg       0.83      0.81      0.80   1794108

Confusion Matrix:
[[741473  17446  16188]
 [135891 237773  17312]
 [148152  11649 468224]]
--------------------------------------------------


In [9]:
# Createing a report data
report = {
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting'],
    'Accuracy': [0.56,0.95,0.97,0.81],
    'Macro-F1 Score': [0.45,0.95,0.96,0.79],
    'Precision': [0.55,0.95,0.96,0.85],
    'Recall': [0.48,0.94,0.96,0.77]
}

df = pd.DataFrame(report)

print("Comparison Table:")
print(df.to_string(index=False))

best_models_with_max_f1 = df[df['Macro-F1 Score'] == df['Macro-F1 Score'].max()]

if len(best_models_with_max_f1) > 1:
    best_model = best_models_with_max_f1.loc[best_models_with_max_f1['Accuracy'].idxmax()]
else:
    best_model = df.loc[df['Macro-F1 Score'].idxmax()]

print("\nBest Model Based on Macro-F1 Score (and Accuracy in case of a tie):")
print(best_model)

Comparison Table:
              Model  Accuracy  Macro-F1 Score  Precision  Recall
Logistic Regression      0.56            0.45       0.55    0.48
      Decision Tree      0.95            0.95       0.95    0.94
      Random Forest      0.97            0.96       0.96    0.96
  Gradient Boosting      0.81            0.79       0.85    0.77

Best Model Based on Macro-F1 Score (and Accuracy in case of a tie):
Model             Random Forest
Accuracy                   0.97
Macro-F1 Score             0.96
Precision                  0.96
Recall                     0.96
Name: 2, dtype: object


Applying SMOTE to the training data for class imbalance and doing hyperparameter tuning for best result

In [10]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data1.joblib')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1)

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

if X_train_sampled.select_dtypes(include=['bool']).shape[1] > 0:
    X_train_sampled = X_train_sampled.astype(int)

# Applying SMOTE for multi-class classification (default strategy balances all classes equally)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_sampled, y_train_sampled)

# Hyperparameters for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 75],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=5,
                                   cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fitting the Randomized Search with resampled training data
random_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and model
best_rf = random_search.best_estimator_

# Evaluating on validation data
y_pred = best_rf.predict(X_val)

# Printing the results
print("Best Hyperparameters:", random_search.best_params_)
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Saving the tuned model
joblib.dump(best_rf, "rf_smote_tuned_model.joblib")
print("Model saved as rf_smote_tuned_model.joblib")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92    775107
           1       0.89      0.87      0.88    390976
           2       0.95      0.91      0.93    628025

    accuracy                           0.92   1794108
   macro avg       0.91      0.91      0.91   1794108
weighted avg       0.92      0.92      0.92   1794108

Confusion Matrix:
[[730008  25805  19294]
 [ 37331 341529  12116]
 [ 39914  15689 572422]]
Model saved as rf_smote_tuned_model.joblib


In [11]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data1.joblib')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

# Random Forest without SMOTE
rf_no_smote = RandomForestClassifier(random_state=42, n_jobs=-1)

# Training the model
rf_no_smote.fit(X_train_sampled, y_train_sampled)
y_pred_no_smote = rf_no_smote.predict(X_val)

print("Classification Report Without SMOTE:")
print(classification_report(y_val, y_pred_no_smote))

print("Confusion Matrix Without SMOTE:")
print(confusion_matrix(y_val, y_pred_no_smote))

import joblib
joblib.dump(rf_no_smote, "rf_no_smote_model.joblib")

Classification Report Without SMOTE:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92    775107
           1       0.94      0.84      0.89    390976
           2       0.95      0.90      0.93    628025

    accuracy                           0.92   1794108
   macro avg       0.92      0.90      0.91   1794108
weighted avg       0.92      0.92      0.92   1794108

Confusion Matrix Without SMOTE:
[[746683  13084  15340]
 [ 50018 329436  11522]
 [ 50547   9253 568225]]


['rf_no_smote_model.joblib']

Evaluation of Best Random Forest Model on Test Data

In [12]:
# Loading the saved Random Forest model
best_rf = joblib.load("rf_smote_tuned_model.joblib")

# Loading the test dataset
test_data = joblib.load('encoded_test_data1.joblib')

# Separateing the features and target from test data
X_test = test_data.drop('IncidentGrade', axis=1)  
y_test = test_data['IncidentGrade']

# Makeing predictions on the test data
y_test_pred = best_rf.predict(X_test)

# Evaluateing the saved model on the test data
print("\nClassification Report on Test Data:")
report = classification_report(y_test, y_test_pred, output_dict=True)
print(classification_report(y_test, y_test_pred))

macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']

print("\nMacro-F1 Score: {:.2f}".format(macro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("\nConfusion Matrix on Test Data:")
print(confusion_matrix(y_test, y_test_pred))


Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.92      0.71      0.80   1630942
           1       0.62      0.90      0.73    868897
           2       0.90      0.88      0.89   1422856

    accuracy                           0.81   3922695
   macro avg       0.81      0.83      0.81   3922695
weighted avg       0.84      0.81      0.82   3922695


Macro-F1 Score: 0.81
Macro Precision: 0.81
Macro Recall: 0.83

Confusion Matrix on Test Data:
[[1165068  361653  104221]
 [  51374  780323   37200]
 [  56414  116421 1250021]]


Applying SMOTE-ENN to the training data for class imbalance and doing hyperparameter tuning for best result

(SMOTE + Edited Nearest Neighbors)

SMOTE: Adds synthetic samples to balance the classes.

SMOTE-ENN: Adds synthetic samples and then removes noisy or ambiguous samples for better data quality.

In [13]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data1.joblib')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1)

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

if X_train_sampled.select_dtypes(include=['bool']).shape[1] > 0:
    X_train_sampled = X_train_sampled.astype(int)

# Applying SMOTE for multi-class classification (default strategy balances all classes equally)
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_sampled, y_train_sampled)

# Hyperparameters for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 75],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=5,
                                   cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fitting the Randomized Search with resampled training data
random_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and model
best_rf = random_search.best_estimator_

# Evaluating on validation data
y_pred = best_rf.predict(X_val)

# Printing the results
print("Best Hyperparameters:", random_search.best_params_)
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Saving the tuned model
joblib.dump(best_rf, "rf_smote_enn_tuned_model.joblib")
print("Model saved as rf_smote_enn_tuned_model.joblib")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87    775107
           1       0.75      0.85      0.80    390976
           2       0.95      0.81      0.87    628025

    accuracy                           0.85   1794108
   macro avg       0.85      0.85      0.85   1794108
weighted avg       0.86      0.85      0.86   1794108

Confusion Matrix:
[[689352  68705  17050]
 [ 47409 334047   9520]
 [ 77757  41825 508443]]
Model saved as rf_smote_enn_tuned_model.joblib


In [14]:
#finally predicting on test data using 
# Loading the saved Random Forest model
best_rf = joblib.load("rf_smote_tuned_model.joblib")

# Loading the test dataset
test_data = joblib.load('encoded_test_data1.joblib')

# Separateing the features and target from test data
X_test = test_data.drop('IncidentGrade', axis=1)  
y_test = test_data['IncidentGrade']

# Makeing predictions on the test data
y_test_pred = best_rf.predict(X_test)

# Evaluateing the saved model on the test data
print("\nClassification Report on Test Data:")
report = classification_report(y_test, y_test_pred, output_dict=True)
print(classification_report(y_test, y_test_pred))

macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']

print("\nMacro-F1 Score: {:.2f}".format(macro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("\nConfusion Matrix on Test Data:")
print(confusion_matrix(y_test, y_test_pred))


Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.92      0.71      0.80   1630942
           1       0.62      0.90      0.73    868897
           2       0.90      0.88      0.89   1422856

    accuracy                           0.81   3922695
   macro avg       0.81      0.83      0.81   3922695
weighted avg       0.84      0.81      0.82   3922695


Macro-F1 Score: 0.81
Macro Precision: 0.81
Macro Recall: 0.83

Confusion Matrix on Test Data:
[[1165068  361653  104221]
 [  51374  780323   37200]
 [  56414  116421 1250021]]
