# Importing necessary libraries

In [2]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
    --------------------------------------- 0.0/1.4 MB 435.7 kB/s eta 0:00:04
   - -------------------------------------- 0.1/1.4 MB 656.4 kB/s eta 0:00:03
   --- ------------------------------------ 0.1/1.4 MB 798.9 kB/s eta 0:00:02
   ----- ---------------------------------- 0.2/1.4 MB 1.2 MB/s eta 0:00:02
   -------- ------------------------------- 0.3/1.4 MB 1.3 MB/s eta 0:00:01
   ----------- ---------------------------- 0.4/1.4 MB 1.4 MB/s eta 0:00:01
   -------------- ------------------------- 0.5/1.4 MB 1.6 MB/s eta 0:00:01
   ----------------- ---------------------- 0.6/1.4 MB 1.6 MB/s eta 0:00:01
   -------------------- ------------------- 0.7/1.4 MB 1.8 MB/s eta 0:00:01
   ---------------------

In [1]:
import warnings

import joblib
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor

warnings.filterwarnings('ignore')

In [2]:
# Load the encoded train and test data
train_data = joblib.load('encoded_train_data.joblib', mmap_mode='r')
test_data = joblib.load('encoded_test_data.joblib', mmap_mode='r')

# Separating features (X) and target variable (y)
X = train_data.drop(columns=['IncidentGrade'])
y = train_data['IncidentGrade']

# Number of equal-width bins, used only when the target is genuinely continuous
num_classes = 3  # Example: 3 categories, adjust as needed

# Decide how to treat the target:
#   * integer / unsigned / non-numeric dtypes are taken to be class labels
#     already and are used unchanged. Re-binning integer codes with pd.cut
#     would relabel or merge classes whenever the codes are not exactly
#     `num_classes` evenly spaced values.
#   * float dtypes are discretised into `num_classes` equal-width bins; if
#     that fails the target is left as-is and treated as a regression target.
is_classification = True
y_binned = y
if y.dtype.kind == 'f':
    try:
        y_binned = pd.cut(y, bins=num_classes, labels=False)  # Binning for classification
    except Exception as exc:
        # Report why binning failed instead of silently discarding the error
        print(f"Could not bin continuous variable ({exc}). Treating as regression target.")
        y_binned = y
        is_classification = False
y = y_binned

# Splitting the data (80:20); stratify on the labels only for classification
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y if is_classification else None)

# Comparing Machine Learning Models

In [9]:
# Sample for quick training: models are fitted on a fraction of the training
# split, while evaluation below still uses the full validation split.
SUBSAMPLE_FRAC = 0.1
X_train_subsample = X_train.sample(frac=SUBSAMPLE_FRAC, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]  # align labels on the sampled index

# Define models based on classification or regression.
# The regressor classes and regression metrics used in the `else` branches
# must be imported in the notebook's import cell, otherwise that path raises
# NameError as soon as `is_classification` is False.
if is_classification:
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Random Forest Classifier': RandomForestClassifier(n_jobs=-1, random_state=42),
        'Decision Tree Classifier': DecisionTreeClassifier(random_state=42),
        'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42),
        'XGBoost Classifier': XGBClassifier(n_jobs=-1, random_state=42),
        'LightGBM Classifier': LGBMClassifier(n_jobs=-1, random_state=42),
    }
else:
    models = {
        'Random Forest Regressor': RandomForestRegressor(n_jobs=-1, random_state=42),
        'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
        'XGBoost Regressor': XGBRegressor(n_jobs=-1, random_state=42),
        'LightGBM Regressor': LGBMRegressor(n_jobs=-1, random_state=42),
    }

# Train each model on the subsample and print its validation metrics
for model_name, model in models.items():
    print(f'Model: {model_name}')

    model.fit(X_train_subsample, y_train_subsample)
    y_pred = model.predict(X_val)

    if is_classification:
        # Classification evaluation
        accuracy = accuracy_score(y_val, y_pred)
        report = classification_report(y_val, y_pred)
        cm = confusion_matrix(y_val, y_pred)
        print(f'Accuracy: {accuracy}')
        print('Classification Report:')
        print(report)
        print('Confusion Matrix:')
        print(cm)
    else:
        # Regression evaluation
        mse = mean_squared_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        print(f'Mean Squared Error: {mse}')
        print(f'R^2 Score: {r2}')

    print('-' * 50)


Model: Logistic Regression
Accuracy: 0.6460084900128643
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.92      0.73    775107
           1       0.70      0.15      0.25    390976
           2       0.75      0.62      0.67    628025

    accuracy                           0.65   1794108
   macro avg       0.68      0.56      0.55   1794108
weighted avg       0.67      0.65      0.60   1794108

Confusion Matrix:
[[713727  14564  46816]
 [247262  58590  85124]
 [231163  10170 386692]]
--------------------------------------------------
Model: Random Forest Classifier
Accuracy: 0.7092566333799303
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.85      0.76    775107
           1       0.67      0.45      0.54    390976
           2       0.77      0.70      0.73    628025

    accuracy                           0.71   1794108
   macro avg       0.71      0.67      0.6

In [11]:
# Build the model comparison table.
# The metric values are entered by hand, one tuple per model, in the column
# order given by `metric_columns`.
metric_columns = ['Model', 'Accuracy', 'Macro-F1 Score', 'Precision', 'Recall']
metric_rows = [
    ('Logistic Regression', 0.6460, 0.55, 0.67, 0.65),
    ('Random Forest', 0.7093, 0.68, 0.71, 0.71),
    ('Decision Tree', 0.7083, 0.68, 0.71, 0.71),
    ('Gradient Boosting', 0.6533, 0.56, 0.70, 0.65),
    ('XGBoost', 0.6860, 0.62, 0.73, 0.69),
    ('LightGBM', 0.6849, 0.62, 0.72, 0.68),
]

# Transpose the per-model rows into the column -> list-of-values mapping
report = {
    column: list(values)
    for column, values in zip(metric_columns, zip(*metric_rows))
}
df = pd.DataFrame(report)

print("Comparison Table:")
print(df.to_string(index=False))

# Keep every model that reaches the highest Macro-F1 score ...
top_macro_f1 = df['Macro-F1 Score'].max()
best_models_with_max_f1 = df[df['Macro-F1 Score'] == top_macro_f1]

# ... and pick the most accurate of them. With a single candidate this simply
# selects that row, so no separate branch is needed for the no-tie case.
best_model = best_models_with_max_f1.loc[best_models_with_max_f1['Accuracy'].idxmax()]

print("\nBest Model Based on Macro-F1 Score (and Accuracy in case of a tie):")
print(best_model)

Comparison Table:
              Model  Accuracy  Macro-F1 Score  Precision  Recall
Logistic Regression    0.6460            0.55       0.67    0.65
      Random Forest    0.7093            0.68       0.71    0.71
      Decision Tree    0.7083            0.68       0.71    0.71
  Gradient Boosting    0.6533            0.56       0.70    0.65
            XGBoost    0.6860            0.62       0.73    0.69
           LightGBM    0.6849            0.62       0.72    0.68

Best Model Based on Macro-F1 Score (and Accuracy in case of a tie):
Model             Random Forest
Accuracy                 0.7093
Macro-F1 Score             0.68
Precision                  0.71
Recall                     0.71
Name: 1, dtype: object


# Applying SMOTE to the training data to address class imbalance, then tuning hyperparameters for the best result

In [None]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data.joblib', mmap_mode='r')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade'].copy()  # Make a copy of y to avoid the writeable issue

# Bin the target into three equal-width intervals labelled low/medium/high.
# NOTE(review): if IncidentGrade is already an encoded class label, this only
# renames the classes and the saved model will predict these strings rather
# than the encoded grade values - confirm that is what downstream code expects.
y_binned = pd.cut(y, bins=3, labels=["low", "medium", "high"])

# Converting to numeric and handling NaN values
# (values that cannot be parsed become NaN; any column containing NaN is dropped)
X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1)

# Convert boolean columns - and only those - to integers. This is done before
# splitting so the training and validation frames keep identical dtypes;
# casting the whole frame would truncate any float features.
bool_columns = X.select_dtypes(include=['bool']).columns
if len(bool_columns) > 0:
    X = X.astype({column: int for column in bool_columns})

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y_binned, test_size=0.2, random_state=42, stratify=y_binned)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

# SMOTE for multi-class classification (default strategy balances all classes equally)
smote = SMOTE(random_state=42)

# Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Resampling is placed inside the pipeline so that, during cross-validation,
# SMOTE is fitted on each training fold only. Oversampling before the search
# would put synthetic points derived from a fold's validation rows into its
# training data and inflate the cross-validation scores.
smote_rf = ImbPipeline(steps=[('smote', smote), ('rf', rf)])

# Hyperparameters for RandomizedSearchCV (the 'rf__' prefix routes them to the forest step)
param_dist = {
    'rf__n_estimators': [50, 75],
    'rf__max_depth': [10, 20, None],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__bootstrap': [True, False]
}

random_search = RandomizedSearchCV(estimator=smote_rf, param_distributions=param_dist, n_iter=5,
                                   cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fitting the Randomized Search; SMOTE is applied inside each fit
random_search.fit(X_train_sampled, y_train_sampled)

# Best model: the forest refitted (on the SMOTE-resampled sample) with the best parameters
best_rf = random_search.best_estimator_.named_steps['rf']
best_params = {name.split('__', 1)[1]: value for name, value in random_search.best_params_.items()}

# Evaluating on validation data
y_pred = best_rf.predict(X_val)

# Printing the results
print("Best Hyperparameters:", best_params)
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Saving the tuned model
joblib.dump(best_rf, "rf_smote_tuned_model.joblib")
print("Model saved as rf_smote_tuned_model.joblib")



Fitting 3 folds for each of 5 candidates, totalling 15 fits
