## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Scaling
from sklearn.preprocessing import MinMaxScaler

 #Correlation Heatmap
from matplotlib.colors import LinearSegmentedColormap

#Statistical Test
from scipy import stats
from sklearn.impute import SimpleImputer

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import Ridge

pd.set_option('display.max_columns', None)

In [3]:
# You can download the data in the source that is linked above the table of contents

# Read in the data
X_train_encoded = pd.read_csv('../project_data/X_train_encoded.csv', delimiter=',', index_col=0)
X_val_encoded = pd.read_csv('../project_data/X_val_encoded.csv', delimiter=',', index_col=0)

y_train = pd.read_csv('../project_data/y_train.csv',delimiter=',', index_col=0)
y_val= pd.read_csv('../project_data/y_val.csv', delimiter=',', index_col=0)

X_test_encoded = pd.read_csv('../project_data/X_test_encoded.csv',index_col=0)

## 1. Separate numerical and categorical

In [4]:
claim_injury_type_mapping = {
    '4. TEMPORARY': 4-1,
    '2. NON-COMP': 2-1,
    '5. PPD SCH LOSS': 5-1,
    '3. MED ONLY': 3-1,
    '6. PPD NSL': 6-1,
    '1. CANCELLED': 1-1,
    '8. DEATH':8-1,
    '7. PTD': 7-1
}

y_train_encoded = y_train['Claim Injury Type'].map(claim_injury_type_mapping)
y_val_encoded = y_val['Claim Injury Type'].map(claim_injury_type_mapping)

> Separate columns in numerical and categorical

In [5]:
num_columns = ['Age at Injury', 'IME-4 Count', 'Number of Dependents',
               'Days_between_Assembly Date_Accident Date_log',
               'Days_between_C-2 Date_Accident Date_log', 'Average Weekly Wage_log',
               'Industry Code_encoded_5. PPD SCH LOSS',
               'Industry Code_encoded_2. NON-COMP',
               'Industry Code_encoded_3. MED ONLY',
               'Industry Code_encoded_4. TEMPORARY',
               'Industry Code_encoded_1. CANCELLED', 'Industry Code_encoded_8. DEATH',
               'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD',
               'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
               'WCIO Cause of Injury Code_encoded_2. NON-COMP',
               'WCIO Cause of Injury Code_encoded_3. MED ONLY',
               'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
               'WCIO Cause of Injury Code_encoded_1. CANCELLED',
               'WCIO Cause of Injury Code_encoded_8. DEATH',
               'WCIO Cause of Injury Code_encoded_6. PPD NSL',
               'WCIO Cause of Injury Code_encoded_7. PTD',
               'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS',
               'WCIO Nature of Injury Code_encoded_2. NON-COMP',
               'WCIO Nature of Injury Code_encoded_3. MED ONLY',
               'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
               'WCIO Nature of Injury Code_encoded_1. CANCELLED',
               'WCIO Nature of Injury Code_encoded_8. DEATH',
               'WCIO Nature of Injury Code_encoded_6. PPD NSL',
               'WCIO Nature of Injury Code_encoded_7. PTD',
               'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
               'WCIO Part Of Body Code_encoded_2. NON-COMP',
               'WCIO Part Of Body Code_encoded_3. MED ONLY',
               'WCIO Part Of Body Code_encoded_4. TEMPORARY',
               'WCIO Part Of Body Code_encoded_1. CANCELLED',
               'WCIO Part Of Body Code_encoded_8. DEATH',
               'WCIO Part Of Body Code_encoded_6. PPD NSL',
               'WCIO Part Of Body Code_encoded_7. PTD']

cat_columns = ['Alternative Dispute Resolution_binary', 
               'COVID-19 Indicator_binary',
               'Attorney/Representative_binary',
               'Carrier Type_Self-insured Private Entity',
               'Carrier Type_Self-insured Public Entity', 
               'Carrier Type_Special Funds',
               'Carrier Type_State Insurance Fund', 
               'C-3 Date_nabinary',
               'First Hearing Date_nabinary', 
               'Accident Date_Season_Spring',
               'Accident Date_Season_Summer', 
               'Accident Date_Season_Winter']


# Create subsets
X_train_num = X_train_encoded[num_columns]
X_train_cat = X_train_encoded[cat_columns]

X_val_num = X_val_encoded[num_columns]
X_val_cat = X_val_encoded[cat_columns]

X_test_num=X_test_encoded[num_columns]
X_test_cat=X_test_encoded[cat_columns]

### 1.1 Smooth

In [6]:
# lambda_smooth = 0.1  # Smoothing parameter
# global_mean = y_train_encoded.mean()  # Global mean of the target variable

# # List of features to smooth
# to_smooth = [
#     'WCIO Part Of Body Code_encoded_8. DEATH',
#     'WCIO Part Of Body Code_encoded_6. PPD NSL',
#     'WCIO Part Of Body Code_encoded_7. PTD',
#     'WCIO Nature of Injury Code_encoded_8. DEATH',
#     'WCIO Nature of Injury Code_encoded_6. PPD NSL',
#     'WCIO Nature of Injury Code_encoded_7. PTD',
#     'WCIO Cause of Injury Code_encoded_8. DEATH',
#     'WCIO Cause of Injury Code_encoded_6. PPD NSL',
#     'WCIO Cause of Injury Code_encoded_7. PTD',
#     'Industry Code_encoded_8. DEATH',
#     'Industry Code_encoded_6. PPD NSL', 
#     'Industry Code_encoded_7. PTD'
# ]

# # Apply smoothing for each column in the to_smooth list
# for column in to_smooth:
#     if column in X_train_num.columns:  # Check if column exists in the DataFrame
#         # Get unique categories in the column
#         unique_categories = X_train_num[column].unique()
        
#         # Apply smoothing for each category
#         for category in unique_categories:
#             # Get the target values for the current category
#             category_target_values = y_train_encoded[X_train_num[column] == category]
            
#             # Compute the category mean (mean target value for this category)
#             category_mean = category_target_values.mean()
            
#             # Number of instances in this category (N_i)
#             N_i = len(category_target_values)
            
#             # Apply the smoothing formula
#             smoothed_value = (N_i * category_mean + lambda_smooth * global_mean) / (N_i + lambda_smooth)
            
#             # Replace the category with the smoothed value using .loc to avoid SettingWithCopyWarning
#             X_train_num.loc[X_train_num[column] == category, column] = smoothed_value
#     else:
#         print(f"Column {column} not found in the DataFrame!")

# # After smoothing, inspect the DataFrame to confirm
# print(X_train_num.head())


Convert y_train dataframe to series

In [7]:
y = y_train.squeeze()  # Converts a DataFrame with one column to a Series

In [8]:
# Percentage distribution of each class in the target variable
class_percentage = y.value_counts(normalize=True) * 100
print(class_percentage)


Claim Injury Type
2. NON-COMP        51.059463
4. TEMPORARY       25.559901
3. MED ONLY        12.014986
5. PPD SCH LOSS     8.460806
1. CANCELLED        2.067928
6. PPD NSL          0.738203
8. DEATH            0.081798
7. PTD              0.016915
Name: proportion, dtype: float64


## 2. Data scaling

### 2.1 Normalization

> Not all models need the variables to be scaled.

In [9]:
scaler = MinMaxScaler().fit(X_train_num)
X_train_num_scaled = scaler.transform(X_train_num)
print("Parameters fitted:")
for feature, min_val, max_val in zip(X_train_num.columns, scaler.data_min_, scaler.data_max_):
    print(f"Variable: {feature} | Min: {min_val} | Max: {max_val}")

# Convert the array to a pandas dataframe
X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns = X_train_num.columns).set_index(X_train_encoded.index)
X_train_num_scaled.describe().round(2)

Parameters fitted:
Variable: Age at Injury | Min: 14.0 | Max: 83.0
Variable: IME-4 Count | Min: 0.0 | Max: 20.0
Variable: Number of Dependents | Min: 0.0 | Max: 6.0
Variable: Days_between_Assembly Date_Accident Date_log | Min: 0.6931471805599453 | Max: 7.371489295214277
Variable: Days_between_C-2 Date_Accident Date_log | Min: 0.6931471805599453 | Max: 7.510430556378006
Variable: Average Weekly Wage_log | Min: 0.6931471805599453 | Max: 8.58443493056553
Variable: Industry Code_encoded_5. PPD SCH LOSS | Min: 0.0365412641010417 | Max: 0.2039048872295498
Variable: Industry Code_encoded_2. NON-COMP | Min: 0.3646337528234915 | Max: 0.6544579016489129
Variable: Industry Code_encoded_3. MED ONLY | Min: 0.0909513480012395 | Max: 0.1584569732937685
Variable: Industry Code_encoded_4. TEMPORARY | Min: 0.1535094119363782 | Max: 0.3952427050200525
Variable: Industry Code_encoded_1. CANCELLED | Min: 0.0120874904067536 | Max: 0.1322957198386701
Variable: Industry Code_encoded_8. DEATH | Min: 4.16835290

Unnamed: 0,Age at Injury,IME-4 Count,Number of Dependents,Days_between_Assembly Date_Accident Date_log,Days_between_C-2 Date_Accident Date_log,Average Weekly Wage_log,Industry Code_encoded_5. PPD SCH LOSS,Industry Code_encoded_2. NON-COMP,Industry Code_encoded_3. MED ONLY,Industry Code_encoded_4. TEMPORARY,Industry Code_encoded_1. CANCELLED,Industry Code_encoded_8. DEATH,Industry Code_encoded_6. PPD NSL,Industry Code_encoded_7. PTD,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,WCIO Cause of Injury Code_encoded_2. NON-COMP,WCIO Cause of Injury Code_encoded_3. MED ONLY,WCIO Cause of Injury Code_encoded_4. TEMPORARY,WCIO Cause of Injury Code_encoded_1. CANCELLED,WCIO Cause of Injury Code_encoded_8. DEATH,WCIO Cause of Injury Code_encoded_6. PPD NSL,WCIO Cause of Injury Code_encoded_7. PTD,WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS,WCIO Nature of Injury Code_encoded_2. NON-COMP,WCIO Nature of Injury Code_encoded_3. MED ONLY,WCIO Nature of Injury Code_encoded_4. TEMPORARY,WCIO Nature of Injury Code_encoded_1. CANCELLED,WCIO Nature of Injury Code_encoded_8. DEATH,WCIO Nature of Injury Code_encoded_6. PPD NSL,WCIO Nature of Injury Code_encoded_7. PTD,WCIO Part Of Body Code_encoded_5. PPD SCH LOSS,WCIO Part Of Body Code_encoded_2. NON-COMP,WCIO Part Of Body Code_encoded_3. MED ONLY,WCIO Part Of Body Code_encoded_4. TEMPORARY,WCIO Part Of Body Code_encoded_1. CANCELLED,WCIO Part Of Body Code_encoded_8. DEATH,WCIO Part Of Body Code_encoded_6. PPD NSL,WCIO Part Of Body Code_encoded_7. PTD
count,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0
mean,0.41,0.04,0.5,0.3,0.3,0.33,0.29,0.5,0.43,0.42,0.07,0.2,0.55,0.26,0.18,0.43,0.51,0.37,0.04,0.01,0.16,0.01,0.17,0.52,0.58,0.45,0.04,0.01,0.28,0.0,0.23,0.34,0.61,0.54,0.04,0.02,0.15,0.03
std,0.2,0.09,0.33,0.16,0.17,0.4,0.34,0.31,0.2,0.25,0.07,0.14,0.19,0.17,0.13,0.18,0.13,0.13,0.15,0.02,0.11,0.02,0.11,0.19,0.15,0.2,0.15,0.03,0.18,0.02,0.19,0.24,0.15,0.19,0.15,0.07,0.16,0.06
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.0,0.17,0.19,0.18,0.0,0.06,0.17,0.27,0.26,0.03,0.12,0.4,0.15,0.13,0.31,0.47,0.31,0.01,0.0,0.06,0.0,0.06,0.41,0.55,0.34,0.02,0.0,0.09,0.0,0.04,0.14,0.57,0.37,0.01,0.0,0.03,0.0
50%,0.41,0.0,0.5,0.27,0.26,0.0,0.12,0.54,0.45,0.46,0.05,0.15,0.63,0.27,0.17,0.38,0.54,0.38,0.02,0.0,0.16,0.01,0.19,0.53,0.6,0.44,0.02,0.0,0.26,0.0,0.22,0.25,0.61,0.58,0.02,0.0,0.1,0.01
75%,0.58,0.0,0.83,0.38,0.4,0.79,0.33,0.74,0.64,0.55,0.09,0.22,0.67,0.3,0.21,0.52,0.56,0.45,0.02,0.0,0.21,0.01,0.25,0.56,0.66,0.57,0.02,0.0,0.44,0.0,0.31,0.54,0.72,0.68,0.03,0.03,0.24,0.03
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
X_val_num_scaled = scaler.transform(X_val_num)
X_val_num_scaled = pd.DataFrame(X_val_num_scaled, columns = X_val_num.columns).set_index(X_val_encoded.index)
X_val_num_scaled.describe().round(2)

Unnamed: 0,Age at Injury,IME-4 Count,Number of Dependents,Days_between_Assembly Date_Accident Date_log,Days_between_C-2 Date_Accident Date_log,Average Weekly Wage_log,Industry Code_encoded_5. PPD SCH LOSS,Industry Code_encoded_2. NON-COMP,Industry Code_encoded_3. MED ONLY,Industry Code_encoded_4. TEMPORARY,Industry Code_encoded_1. CANCELLED,Industry Code_encoded_8. DEATH,Industry Code_encoded_6. PPD NSL,Industry Code_encoded_7. PTD,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,WCIO Cause of Injury Code_encoded_2. NON-COMP,WCIO Cause of Injury Code_encoded_3. MED ONLY,WCIO Cause of Injury Code_encoded_4. TEMPORARY,WCIO Cause of Injury Code_encoded_1. CANCELLED,WCIO Cause of Injury Code_encoded_8. DEATH,WCIO Cause of Injury Code_encoded_6. PPD NSL,WCIO Cause of Injury Code_encoded_7. PTD,WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS,WCIO Nature of Injury Code_encoded_2. NON-COMP,WCIO Nature of Injury Code_encoded_3. MED ONLY,WCIO Nature of Injury Code_encoded_4. TEMPORARY,WCIO Nature of Injury Code_encoded_1. CANCELLED,WCIO Nature of Injury Code_encoded_8. DEATH,WCIO Nature of Injury Code_encoded_6. PPD NSL,WCIO Nature of Injury Code_encoded_7. PTD,WCIO Part Of Body Code_encoded_5. PPD SCH LOSS,WCIO Part Of Body Code_encoded_2. NON-COMP,WCIO Part Of Body Code_encoded_3. MED ONLY,WCIO Part Of Body Code_encoded_4. TEMPORARY,WCIO Part Of Body Code_encoded_1. CANCELLED,WCIO Part Of Body Code_encoded_8. DEATH,WCIO Part Of Body Code_encoded_6. PPD NSL,WCIO Part Of Body Code_encoded_7. PTD
count,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0
mean,0.41,0.04,0.5,0.31,0.3,0.33,0.28,0.5,0.43,0.42,0.07,0.2,0.55,0.26,0.18,0.43,0.5,0.37,0.05,0.01,0.16,0.01,0.17,0.52,0.57,0.45,0.05,0.01,0.28,0.0,0.23,0.34,0.61,0.54,0.05,0.02,0.15,0.03
std,0.2,0.1,0.33,0.18,0.17,0.4,0.34,0.31,0.2,0.25,0.07,0.14,0.19,0.17,0.13,0.18,0.13,0.13,0.16,0.02,0.11,0.02,0.12,0.19,0.15,0.2,0.16,0.03,0.18,0.02,0.19,0.24,0.16,0.19,0.16,0.07,0.16,0.06
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.0,0.17,0.19,0.18,0.0,0.06,0.17,0.27,0.26,0.03,0.12,0.4,0.15,0.13,0.31,0.47,0.31,0.01,0.0,0.05,0.0,0.06,0.41,0.55,0.34,0.02,0.0,0.09,0.0,0.04,0.14,0.55,0.37,0.01,0.0,0.03,0.0
50%,0.41,0.0,0.5,0.27,0.26,0.0,0.12,0.54,0.45,0.46,0.05,0.15,0.63,0.27,0.17,0.38,0.54,0.38,0.02,0.0,0.16,0.01,0.19,0.53,0.6,0.44,0.02,0.0,0.26,0.0,0.22,0.26,0.61,0.58,0.02,0.0,0.1,0.01
75%,0.58,0.0,0.83,0.38,0.41,0.79,0.33,0.74,0.63,0.55,0.09,0.22,0.67,0.3,0.21,0.52,0.56,0.45,0.02,0.0,0.21,0.01,0.25,0.56,0.66,0.57,0.02,0.0,0.44,0.0,0.31,0.54,0.72,0.68,0.03,0.03,0.24,0.03
max,1.0,1.0,1.0,1.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
X_test_num_scaled = scaler.transform(X_test_num)
X_test_num_scaled = pd.DataFrame(X_test_num_scaled, columns = X_test_num.columns).set_index(X_test_encoded.index)
X_test_num_scaled.describe().round(2)

Unnamed: 0,Age at Injury,IME-4 Count,Number of Dependents,Days_between_Assembly Date_Accident Date_log,Days_between_C-2 Date_Accident Date_log,Average Weekly Wage_log,Industry Code_encoded_5. PPD SCH LOSS,Industry Code_encoded_2. NON-COMP,Industry Code_encoded_3. MED ONLY,Industry Code_encoded_4. TEMPORARY,Industry Code_encoded_1. CANCELLED,Industry Code_encoded_8. DEATH,Industry Code_encoded_6. PPD NSL,Industry Code_encoded_7. PTD,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,WCIO Cause of Injury Code_encoded_2. NON-COMP,WCIO Cause of Injury Code_encoded_3. MED ONLY,WCIO Cause of Injury Code_encoded_4. TEMPORARY,WCIO Cause of Injury Code_encoded_1. CANCELLED,WCIO Cause of Injury Code_encoded_8. DEATH,WCIO Cause of Injury Code_encoded_6. PPD NSL,WCIO Cause of Injury Code_encoded_7. PTD,WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS,WCIO Nature of Injury Code_encoded_2. NON-COMP,WCIO Nature of Injury Code_encoded_3. MED ONLY,WCIO Nature of Injury Code_encoded_4. TEMPORARY,WCIO Nature of Injury Code_encoded_1. CANCELLED,WCIO Nature of Injury Code_encoded_8. DEATH,WCIO Nature of Injury Code_encoded_6. PPD NSL,WCIO Nature of Injury Code_encoded_7. PTD,WCIO Part Of Body Code_encoded_5. PPD SCH LOSS,WCIO Part Of Body Code_encoded_2. NON-COMP,WCIO Part Of Body Code_encoded_3. MED ONLY,WCIO Part Of Body Code_encoded_4. TEMPORARY,WCIO Part Of Body Code_encoded_1. CANCELLED,WCIO Part Of Body Code_encoded_8. DEATH,WCIO Part Of Body Code_encoded_6. PPD NSL,WCIO Part Of Body Code_encoded_7. PTD
count,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0
mean,0.4,0.01,0.5,0.29,0.28,0.15,0.26,0.54,0.44,0.4,0.07,0.19,0.54,0.25,0.18,0.44,0.51,0.36,0.04,0.0,0.16,0.01,0.17,0.53,0.58,0.43,0.04,0.01,0.28,0.0,0.23,0.34,0.63,0.54,0.04,0.02,0.15,0.02
std,0.2,0.04,0.33,0.17,0.16,0.32,0.32,0.31,0.19,0.26,0.07,0.13,0.19,0.17,0.12,0.18,0.13,0.14,0.15,0.02,0.11,0.02,0.11,0.19,0.15,0.2,0.15,0.03,0.17,0.02,0.18,0.24,0.15,0.19,0.15,0.05,0.15,0.05
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.23,0.0,0.17,0.16,0.16,0.0,0.06,0.17,0.27,0.26,0.03,0.08,0.4,0.15,0.13,0.31,0.48,0.31,0.01,0.0,0.06,0.0,0.06,0.41,0.56,0.34,0.02,0.0,0.17,0.0,0.05,0.12,0.57,0.37,0.01,0.0,0.03,0.0
50%,0.39,0.0,0.5,0.24,0.25,0.0,0.11,0.62,0.45,0.4,0.05,0.15,0.58,0.22,0.17,0.38,0.54,0.37,0.02,0.0,0.18,0.01,0.19,0.55,0.6,0.44,0.02,0.0,0.26,0.0,0.22,0.25,0.63,0.59,0.02,0.0,0.1,0.01
75%,0.57,0.0,0.83,0.36,0.37,0.0,0.33,0.8,0.64,0.48,0.09,0.22,0.67,0.3,0.2,0.52,0.56,0.45,0.02,0.0,0.21,0.01,0.25,0.56,0.66,0.57,0.02,0.0,0.44,0.0,0.31,0.54,0.72,0.68,0.03,0.03,0.24,0.03
max,1.0,0.9,1.0,1.05,1.02,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### 2.2 Standarization

> We can use both, if we're using algorithms that benefit from using both.  (e.g., neural networks may benefit from normalization, while models like logistic regression may perform better with standardized data).

In [12]:
# from sklearn.preprocessing import StandardScaler

# std_scaler = StandardScaler().fit(X_train_num)
# X_train_scaled_std= scaler.transform(X_train_num)
# X_train_scaled_std = pd.DataFrame(X_train_scaled_std, columns = X_train_num.columns).set_index(X_train_encoded.index)

# X_val_scaled_std = scaler.transform(X_val_num)
# X_val_scaled_std = pd.DataFrame(X_val_scaled_std, columns = X_val_num.columns).set_index(X_val_encoded.index)


In [13]:
# Compute correlation matrix
corr_matrix = X_train_num_scaled.corr().abs()

# Upper triangle of the correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Show the correlation matrix as a plot (optional)
# plt.figure(figsize=(15, 10))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
# plt.title("Correlation Matrix")
# plt.show()

# Find features with correlation greater than 0.9
high_corr_features = [column for column in upper.columns if any(upper[column] > 0.8)]

# Extract feature pairs with correlation values
high_corr_pairs = [(col1, col2, upper.loc[col1, col2]) 
                   for col1 in upper.columns 
                   for col2 in upper.index 
                   if upper.loc[col1, col2] > 0.8]

# Print the pairs of highly correlated features and their correlation values
print("Highly correlated feature pairs:")
for col1, col2, corr_value in high_corr_pairs:
    print(f"{col1} and {col2}: Correlation = {corr_value:.2f}")

print(f"\nDrop features due to high correlation: {high_corr_features}")

Highly correlated feature pairs:
Days_between_Assembly Date_Accident Date_log and Days_between_C-2 Date_Accident Date_log: Correlation = 0.92
WCIO Cause of Injury Code_encoded_2. NON-COMP and WCIO Cause of Injury Code_encoded_4. TEMPORARY: Correlation = 0.80
WCIO Cause of Injury Code_encoded_1. CANCELLED and WCIO Nature of Injury Code_encoded_1. CANCELLED: Correlation = 0.99
WCIO Cause of Injury Code_encoded_1. CANCELLED and WCIO Part Of Body Code_encoded_1. CANCELLED: Correlation = 0.99
WCIO Nature of Injury Code_encoded_2. NON-COMP and WCIO Nature of Injury Code_encoded_4. TEMPORARY: Correlation = 0.86
WCIO Nature of Injury Code_encoded_1. CANCELLED and WCIO Part Of Body Code_encoded_1. CANCELLED: Correlation = 0.99

Drop features due to high correlation: ['Days_between_C-2 Date_Accident Date_log', 'WCIO Cause of Injury Code_encoded_4. TEMPORARY', 'WCIO Nature of Injury Code_encoded_4. TEMPORARY', 'WCIO Nature of Injury Code_encoded_1. CANCELLED', 'WCIO Part Of Body Code_encoded_1. C

## 3. Feature Selection

### 3.1 Categorical: Filter Methods

#### 3.1.1 Chi square and Cramer's V

In [14]:
# import pandas as pd
# import numpy as np
# from sklearn.feature_selection import chi2
# from scipy.stats import chi2_contingency

# def cramers_v(X, y):
#     # Create the contingency table
#     df_contingency = pd.crosstab(X, y)
    
#     # Perform chi-square test
#     chi2, p, dof, expected = chi2_contingency(df_contingency.values)
    
#     # Calculate Cramér's V
#     n = df_contingency.sum().sum()  # Total number of observations
#     min_dim = min(df_contingency.shape) - 1  # Min between number of rows and columns - 1
#     cramers_v = np.sqrt(chi2 / (n * min_dim))  # Cramér's V formula
    
#     return cramers_v

# # Example for X_train_cat (categorical features) and y_train (target)
# # Perform Chi-square test
# chi2_values, p_values = chi2(X_train_cat, y)

# # Create a DataFrame with Chi-square results
# chi2_results = pd.DataFrame({
#     'Column': X_train_cat.columns,
#     'Chi2': chi2_values.round(5),
#     'p-value': p_values.round(5)
# })

# # Calculate Cramér's V for each feature
# cramers_v_values = []
# for var in X_train_cat.columns:
#     v = cramers_v(X_train_cat[var], y)
#     cramers_v_values.append(v)

# # Add Cramér's V results to the DataFrame
# chi2_results['Cramér\'s V'] = cramers_v_values

# # Set threshold for Cramér's V (e.g., 0.1 for weak association, 0.3 for moderate, etc.)
# cramers_v_threshold = 0.25

# # Filter features based on p-value < 0.05 (Chi-square) and strong association (Cramér's V)
# important_features = chi2_results[(chi2_results['p-value'] < 0.05) & (chi2_results['Cramér\'s V'] >= cramers_v_threshold)]

# # Features to consider removing (p-value < 0.05 but weak association)
# remove_features = chi2_results[(chi2_results['p-value'] < 0.05) & (chi2_results['Cramér\'s V'] < cramers_v_threshold)]

# # Features with p-value >= 0.05 (Chi-square) can generally be discarded
# not_important_features = chi2_results[chi2_results['p-value'] >= 0.05]

# # Print results
# print("Important Features (p-value < 0.05 & Cramér's V >= 0.25):")
# print(important_features[['Column', 'Chi2', 'p-value', 'Cramér\'s V']])

# print("\nFeatures to Consider Removing (p-value < 0.05 but weak Cramér's V < 0.25):")
# print(remove_features[['Column', 'Chi2', 'p-value', 'Cramér\'s V']])

# print("\nNot Important Features (p-value >= 0.05):")
# print(not_important_features[['Column', 'Chi2', 'p-value']])


#### 3.1.2 Mutual Information

In [15]:
# import pandas as pd
# from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
# from sklearn.preprocessing import LabelEncoder

# # Assuming X_train_cat is your categorical feature set and y_train is the target
# # For classification problems:
# mi = mutual_info_classif(X_train_cat, y,discrete_features=True)

# # Create a DataFrame with the results
# mi_results = pd.DataFrame({
#     'Feature': X_train_cat.columns,
#     'Mutual Information': mi
# })

# # Sort the features by their mutual information score
# mi_results = mi_results.sort_values(by='Mutual Information', ascending=False)

# # Display the top features based on MI
# print("Mutual Information Results:")
# mi_results


> For the categorical we should look into keeping with:
1. Chi test: p-value < 0.05
2. Cramer's V > 0.1
3. Mutual information: above 0.1

#### 3.1.3 Drop categorical

In [16]:
# List of features to drop from X_train_cat
cat_features_to_drop = [
    'COVID-19 Indicator_binary',
    'Alternative Dispute Resolution_binary',
    'Carrier Type_Self-insured Private Entity',
    'Accident Date_Season_Winter',
    'Carrier Type_Special Funds',
    'Accident Date_Season_Spring',
    'Accident Date_Season_Summer'
]

# Drop features
X_train_cat_filtered = X_train_cat.drop(columns=cat_features_to_drop)
X_val_cat_filtered = X_val_cat.drop(columns=cat_features_to_drop)
X_test_cat_filtered = X_test_cat.drop(columns=cat_features_to_drop)

# Verify the remaining columns
print("Remaining features in X_train_cat:", X_train_cat_filtered.columns.tolist())


Remaining features in X_train_cat: ['Attorney/Representative_binary', 'Carrier Type_Self-insured Public Entity', 'Carrier Type_State Insurance Fund', 'C-3 Date_nabinary', 'First Hearing Date_nabinary']


### 3.2 Numerical: Filter Methods

#### 3.2.1 Univariate

In [17]:
# X_train_num_scaled.var().sort_values(ascending=False)

#### 3.2.2 Spearman Correlation

Spearman correlation because nonlinear and encoded

In [18]:
# # Flatten the correlation matrix and reset the index
# correlation_pairs = cor_spearman.unstack().reset_index()

# # Rename columns for clarity
# correlation_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']

# # Filter the table for correlations > 0.5 or < -0.5 and exclude self-correlations (diagonal)
# strong_correlations = correlation_pairs[
#     ((correlation_pairs['Correlation'] > 0.8) | (correlation_pairs['Correlation'] < -0.8)) &
#     (correlation_pairs['Feature_1'] != correlation_pairs['Feature_2'])
# ]

# # Drop duplicate pairs (keep unique pairs only)
# strong_correlations = strong_correlations.sort_values(by='Correlation', ascending=False).drop_duplicates(subset=['Correlation'])

# # Display the table
# strong_correlations


#### 3.2.3 Mutual Information

In [19]:
# from sklearn.feature_selection import mutual_info_classif

# # Compute mutual information between categorical features and target
# mutual_info = mutual_info_classif(X_train_num_scaled, y_train_encoded)

# # Display features sorted by mutual information
# mi_results = pd.DataFrame({
#     'Feature': X_train_num_scaled.columns,
#     'Mutual Information': mutual_info
# }).sort_values(by='Mutual Information', ascending=False)

# mi_results

#### 3.2.4 Drop numerical

In [20]:
# List of features to drop from X_train_num_scaled
num_features_to_drop = [
    'Industry Code_encoded_1. CANCELLED',
    'Days_between_C-2 Date_Accident Date_log'
]

# Drop features
X_train_num_scaled_filtered = X_train_num_scaled.drop(columns=num_features_to_drop)
X_val_num_scaled_filtered = X_val_num_scaled.drop(columns=num_features_to_drop)
X_test_num_scaled_filtered = X_test_num_scaled.drop(columns=num_features_to_drop)

# Verify the remaining columns
print("Remaining features in X_train_num_scaled:", X_train_num_scaled_filtered.columns.tolist())


Remaining features in X_train_num_scaled: ['Age at Injury', 'IME-4 Count', 'Number of Dependents', 'Days_between_Assembly Date_Accident Date_log', 'Average Weekly Wage_log', 'Industry Code_encoded_5. PPD SCH LOSS', 'Industry Code_encoded_2. NON-COMP', 'Industry Code_encoded_3. MED ONLY', 'Industry Code_encoded_4. TEMPORARY', 'Industry Code_encoded_8. DEATH', 'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD', 'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Cause of Injury Code_encoded_2. NON-COMP', 'WCIO Cause of Injury Code_encoded_3. MED ONLY', 'WCIO Cause of Injury Code_encoded_4. TEMPORARY', 'WCIO Cause of Injury Code_encoded_1. CANCELLED', 'WCIO Cause of Injury Code_encoded_8. DEATH', 'WCIO Cause of Injury Code_encoded_6. PPD NSL', 'WCIO Cause of Injury Code_encoded_7. PTD', 'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Nature of Injury Code_encoded_2. NON-COMP', 'WCIO Nature of Injury Code_encoded_3. MED ONLY', 'WCIO Nature of Injury Code_

### 3.3 Combine the filtered datasets

In [21]:
# Combine the filtered datasets
X_train = pd.concat([X_train_cat_filtered, X_train_num_scaled_filtered], axis=1)
X_val =  pd.concat([X_val_cat_filtered, X_val_num_scaled_filtered], axis=1)
X_test =  pd.concat([X_test_cat_filtered, X_test_num_scaled_filtered], axis=1)


# Verify the shape of the combined dataset
print("Shape of combined X_train:", X_train.shape)


Shape of combined X_train: (396097, 41)


### 3.4 Random forest, LASSO, Ridge

#### 3.4.1 Random Forest

In [22]:
# from sklearn.ensemble import RandomForestClassifier

# # Initialize the Random Forest Classifier
# rf_model = RandomForestClassifier(n_estimators=100, random_state=47)

# # Fit the model on X_train and y_train_encoded
# rf_model.fit(X_train, y_train_encoded)

# # Get feature importances
# importances = rf_model.feature_importances_

# # Sort feature importances in descending order
# important_indices = importances.argsort()[::-1]

# # Print the feature importances
# print("Feature importances:", importances)
# print("Sorted feature indices:", important_indices)


In [28]:
from xgboost import XGBClassifier
import numpy as np

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, random_state=47, use_label_encoder=False, eval_metric="mlogloss")

# Fit the model on X_train and y_train_encoded
xgb_model.fit(X_train, y_train_encoded)

# Get feature importances
importances = xgb_model.feature_importances_

# Sort feature importances in descending order
important_indices = np.argsort(importances)[::-1]

# Print the feature importances
print("Feature importances:", importances)
print("Sorted feature indices:", important_indices)


Parameters: { "use_label_encoder" } are not used.



Feature importances: [0.2123499  0.02225818 0.01692025 0.01232965 0.03693723 0.00355544
 0.03224836 0.00229114 0.00607937 0.3457392  0.02005954 0.00609332
 0.00565921 0.00678314 0.00461601 0.00556071 0.0033216  0.00885369
 0.00374831 0.03942526 0.0041587  0.01120206 0.00347897 0.0037326
 0.00317663 0.00998864 0.0115872  0.01805486 0.01573456 0.02272039
 0.0051197  0.00396704 0.00335895 0.04435305 0.00424112 0.01452704
 0.00512663 0.00407384 0.0061291  0.00763775 0.00280163]
Sorted feature indices: [ 9  0 33 19  4  6 29  1 10 27  2 28 35  3 26 21 25 17 39 13 38 11  8 12
 15 36 30 14 34 20 37 31 18 23  5 22 32 16 24 40  7]


In [29]:
import pandas as pd
from xgboost import XGBClassifier

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, random_state=47, use_label_encoder=False, eval_metric="mlogloss")

# Fit the model on X_train and y_train_encoded
xgb_model.fit(X_train, y_train_encoded)

# Get feature importances
importances = xgb_model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,  # Assumes X_train is a pandas DataFrame
    'Importance': importances
})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the sorted feature names and their importance
print(feature_importance_df)


Parameters: { "use_label_encoder" } are not used.



                                              Feature  Importance
9                             Average Weekly Wage_log    0.345739
0                      Attorney/Representative_binary    0.212350
33     WCIO Part Of Body Code_encoded_5. PPD SCH LOSS    0.044353
19      WCIO Cause of Injury Code_encoded_3. MED ONLY    0.039425
4                         First Hearing Date_nabinary    0.036937
6                                         IME-4 Count    0.032248
29    WCIO Nature of Injury Code_encoded_1. CANCELLED    0.022720
1             Carrier Type_Self-insured Public Entity    0.022258
10              Industry Code_encoded_5. PPD SCH LOSS    0.020060
27     WCIO Nature of Injury Code_encoded_3. MED ONLY    0.018055
2                   Carrier Type_State Insurance Fund    0.016920
28    WCIO Nature of Injury Code_encoded_4. TEMPORARY    0.015735
35         WCIO Part Of Body Code_encoded_3. MED ONLY    0.014527
3                                   C-3 Date_nabinary    0.012330
26     WCI

In [23]:
# import numpy as np

# # Calculate cumulative sum of feature importances
# cumulative_importance = np.cumsum(importances[important_indices])

# # Find the index where cumulative importance exceeds 90%
# threshold = 0.90
# index_90 = np.argmax(cumulative_importance >= threshold)

# # The number of features that explain 90% of the importance
# num_features_90 = index_90 + 1  # Adding 1 since index starts at 0

# print(f"Number of features explaining 90% of cumulative importance: {num_features_90}")

# # Select the top features based on this number
# random_forest_features = X_train.columns[important_indices[:num_features_90]]
# print(f"Selected features that explain 90% of cumulative importance: {random_forest_features}")


In [30]:
import numpy as np
import pandas as pd

# Calculate cumulative sum of feature importances
cumulative_importance = np.cumsum(feature_importance_df['Importance'])

# Find the index where cumulative importance exceeds 90%
threshold = 0.90
index_90 = np.argmax(cumulative_importance >= threshold)

# The number of features that explain 90% of the importance
num_features_90 = index_90 + 1  # Adding 1 since index starts at 0

print(f"Number of features explaining 90% of cumulative importance: {num_features_90}")

# Select the top features based on this number
xgboost_top_features = feature_importance_df['Feature'].iloc[:num_features_90]
print(f"Selected features that explain 90% of cumulative importance:\n{xgboost_top_features}")


Number of features explaining 90% of cumulative importance: 19
Selected features that explain 90% of cumulative importance:
9                               Average Weekly Wage_log
0                        Attorney/Representative_binary
33       WCIO Part Of Body Code_encoded_5. PPD SCH LOSS
19        WCIO Cause of Injury Code_encoded_3. MED ONLY
4                           First Hearing Date_nabinary
6                                           IME-4 Count
29      WCIO Nature of Injury Code_encoded_1. CANCELLED
1               Carrier Type_Self-insured Public Entity
10                Industry Code_encoded_5. PPD SCH LOSS
27       WCIO Nature of Injury Code_encoded_3. MED ONLY
2                     Carrier Type_State Insurance Fund
28      WCIO Nature of Injury Code_encoded_4. TEMPORARY
35           WCIO Part Of Body Code_encoded_3. MED ONLY
3                                     C-3 Date_nabinary
26       WCIO Nature of Injury Code_encoded_2. NON-COMP
21       WCIO Cause of Injury Code_e

#### 3.4.2 Lasso

In [24]:
# from sklearn.linear_model import LassoCV
# lasso = LassoCV(cv=5)  # Cross-validation to select optimal alpha
# lasso.fit(X_train, y_train_encoded)
# lasso_features = X_train.columns[lasso.coef_ != 0]
# print("Selected features by Lasso:", lasso_features)

#### 3.4.3 Ridge

In [25]:
# from sklearn.linear_model import RidgeCV
# import numpy as np

# # Initialize Ridge regression model with cross-validation
# ridge_model = RidgeCV(cv=5)

# # Fit the model on the training data
# ridge_model.fit(X_train, y_train_encoded)

# # Get the feature coefficients
# coefficients = ridge_model.coef_

# # Set a threshold to select features (e.g., absolute coefficient > 0.01)
# threshold = 0.001
# ridge_features = X_train.columns[np.abs(coefficients) > threshold]

# print(f"Selected features using Ridge regression: {ridge_features}")


#### 3.4.4 Combined results

> VOTING: AT LEAST 2 OUT OF 3

In [26]:
random_forest_features = ['Average Weekly Wage_log',
       'Days_between_Assembly Date_Accident Date_log', 'Age at Injury',
       'IME-4 Count', 'Attorney/Representative_binary', 'Number of Dependents',
       'First Hearing Date_nabinary', 'C-3 Date_nabinary',
       'WCIO Cause of Injury Code_encoded_2. NON-COMP',
       'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Nature of Injury Code_encoded_2. NON-COMP',
       'Industry Code_encoded_5. PPD SCH LOSS',
       'WCIO Part Of Body Code_encoded_3. MED ONLY',
       'WCIO Part Of Body Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'Industry Code_encoded_3. MED ONLY',
       'WCIO Part Of Body Code_encoded_6. PPD NSL',
       'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
       'WCIO Part Of Body Code_encoded_1. CANCELLED',
       'WCIO Part Of Body Code_encoded_2. NON-COMP',
       'Industry Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_8. DEATH',
       'Industry Code_encoded_2. NON-COMP',
       'WCIO Cause of Injury Code_encoded_7. PTD',
       'Industry Code_encoded_6. PPD NSL']

lasso_features = ['Attorney/Representative_binary',
       'Carrier Type_Self-insured Public Entity',
       'Carrier Type_State Insurance Fund', 'C-3 Date_nabinary',
       'First Hearing Date_nabinary', 'Age at Injury', 'IME-4 Count',
       'Days_between_Assembly Date_Accident Date_log',
       'Average Weekly Wage_log', 'Industry Code_encoded_5. PPD SCH LOSS',
       'Industry Code_encoded_4. TEMPORARY', 'Industry Code_encoded_8. DEATH',
       'Industry Code_encoded_6. PPD NSL',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_2. NON-COMP',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'WCIO Nature of Injury Code_encoded_2. NON-COMP',
       'WCIO Nature of Injury Code_encoded_3. MED ONLY',
       'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
       'WCIO Nature of Injury Code_encoded_6. PPD NSL',
       'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
       'WCIO Part Of Body Code_encoded_3. MED ONLY',
       'WCIO Part Of Body Code_encoded_6. PPD NSL']

ridge_features = ['Attorney/Representative_binary',
       'Carrier Type_Self-insured Public Entity',
       'Carrier Type_State Insurance Fund', 'C-3 Date_nabinary',
       'First Hearing Date_nabinary', 'Age at Injury', 'IME-4 Count',
       'Days_between_Assembly Date_Accident Date_log',
       'Average Weekly Wage_log', 'Industry Code_encoded_5. PPD SCH LOSS',
       'Industry Code_encoded_2. NON-COMP',
       'Industry Code_encoded_3. MED ONLY',
       'Industry Code_encoded_4. TEMPORARY', 'Industry Code_encoded_8. DEATH',
       'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_2. NON-COMP',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Cause of Injury Code_encoded_8. DEATH',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'WCIO Cause of Injury Code_encoded_7. PTD',
       'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Nature of Injury Code_encoded_2. NON-COMP',
       'WCIO Nature of Injury Code_encoded_3. MED ONLY',
       'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
       'WCIO Nature of Injury Code_encoded_1. CANCELLED',
       'WCIO Nature of Injury Code_encoded_8. DEATH',
       'WCIO Nature of Injury Code_encoded_6. PPD NSL',
       'WCIO Nature of Injury Code_encoded_7. PTD',
       'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
       'WCIO Part Of Body Code_encoded_2. NON-COMP',
       'WCIO Part Of Body Code_encoded_3. MED ONLY',
       'WCIO Part Of Body Code_encoded_4. TEMPORARY',
       'WCIO Part Of Body Code_encoded_1. CANCELLED',
       'WCIO Part Of Body Code_encoded_8. DEATH',
       'WCIO Part Of Body Code_encoded_6. PPD NSL',
       'WCIO Part Of Body Code_encoded_7. PTD']

mutual_info_features = ['Average Weekly Wage_log',
                  'IME-4 Count',
                  'Attorney/Representative_binary',
                  'First Hearing Date_nabinary',
                  'C-3 Date_nabinary',
                  'WCIO Nature of Injury Code_encoded_2. NON-COMP',
                  'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS',
                  'WCIO Nature of Injury Code_encoded_3. MED ONLY',
                  'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
                  'WCIO Part Of Body Code_encoded_3. MED ONLY',
                  'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
                  'WCIO Nature of Injury Code_encoded_1. CANCELLED',
                  'WCIO Part Of Body Code_encoded_2. NON-COMP',
                  'WCIO Nature of Injury Code_encoded_6. PPD NSL',
                  'WCIO Part Of Body Code_encoded_4. TEMPORARY',
                  'WCIO Part Of Body Code_encoded_1. CANCELLED',
                  'WCIO Part Of Body Code_encoded_6. PPD NSL',
                  'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
                  'WCIO Cause of Injury Code_encoded_2. NON-COMP',
                  'WCIO Cause of Injury Code_encoded_3. MED ONLY',
                  'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
                  'WCIO Cause of Injury Code_encoded_1. CANCELLED',
                  'WCIO Nature of Injury Code_encoded_7. PTD',
                  'WCIO Cause of Injury Code_encoded_6. PPD NSL',
                  'WCIO Part Of Body Code_encoded_7. PTD',
                  'WCIO Cause of Injury Code_encoded_7. PTD',
                  'WCIO Nature of Injury Code_encoded_8. DEATH',
                  'WCIO Part Of Body Code_encoded_8. DEATH',
                  'WCIO Cause of Injury Code_encoded_8. DEATH',
                  'Industry Code_encoded_2. NON-COMP',
                  'Industry Code_encoded_4. TEMPORARY',
                  'Industry Code_encoded_5. PPD SCH LOSS',
                  'Industry Code_encoded_3. MED ONLY',
                  'Industry Code_encoded_6. PPD NSL',
                  'Industry Code_encoded_8. DEATH',
                  'Industry Code_encoded_7. PTD',
                  'Days_between_Assembly Date_Accident Date_log',
                  'Age at Injury',
                  'Carrier Type_Self-insured Public Entity']

In [32]:
# Convert all feature sets to sets
rf_set = set(random_forest_features)
lasso_set = set(lasso_features)
ridge_set = set(ridge_features)
mutual_info_set = set(mutual_info_features)
xgboost_set = set(xgboost_top_features)

# Find features selected by at least three methods
final_features_set = (
    (xgboost_set & lasso_set & ridge_set) |
    (xgboost_set & lasso_set & mutual_info_set) |
    (xgboost_set & ridge_set & mutual_info_set) |
    (lasso_set & ridge_set & mutual_info_set) |
    (rf_set & xgboost_set & lasso_set) |
    (rf_set & xgboost_set & ridge_set) |
    (rf_set & xgboost_set & mutual_info_set) |
    (rf_set & lasso_set & ridge_set) |
    (rf_set & lasso_set & mutual_info_set) |
    (rf_set & ridge_set & mutual_info_set)
)


# Convert to sorted list
final_features = sorted(final_features_set)

# Display the final feature set
print(f"Features selected by 3 or more methods ({len(final_features)} features):")
print(final_features)


Features selected by 3 or more methods (35 features):
['Age at Injury', 'Attorney/Representative_binary', 'Average Weekly Wage_log', 'C-3 Date_nabinary', 'Carrier Type_Self-insured Public Entity', 'Carrier Type_State Insurance Fund', 'Days_between_Assembly Date_Accident Date_log', 'First Hearing Date_nabinary', 'IME-4 Count', 'Industry Code_encoded_2. NON-COMP', 'Industry Code_encoded_3. MED ONLY', 'Industry Code_encoded_4. TEMPORARY', 'Industry Code_encoded_5. PPD SCH LOSS', 'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_8. DEATH', 'WCIO Cause of Injury Code_encoded_1. CANCELLED', 'WCIO Cause of Injury Code_encoded_2. NON-COMP', 'WCIO Cause of Injury Code_encoded_3. MED ONLY', 'WCIO Cause of Injury Code_encoded_4. TEMPORARY', 'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Cause of Injury Code_encoded_6. PPD NSL', 'WCIO Cause of Injury Code_encoded_7. PTD', 'WCIO Cause of Injury Code_encoded_8. DEATH', 'WCIO Nature of Injury Code_encoded_1. CANCELLED', 'WCIO Nat

In [43]:
# Put all feature sets in a list
feature_sets = [rf_set, lasso_set, ridge_set, mutual_info_set, xgboost_set]

# Count occurrences of each feature across all sets
from collections import Counter
feature_counts = Counter(feature for feature_set in feature_sets for feature in feature_set)

# Select features appearing in at least 4 sets
final_features_list = [feature for feature, count in feature_counts.items() if count >= 4]

print(f"Final selected features from at least 4 methods:\n {final_features_list}")


Final selected features from at least 4 methods:
 ['WCIO Cause of Injury Code_encoded_3. MED ONLY', 'Age at Injury', 'Industry Code_encoded_6. PPD NSL', 'WCIO Cause of Injury Code_encoded_6. PPD NSL', 'WCIO Cause of Injury Code_encoded_1. CANCELLED', 'IME-4 Count', 'WCIO Part Of Body Code_encoded_3. MED ONLY', 'Attorney/Representative_binary', 'Average Weekly Wage_log', 'Industry Code_encoded_5. PPD SCH LOSS', 'WCIO Nature of Injury Code_encoded_4. TEMPORARY', 'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS', 'Industry Code_encoded_4. TEMPORARY', 'First Hearing Date_nabinary', 'C-3 Date_nabinary', 'WCIO Part Of Body Code_encoded_6. PPD NSL', 'WCIO Cause of Injury Code_encoded_2. NON-COMP', 'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Nature of Injury Code_encoded_2. NON-COMP', 'Days_between_Assembly Date_Accident Date_log', 'Carrier Type_Self-insured Public Entity', 'WCIO Nature of Injury Code_encoded_3. MED ONLY']


In [None]:
# for col in X_train.columns:
#     plt.figure(figsize=(10, 6))
#     for target_class in df['Claim Injury Type'].unique():
#         sns.kdeplot(df[df['Claim Injury Type'] == target_class][col], label=f'Claim Injury Type {target_class}', fill=True)
#     plt.title(f'KDE Plot of {col} for Each Target Class')
#     plt.legend()
#     plt.show()


### 3.5 Final features

In [35]:
X_train = X_train[final_features]
X_val = X_val[final_features]
X_test = X_test[final_features]

TypeError: Passing a set as an indexer is not supported. Use a list instead.

In [298]:
final_features_all

['Age at Injury',
 'Attorney/Representative_binary',
 'Average Weekly Wage_log',
 'C-3 Date_nabinary',
 'Days_between_Assembly Date_Accident Date_log',
 'First Hearing Date_nabinary',
 'IME-4 Count',
 'Industry Code_encoded_4. TEMPORARY',
 'Industry Code_encoded_5. PPD SCH LOSS',
 'Industry Code_encoded_6. PPD NSL',
 'WCIO Cause of Injury Code_encoded_1. CANCELLED',
 'WCIO Cause of Injury Code_encoded_2. NON-COMP',
 'WCIO Cause of Injury Code_encoded_3. MED ONLY',
 'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
 'WCIO Cause of Injury Code_encoded_6. PPD NSL',
 'WCIO Nature of Injury Code_encoded_2. NON-COMP',
 'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
 'WCIO Part Of Body Code_encoded_3. MED ONLY',
 'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
 'WCIO Part Of Body Code_encoded_6. PPD NSL']

## Teste

In [61]:
variaveis = ['Attorney/Representative_binary',
             'Age at Injury',
             'Average Weekly Wage_log',
             'C-3 Date_nabinary',             
             'Carrier Type_Self-insured Public Entity',
             'Carrier Type_State Insurance Fund',
             'Days_between_Assembly Date_Accident Date_log',
             'First Hearing Date_nabinary',
             'IME-4 Count',
            
             'Industry Code_encoded_8. DEATH',
             'Industry Code_encoded_6. PPD NSL',

             'WCIO Cause of Injury Code_encoded_1. CANCELLED',
             'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
             'WCIO Cause of Injury Code_encoded_6. PPD NSL',
             'WCIO Cause of Injury Code_encoded_7. PTD',
             'WCIO Cause of Injury Code_encoded_8. DEATH',

             'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
             'WCIO Nature of Injury Code_encoded_6. PPD NSL',
             
             'WCIO Part Of Body Code_encoded_1. CANCELLED',
             'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
             'WCIO Part Of Body Code_encoded_6. PPD NSL',
             'WCIO Cause of Injury Code_encoded_3. MED ONLY',
             'WCIO Part Of Body Code_encoded_3. MED ONLY',

             
             'WCIO Nature of Injury Code_encoded_3. MED ONLY',
             'Industry Code_encoded_3. MED ONLY',
             'Industry Code_encoded_4. TEMPORARY',
             'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
             'WCIO Part Of Body Code_encoded_4. TEMPORARY'
             
             
             
             ]

In [44]:
variaveis = final_features_list

In [62]:
X_treino = X_train[variaveis]
X_validacao = X_val[variaveis]
X_teste = X_test[variaveis]

In [63]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
from xgboost import XGBClassifier

# Train-test sp
# Define the XGBClassifier
model = XGBClassifier()

# Train the model
print(f"Training XGBClassifier...")
model.fit(X_treino, y_train_encoded)

# Predict on the validation data
y_pred = model.predict(X_validacao)

# Evaluate the model
accuracy = accuracy_score(y_val_encoded, y_pred)
f1_macro = f1_score(y_val_encoded, y_pred, average='macro')  # Macro-averaged F1 score

# Print individual model results
print(f"XGBClassifier Accuracy: {accuracy:.4f}")
print(f"XGBClassifier Macro F1 Score: {f1_macro:.4f}")
print(classification_report(y_val_encoded, y_pred))  # Detailed report including precision, recall, and F1 score per class
print("-" * 50)

# Display the summary of results
print("\nModel Evaluation Results Summary:")
print(f"XGBClassifier: Accuracy = {accuracy:.4f}, Macro F1 Score = {f1_macro:.4f}")


Training XGBClassifier...
XGBClassifier Accuracy: 0.7833
XGBClassifier Macro F1 Score: 0.4165


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.66      0.41      0.51      3741
           1       0.84      0.98      0.90     87319
           2       0.50      0.07      0.12     20672
           3       0.72      0.85      0.78     44551
           4       0.67      0.58      0.62     14484
           5       0.15      0.00      0.00      1263
           6       0.00      0.00      0.00        29
           7       0.53      0.30      0.39       141

    accuracy                           0.78    172200
   macro avg       0.51      0.40      0.42    172200
weighted avg       0.75      0.78      0.74    172200

--------------------------------------------------

Model Evaluation Results Summary:
XGBClassifier: Accuracy = 0.7833, Macro F1 Score = 0.4165


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [92]:
y.unique()

array(['5. PPD SCH LOSS', '2. NON-COMP', '3. MED ONLY', '4. TEMPORARY',
       '1. CANCELLED', '8. DEATH', '6. PPD NSL', '7. PTD'], dtype=object)

## SMOTE

In [47]:
# Assuming `y` is your target variable
class_distribution = y_train_encoded.value_counts()
class_percentages = y_train_encoded.value_counts(normalize=True) * 100

# Combine into a single DataFrame for better visualization
distribution_df = pd.DataFrame({
    'Class': class_distribution.index,
    'Count': class_distribution.values,
    'Percentage (%)': class_percentages.values
})
print(distribution_df)


   Class   Count  Percentage (%)
0      1  202245       51.059463
1      3  101242       25.559901
2      2   47591       12.014986
3      4   33513        8.460806
4      0    8191        2.067928
5      5    2924        0.738203
6      7     324        0.081798
7      6      67        0.016915


In [82]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler



# Define undersampling strategy based on the actual class distribution
undersample_strategy = {
    1: 20224,
    3: 101242,
    2: 47591,
    4: 33513,
    0: 8191,
    5: 2924,
    7: 324,
    6:67
}

# Apply undersampling to majority classes
undersampler = RandomUnderSampler(sampling_strategy=undersample_strategy, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X_treino, y_train_encoded)

# # Define SMOTE strategy for oversampling the minority classes
# smote = SMOTE(sampling_strategy={
#     7:807,

#     6: 670,   # Oversample class '5' to 3,000
# }, random_state=42)

# # Apply SMOTE to oversample minority classes
# X_resampled, y_resampled = smote.fit_resample(X_resampled, y_resampled)

# Check the new class distribution after resampling
unique_classes_final, class_counts_final = np.unique(y_resampled, return_counts=True)
class_distribution_final = dict(zip(unique_classes_final, class_counts_final))
print("New Class Distribution after Resampling:", class_distribution_final)

New Class Distribution after Resampling: {0: 8191, 1: 20224, 2: 47591, 3: 20248, 4: 33513, 5: 2924, 6: 67, 7: 324}


In [83]:
weights = np.ones(len(y_resampled))
weights

array([1., 1., 1., ..., 1., 1., 1.])

In [87]:
majority_class_label_1 = 1  # Replace with your actual majority class label
majority_class_label_3 = 3
minority_class_label_6 = 6
minority_class_label_7 = 7


weights[y_resampled == majority_class_label_1] = 10
weights[y_resampled == majority_class_label_3] = 5
weights[y_resampled == minority_class_label_6] =2
weights[y_resampled == minority_class_label_7] = 2

weights

array([1., 1., 1., ..., 2., 2., 2.])

In [88]:
import xgboost as xgb

model = xgb.XGBClassifier(eval_metric="mlogloss")
model.fit(X_resampled, y_resampled, sample_weight=weights)

# 3. Make predictions
y_pred = model.predict(X_validacao)  # Assuming X_test is available

In [89]:

# Evaluate the model
accuracy = accuracy_score(y_val_encoded, y_pred)
f1_macro = f1_score(y_val_encoded, y_pred, average='macro')  # Macro-averaged F1 score

# Print individual model results
print(f"XGBClassifier Accuracy: {accuracy:.4f}")
print(f"XGBClassifier Macro F1 Score: {f1_macro:.4f}")
print(classification_report(y_val_encoded, y_pred))  # Detailed report including precision, recall, and F1 score per class
print("-" * 50)

# Display the summary of results
print("\nModel Evaluation Results Summary:")
print(f"XGBClassifier: Accuracy = {accuracy:.4f}, Macro F1 Score = {f1_macro:.4f}")


XGBClassifier Accuracy: 0.7758
XGBClassifier Macro F1 Score: 0.4306
              precision    recall  f1-score   support

           0       0.55      0.48      0.51      3741
           1       0.85      0.96      0.90     87319
           2       0.40      0.11      0.17     20672
           3       0.73      0.83      0.77     44551
           4       0.64      0.62      0.63     14484
           5       0.13      0.01      0.01      1263
           6       0.00      0.00      0.00        29
           7       0.43      0.45      0.44       141

    accuracy                           0.78    172200
   macro avg       0.47      0.43      0.43    172200
weighted avg       0.73      0.78      0.74    172200

--------------------------------------------------

Model Evaluation Results Summary:
XGBClassifier: Accuracy = 0.7758, Macro F1 Score = 0.4306


In [109]:
# from sklearn.metrics import accuracy_score, classification_report, f1_score
# from xgboost import XGBClassifier

# # Train-test sp
# # Define the XGBClassifier
# model = XGBClassifier()

# # Train the model
# print(f"Training XGBClassifier...")
# model.fit(X_resampled, y_resampled)

# # Predict on the validation data
# y_pred = model.predict(X_validacao)

# # Evaluate the model
# accuracy = accuracy_score(y_val_encoded, y_pred)
# f1_macro = f1_score(y_val_encoded, y_pred, average='macro')  # Macro-averaged F1 score

# # Print individual model results
# print(f"XGBClassifier Accuracy: {accuracy:.4f}")
# print(f"XGBClassifier Macro F1 Score: {f1_macro:.4f}")
# print(classification_report(y_val_encoded, y_pred))  # Detailed report including precision, recall, and F1 score per class
# print("-" * 50)

# # Display the summary of results
# print("\nModel Evaluation Results Summary:")
# print(f"XGBClassifier: Accuracy = {accuracy:.4f}, Macro F1 Score = {f1_macro:.4f}")


Training XGBClassifier...
XGBClassifier Accuracy: 0.7840
XGBClassifier Macro F1 Score: 0.4176
              precision    recall  f1-score   support

           0       0.65      0.41      0.51      3741
           1       0.84      0.98      0.90     87319
           2       0.51      0.07      0.13     20672
           3       0.72      0.85      0.78     44551
           4       0.67      0.59      0.63     14484
           5       0.07      0.00      0.00      1263
           6       0.00      0.00      0.00        29
           7       0.56      0.30      0.39       141

    accuracy                           0.78    172200
   macro avg       0.50      0.40      0.42    172200
weighted avg       0.75      0.78      0.74    172200

--------------------------------------------------

Model Evaluation Results Summary:
XGBClassifier: Accuracy = 0.7840, Macro F1 Score = 0.4176


In [320]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Define base models (XGBoost, Random Forest, Gradient Boosting, and CatBoost)
estimators = [
    ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="mlogloss")),   
    ('catboost', CatBoostClassifier(iterations=1000, random_state=42, verbose=100, early_stopping_rounds=50))  # CatBoost model
]

# Define final estimator (Logistic Regression in this case)
final_estimator = LogisticRegression(max_iter=500,random_state=42)

# Create the Stacking Classifier model
stacking_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, n_jobs=-1)

# Train the stacking model
stacking_model.fit(X_resampled, y_resampled, sample_weight=weights)

# Predict on validation set and evaluate performance
y_pred = stacking_model.predict(X_validacao)
print(classification_report(y_val_encoded, y_pred))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.64      0.41      0.50      3741
           1       0.85      0.97      0.91     87319
           2       0.42      0.09      0.15     20672
           3       0.72      0.86      0.78     44551
           4       0.68      0.57      0.62     14484
           5       0.16      0.01      0.01      1263
           6       0.00      0.00      0.00        29
           7       0.61      0.26      0.36       141

    accuracy                           0.78    172200
   macro avg       0.51      0.40      0.42    172200
weighted avg       0.74      0.78      0.74    172200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## APPLY TO TEST

In [69]:
y_test_pred = model.predict(X_teste)

In [70]:
# Analyze the distribution of predicted classes
predictions_df = pd.DataFrame(y_test_pred, columns=["Predicted Label"])
print("Predicted Class Distribution:")
print(predictions_df["Predicted Label"].value_counts(normalize=True))


Predicted Class Distribution:
Predicted Label
1    0.799647
3    0.125062
2    0.035451
4    0.020383
0    0.018968
7    0.000438
5    0.000052
Name: proportion, dtype: float64


In [71]:
X_previsao = X_test.copy()

In [72]:
X_previsao.loc[:, 'Predicted_Claim_Injury_Type'] = y_test_pred

In [73]:
claim_injury_type_mapping = {
    '4. TEMPORARY': 3,
    '2. NON-COMP': 1,
    '5. PPD SCH LOSS': 4,
    '3. MED ONLY': 2,
    '6. PPD NSL': 5,
    '1. CANCELLED': 0,
    '8. DEATH': 7,
    '7. PTD': 6
}

# Reverse the mapping
reverse_claim_injury_type_mapping = {v: k for k, v in claim_injury_type_mapping.items()}

# Use the reversed mapping to get the original labels
X_previsao['Predicted_Claim_Injury_Type'] = X_previsao['Predicted_Claim_Injury_Type'].map(reverse_claim_injury_type_mapping)


In [74]:
X_previsao[['Predicted_Claim_Injury_Type']].to_csv('test_predictions.csv')


## 4. Model building

> Não está em 1-D o y

In [75]:
# y_train = y_train_encoded
# y_val = y_val_encoded

In [60]:
# from sklearn.metrics import accuracy_score, classification_report, f1_score
# from sklearn.model_selection import train_test_split

# # Define the models
# models = {
#     "MLPClassifier": MLPClassifier(max_iter=500),
#     "RandomForest": RandomForestClassifier(),
#     "LogisticRegression": LogisticRegression(max_iter=500),
#     "KNeighborsClassifier": KNeighborsClassifier(),
#     "DecisionTree": DecisionTreeClassifier(),
#     "GaussianNB": GaussianNB(),
#     "RidgeClassifier": RidgeClassifier()
# }

# # Dictionary to store evaluation results
# results = {}

# # Iterate over each model
# for model_name, model in models.items():
#     print(f"Training {model_name}...")
#     # Fit the model on the training data
#     model.fit(X_train, y_train)
    
#     # Predict on the validation data
#     y_pred = model.predict(X_val)
    
#     # Evaluate the model
#     accuracy = accuracy_score(y_val, y_pred)
#     f1_macro = f1_score(y_val, y_pred, average='macro')  # Macro-averaged F1 score
    
#     # Store results
#     results[model_name] = {
#         "accuracy": accuracy,
#         "f1_macro": f1_macro
#     }
    
#     # Print individual model results
#     print(f"{model_name} Accuracy: {accuracy:.4f}")
#     print(f"{model_name} Macro F1 Score: {f1_macro:.4f}")
#     print(classification_report(y_val, y_pred))  # Detailed report including precision, recall, and F1 score per class
#     print("-" * 50)

# # Display the summary of results
# print("\nModel Evaluation Results Summary:")
# for model_name, metrics in results.items():
#     print(f"{model_name}: Accuracy = {metrics['accuracy']:.4f}, Macro F1 Score = {metrics['f1_macro']:.4f}")
