## Imports

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Scaling
from sklearn.preprocessing import MinMaxScaler

 #Correlation Heatmap
from matplotlib.colors import LinearSegmentedColormap

#Statistical Test
from scipy import stats
from sklearn.impute import SimpleImputer

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import Ridge

# Never truncate the column list when a wide DataFrame is displayed.
pd.options.display.max_columns = None

In [56]:
# The CSV files can be downloaded from the source linked above the table of contents.
# Every file is read with its first column as the row index.

# Feature frames: train / validation / test
X_train_encoded = pd.read_csv('../project_data/X_train_encoded.csv', sep=',', index_col=0)
X_val_encoded = pd.read_csv('../project_data/X_val_encoded.csv', sep=',', index_col=0)
X_test_encoded = pd.read_csv('../project_data/X_test_encoded.csv', index_col=0)

# Target frames: train / validation (no target file is read for the test split)
y_train = pd.read_csv('../project_data/y_train.csv', sep=',', index_col=0)
y_val = pd.read_csv('../project_data/y_val.csv', sep=',', index_col=0)

## 1. Separate numerical and categorical

In [57]:
# Every target label starts with its class number ("4. TEMPORARY" -> 4).
# The integer code is that number minus one, which gives 0-based class codes.
claim_injury_type_mapping = {
    label: int(label.split('.')[0]) - 1
    for label in (
        '4. TEMPORARY',
        '2. NON-COMP',
        '5. PPD SCH LOSS',
        '3. MED ONLY',
        '6. PPD NSL',
        '1. CANCELLED',
        '8. DEATH',
        '7. PTD',
    )
}

# Translate the 'Claim Injury Type' labels of both target frames into integer codes.
# Series.map leaves NaN for any label that is missing from the mapping.
y_train_encoded, y_val_encoded = (
    targets['Claim Injury Type'].map(claim_injury_type_mapping)
    for targets in (y_train, y_val)
)

> Separate the columns into numerical and categorical

In [58]:
# Numerical features: six base columns followed by, for each coded variable,
# one '<variable>_encoded_<class label>' column per target class.
# The generated order (variable by variable, labels in the order below) matches
# the column order used throughout the notebook.
num_columns = [
    'Age at Injury',
    'IME-4 Count',
    'Number of Dependents',
    'Days_between_Assembly Date_Accident Date_log',
    'Days_between_C-2 Date_Accident Date_log',
    'Average Weekly Wage_log',
] + [
    f'{variable}_encoded_{label}'
    for variable in (
        'Industry Code',
        'WCIO Cause of Injury Code',
        'WCIO Nature of Injury Code',
        'WCIO Part Of Body Code',
    )
    for label in (
        '5. PPD SCH LOSS',
        '2. NON-COMP',
        '3. MED ONLY',
        '4. TEMPORARY',
        '1. CANCELLED',
        '8. DEATH',
        '6. PPD NSL',
        '7. PTD',
    )
]

# Columns handled as categorical features (indicator-style columns, judging by their names).
cat_columns = [
    # binary flags
    'Alternative Dispute Resolution_binary',
    'COVID-19 Indicator_binary',
    'Attorney/Representative_binary',
    # carrier type
    'Carrier Type_Self-insured Private Entity',
    'Carrier Type_Self-insured Public Entity',
    'Carrier Type_Special Funds',
    'Carrier Type_State Insurance Fund',
    # "nabinary" date flags
    'C-3 Date_nabinary',
    'First Hearing Date_nabinary',
    # accident season
    'Accident Date_Season_Spring',
    'Accident Date_Season_Summer',
    'Accident Date_Season_Winter',
]


# Split every encoded frame (train / validation / test) into its numerical and
# categorical column subsets, using the same two column lists for all three.
X_train_num, X_train_cat = X_train_encoded[num_columns], X_train_encoded[cat_columns]
X_val_num, X_val_cat = X_val_encoded[num_columns], X_val_encoded[cat_columns]
X_test_num, X_test_cat = X_test_encoded[num_columns], X_test_encoded[cat_columns]

### 1.1 Smooth

In [59]:
# lambda_smooth = 0.1  # Smoothing parameter
# global_mean = y_train_encoded.mean()  # Global mean of the target variable

# # List of features to smooth
# to_smooth = [
#     'WCIO Part Of Body Code_encoded_8. DEATH',
#     'WCIO Part Of Body Code_encoded_6. PPD NSL',
#     'WCIO Part Of Body Code_encoded_7. PTD',
#     'WCIO Nature of Injury Code_encoded_8. DEATH',
#     'WCIO Nature of Injury Code_encoded_6. PPD NSL',
#     'WCIO Nature of Injury Code_encoded_7. PTD',
#     'WCIO Cause of Injury Code_encoded_8. DEATH',
#     'WCIO Cause of Injury Code_encoded_6. PPD NSL',
#     'WCIO Cause of Injury Code_encoded_7. PTD',
#     'Industry Code_encoded_8. DEATH',
#     'Industry Code_encoded_6. PPD NSL', 
#     'Industry Code_encoded_7. PTD'
# ]

# # Apply smoothing for each column in the to_smooth list
# for column in to_smooth:
#     if column in X_train_num.columns:  # Check if column exists in the DataFrame
#         # Get unique categories in the column
#         unique_categories = X_train_num[column].unique()
        
#         # Apply smoothing for each category
#         for category in unique_categories:
#             # Get the target values for the current category
#             category_target_values = y_train_encoded[X_train_num[column] == category]
            
#             # Compute the category mean (mean target value for this category)
#             category_mean = category_target_values.mean()
            
#             # Number of instances in this category (N_i)
#             N_i = len(category_target_values)
            
#             # Apply the smoothing formula
#             smoothed_value = (N_i * category_mean + lambda_smooth * global_mean) / (N_i + lambda_smooth)
            
#             # Replace the category with the smoothed value using .loc to avoid SettingWithCopyWarning
#             X_train_num.loc[X_train_num[column] == category, column] = smoothed_value
#     else:
#         print(f"Column {column} not found in the DataFrame!")

# # After smoothing, inspect the DataFrame to confirm
# print(X_train_num.head())


Convert the `y_train` DataFrame to a Series

In [60]:
# Take the target as a Series by selecting its column by name (the same column
# the label encoding uses). Unlike DataFrame.squeeze(), this always yields a
# Series: squeeze() returns a scalar for a one-row frame and leaves the
# DataFrame untouched if it ever holds more than one column.
y = y_train['Claim Injury Type']

In [61]:
# Relative frequency of every target class, expressed in percent
class_percentage = y.value_counts(normalize=True).mul(100)
print(class_percentage)


Claim Injury Type
2. NON-COMP        51.059463
4. TEMPORARY       25.559901
3. MED ONLY        12.014986
5. PPD SCH LOSS     8.460806
1. CANCELLED        2.067928
6. PPD NSL          0.738203
8. DEATH            0.081798
7. PTD              0.016915
Name: proportion, dtype: float64


## 2. Data scaling

### 2.1 Normalization

> Not all models need the variables to be scaled.

In [62]:
# Fit the min-max scaler on the training numerical features only.
scaler = MinMaxScaler()
scaler.fit(X_train_num)

# Report the per-feature minimum and maximum the scaler learned.
print("Parameters fitted:")
for position, feature in enumerate(X_train_num.columns):
    min_val, max_val = scaler.data_min_[position], scaler.data_max_[position]
    print(f"Variable: {feature} | Min: {min_val} | Max: {max_val}")

# Scale the training features and wrap the resulting array in a DataFrame that
# keeps the original column names and row index.
X_train_num_scaled = pd.DataFrame(
    scaler.transform(X_train_num),
    columns=X_train_num.columns,
    index=X_train_encoded.index,
)
X_train_num_scaled.describe().round(2)

Parameters fitted:
Variable: Age at Injury | Min: 14.0 | Max: 83.0
Variable: IME-4 Count | Min: 0.0 | Max: 20.0
Variable: Number of Dependents | Min: 0.0 | Max: 6.0
Variable: Days_between_Assembly Date_Accident Date_log | Min: 0.6931471805599453 | Max: 7.371489295214277
Variable: Days_between_C-2 Date_Accident Date_log | Min: 0.6931471805599453 | Max: 7.510430556378006
Variable: Average Weekly Wage_log | Min: 0.6931471805599453 | Max: 8.58443493056553
Variable: Industry Code_encoded_5. PPD SCH LOSS | Min: 0.0365412641010417 | Max: 0.2039048872295498
Variable: Industry Code_encoded_2. NON-COMP | Min: 0.3646337528234915 | Max: 0.6544579016489129
Variable: Industry Code_encoded_3. MED ONLY | Min: 0.0909513480012395 | Max: 0.1584569732937685
Variable: Industry Code_encoded_4. TEMPORARY | Min: 0.1535094119363782 | Max: 0.3952427050200525
Variable: Industry Code_encoded_1. CANCELLED | Min: 0.0120874904067536 | Max: 0.1322957198386701
Variable: Industry Code_encoded_8. DEATH | Min: 4.16835290

Unnamed: 0,Age at Injury,IME-4 Count,Number of Dependents,Days_between_Assembly Date_Accident Date_log,Days_between_C-2 Date_Accident Date_log,Average Weekly Wage_log,Industry Code_encoded_5. PPD SCH LOSS,Industry Code_encoded_2. NON-COMP,Industry Code_encoded_3. MED ONLY,Industry Code_encoded_4. TEMPORARY,Industry Code_encoded_1. CANCELLED,Industry Code_encoded_8. DEATH,Industry Code_encoded_6. PPD NSL,Industry Code_encoded_7. PTD,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,WCIO Cause of Injury Code_encoded_2. NON-COMP,WCIO Cause of Injury Code_encoded_3. MED ONLY,WCIO Cause of Injury Code_encoded_4. TEMPORARY,WCIO Cause of Injury Code_encoded_1. CANCELLED,WCIO Cause of Injury Code_encoded_8. DEATH,WCIO Cause of Injury Code_encoded_6. PPD NSL,WCIO Cause of Injury Code_encoded_7. PTD,WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS,WCIO Nature of Injury Code_encoded_2. NON-COMP,WCIO Nature of Injury Code_encoded_3. MED ONLY,WCIO Nature of Injury Code_encoded_4. TEMPORARY,WCIO Nature of Injury Code_encoded_1. CANCELLED,WCIO Nature of Injury Code_encoded_8. DEATH,WCIO Nature of Injury Code_encoded_6. PPD NSL,WCIO Nature of Injury Code_encoded_7. PTD,WCIO Part Of Body Code_encoded_5. PPD SCH LOSS,WCIO Part Of Body Code_encoded_2. NON-COMP,WCIO Part Of Body Code_encoded_3. MED ONLY,WCIO Part Of Body Code_encoded_4. TEMPORARY,WCIO Part Of Body Code_encoded_1. CANCELLED,WCIO Part Of Body Code_encoded_8. DEATH,WCIO Part Of Body Code_encoded_6. PPD NSL,WCIO Part Of Body Code_encoded_7. PTD
count,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0,396097.0
mean,0.41,0.04,0.5,0.3,0.3,0.33,0.29,0.5,0.43,0.42,0.07,0.2,0.55,0.26,0.18,0.43,0.51,0.37,0.04,0.01,0.16,0.01,0.17,0.52,0.58,0.45,0.04,0.01,0.28,0.0,0.23,0.34,0.61,0.54,0.04,0.02,0.15,0.03
std,0.2,0.09,0.33,0.16,0.17,0.4,0.34,0.31,0.2,0.25,0.07,0.14,0.19,0.17,0.13,0.18,0.13,0.13,0.15,0.02,0.11,0.02,0.11,0.19,0.15,0.2,0.15,0.03,0.18,0.02,0.19,0.24,0.15,0.19,0.15,0.07,0.16,0.06
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.0,0.17,0.19,0.18,0.0,0.06,0.17,0.27,0.26,0.03,0.12,0.4,0.15,0.13,0.31,0.47,0.31,0.01,0.0,0.06,0.0,0.06,0.41,0.55,0.34,0.02,0.0,0.09,0.0,0.04,0.14,0.57,0.37,0.01,0.0,0.03,0.0
50%,0.41,0.0,0.5,0.27,0.26,0.0,0.12,0.54,0.45,0.46,0.05,0.15,0.63,0.27,0.17,0.38,0.54,0.38,0.02,0.0,0.16,0.01,0.19,0.53,0.6,0.44,0.02,0.0,0.26,0.0,0.22,0.25,0.61,0.58,0.02,0.0,0.1,0.01
75%,0.58,0.0,0.83,0.38,0.4,0.79,0.33,0.74,0.64,0.55,0.09,0.22,0.67,0.3,0.21,0.52,0.56,0.45,0.02,0.0,0.21,0.01,0.25,0.56,0.66,0.57,0.02,0.0,0.44,0.0,0.31,0.54,0.72,0.68,0.03,0.03,0.24,0.03
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [63]:
# Apply the scaler fitted on the training data to the validation features.
# It is not refitted, so values outside the training range fall outside [0, 1].
X_val_num_scaled = pd.DataFrame(
    scaler.transform(X_val_num),
    columns=X_val_num.columns,
    index=X_val_encoded.index,
)
X_val_num_scaled.describe().round(2)

Unnamed: 0,Age at Injury,IME-4 Count,Number of Dependents,Days_between_Assembly Date_Accident Date_log,Days_between_C-2 Date_Accident Date_log,Average Weekly Wage_log,Industry Code_encoded_5. PPD SCH LOSS,Industry Code_encoded_2. NON-COMP,Industry Code_encoded_3. MED ONLY,Industry Code_encoded_4. TEMPORARY,Industry Code_encoded_1. CANCELLED,Industry Code_encoded_8. DEATH,Industry Code_encoded_6. PPD NSL,Industry Code_encoded_7. PTD,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,WCIO Cause of Injury Code_encoded_2. NON-COMP,WCIO Cause of Injury Code_encoded_3. MED ONLY,WCIO Cause of Injury Code_encoded_4. TEMPORARY,WCIO Cause of Injury Code_encoded_1. CANCELLED,WCIO Cause of Injury Code_encoded_8. DEATH,WCIO Cause of Injury Code_encoded_6. PPD NSL,WCIO Cause of Injury Code_encoded_7. PTD,WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS,WCIO Nature of Injury Code_encoded_2. NON-COMP,WCIO Nature of Injury Code_encoded_3. MED ONLY,WCIO Nature of Injury Code_encoded_4. TEMPORARY,WCIO Nature of Injury Code_encoded_1. CANCELLED,WCIO Nature of Injury Code_encoded_8. DEATH,WCIO Nature of Injury Code_encoded_6. PPD NSL,WCIO Nature of Injury Code_encoded_7. PTD,WCIO Part Of Body Code_encoded_5. PPD SCH LOSS,WCIO Part Of Body Code_encoded_2. NON-COMP,WCIO Part Of Body Code_encoded_3. MED ONLY,WCIO Part Of Body Code_encoded_4. TEMPORARY,WCIO Part Of Body Code_encoded_1. CANCELLED,WCIO Part Of Body Code_encoded_8. DEATH,WCIO Part Of Body Code_encoded_6. PPD NSL,WCIO Part Of Body Code_encoded_7. PTD
count,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0,172200.0
mean,0.41,0.04,0.5,0.31,0.3,0.33,0.28,0.5,0.43,0.42,0.07,0.2,0.55,0.26,0.18,0.43,0.5,0.37,0.05,0.01,0.16,0.01,0.17,0.52,0.57,0.45,0.05,0.01,0.28,0.0,0.23,0.34,0.61,0.54,0.05,0.02,0.15,0.03
std,0.2,0.1,0.33,0.18,0.17,0.4,0.34,0.31,0.2,0.25,0.07,0.14,0.19,0.17,0.13,0.18,0.13,0.13,0.16,0.02,0.11,0.02,0.12,0.19,0.15,0.2,0.16,0.03,0.18,0.02,0.19,0.24,0.16,0.19,0.16,0.07,0.16,0.06
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.0,0.17,0.19,0.18,0.0,0.06,0.17,0.27,0.26,0.03,0.12,0.4,0.15,0.13,0.31,0.47,0.31,0.01,0.0,0.05,0.0,0.06,0.41,0.55,0.34,0.02,0.0,0.09,0.0,0.04,0.14,0.55,0.37,0.01,0.0,0.03,0.0
50%,0.41,0.0,0.5,0.27,0.26,0.0,0.12,0.54,0.45,0.46,0.05,0.15,0.63,0.27,0.17,0.38,0.54,0.38,0.02,0.0,0.16,0.01,0.19,0.53,0.6,0.44,0.02,0.0,0.26,0.0,0.22,0.26,0.61,0.58,0.02,0.0,0.1,0.01
75%,0.58,0.0,0.83,0.38,0.41,0.79,0.33,0.74,0.63,0.55,0.09,0.22,0.67,0.3,0.21,0.52,0.56,0.45,0.02,0.0,0.21,0.01,0.25,0.56,0.66,0.57,0.02,0.0,0.44,0.0,0.31,0.54,0.72,0.68,0.03,0.03,0.24,0.03
max,1.0,1.0,1.0,1.01,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [64]:
# Apply the scaler fitted on the training data to the test features.
# It is not refitted, so values outside the training range fall outside [0, 1].
X_test_num_scaled = pd.DataFrame(
    scaler.transform(X_test_num),
    columns=X_test_num.columns,
    index=X_test_encoded.index,
)
X_test_num_scaled.describe().round(2)

Unnamed: 0,Age at Injury,IME-4 Count,Number of Dependents,Days_between_Assembly Date_Accident Date_log,Days_between_C-2 Date_Accident Date_log,Average Weekly Wage_log,Industry Code_encoded_5. PPD SCH LOSS,Industry Code_encoded_2. NON-COMP,Industry Code_encoded_3. MED ONLY,Industry Code_encoded_4. TEMPORARY,Industry Code_encoded_1. CANCELLED,Industry Code_encoded_8. DEATH,Industry Code_encoded_6. PPD NSL,Industry Code_encoded_7. PTD,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,WCIO Cause of Injury Code_encoded_2. NON-COMP,WCIO Cause of Injury Code_encoded_3. MED ONLY,WCIO Cause of Injury Code_encoded_4. TEMPORARY,WCIO Cause of Injury Code_encoded_1. CANCELLED,WCIO Cause of Injury Code_encoded_8. DEATH,WCIO Cause of Injury Code_encoded_6. PPD NSL,WCIO Cause of Injury Code_encoded_7. PTD,WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS,WCIO Nature of Injury Code_encoded_2. NON-COMP,WCIO Nature of Injury Code_encoded_3. MED ONLY,WCIO Nature of Injury Code_encoded_4. TEMPORARY,WCIO Nature of Injury Code_encoded_1. CANCELLED,WCIO Nature of Injury Code_encoded_8. DEATH,WCIO Nature of Injury Code_encoded_6. PPD NSL,WCIO Nature of Injury Code_encoded_7. PTD,WCIO Part Of Body Code_encoded_5. PPD SCH LOSS,WCIO Part Of Body Code_encoded_2. NON-COMP,WCIO Part Of Body Code_encoded_3. MED ONLY,WCIO Part Of Body Code_encoded_4. TEMPORARY,WCIO Part Of Body Code_encoded_1. CANCELLED,WCIO Part Of Body Code_encoded_8. DEATH,WCIO Part Of Body Code_encoded_6. PPD NSL,WCIO Part Of Body Code_encoded_7. PTD
count,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0
mean,0.4,0.01,0.5,0.29,0.28,0.15,0.26,0.54,0.44,0.4,0.07,0.19,0.54,0.25,0.18,0.44,0.51,0.36,0.04,0.0,0.16,0.01,0.17,0.53,0.58,0.43,0.04,0.01,0.28,0.0,0.23,0.34,0.63,0.54,0.04,0.02,0.15,0.02
std,0.2,0.04,0.33,0.17,0.16,0.32,0.32,0.31,0.19,0.26,0.07,0.13,0.19,0.17,0.12,0.18,0.13,0.14,0.15,0.02,0.11,0.02,0.11,0.19,0.15,0.2,0.15,0.03,0.17,0.02,0.18,0.24,0.15,0.19,0.15,0.05,0.15,0.05
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.23,0.0,0.17,0.16,0.16,0.0,0.06,0.17,0.27,0.26,0.03,0.08,0.4,0.15,0.13,0.31,0.48,0.31,0.01,0.0,0.06,0.0,0.06,0.41,0.56,0.34,0.02,0.0,0.17,0.0,0.05,0.12,0.57,0.37,0.01,0.0,0.03,0.0
50%,0.39,0.0,0.5,0.24,0.25,0.0,0.11,0.62,0.45,0.4,0.05,0.15,0.58,0.22,0.17,0.38,0.54,0.37,0.02,0.0,0.18,0.01,0.19,0.55,0.6,0.44,0.02,0.0,0.26,0.0,0.22,0.25,0.63,0.59,0.02,0.0,0.1,0.01
75%,0.57,0.0,0.83,0.36,0.37,0.0,0.33,0.8,0.64,0.48,0.09,0.22,0.67,0.3,0.2,0.52,0.56,0.45,0.02,0.0,0.21,0.01,0.25,0.56,0.66,0.57,0.02,0.0,0.44,0.0,0.31,0.54,0.72,0.68,0.03,0.03,0.24,0.03
max,1.0,0.9,1.0,1.05,1.02,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### 2.2 Standardization

> We can keep both scalings and use whichever suits each algorithm (e.g., neural networks may benefit from normalization, while models like logistic regression may perform better with standardized data).

In [65]:
# from sklearn.preprocessing import StandardScaler

# std_scaler = StandardScaler().fit(X_train_num)
# # NOTE: transform with std_scaler (not the MinMax `scaler`), otherwise the result is min-max scaled again
# X_train_scaled_std = std_scaler.transform(X_train_num)
# X_train_scaled_std = pd.DataFrame(X_train_scaled_std, columns = X_train_num.columns).set_index(X_train_encoded.index)

# X_val_scaled_std = std_scaler.transform(X_val_num)
# X_val_scaled_std = pd.DataFrame(X_val_scaled_std, columns = X_val_num.columns).set_index(X_val_encoded.index)


## 3. Feature Selection

### 3.1 Categorical: Filter Methods

#### 3.1.1 Chi square and Cramer's V

In [66]:
# import pandas as pd
# import numpy as np
# from sklearn.feature_selection import chi2
# from scipy.stats import chi2_contingency

# def cramers_v(X, y):
#     # Create the contingency table
#     df_contingency = pd.crosstab(X, y)
    
#     # Perform chi-square test
#     chi2, p, dof, expected = chi2_contingency(df_contingency.values)
    
#     # Calculate Cramér's V
#     n = df_contingency.sum().sum()  # Total number of observations
#     min_dim = min(df_contingency.shape) - 1  # Min between number of rows and columns - 1
#     cramers_v = np.sqrt(chi2 / (n * min_dim))  # Cramér's V formula
    
#     return cramers_v

# # Example for X_train_cat (categorical features) and y_train (target)
# # Perform Chi-square test
# chi2_values, p_values = chi2(X_train_cat, y)

# # Create a DataFrame with Chi-square results
# chi2_results = pd.DataFrame({
#     'Column': X_train_cat.columns,
#     'Chi2': chi2_values.round(5),
#     'p-value': p_values.round(5)
# })

# # Calculate Cramér's V for each feature
# cramers_v_values = []
# for var in X_train_cat.columns:
#     v = cramers_v(X_train_cat[var], y)
#     cramers_v_values.append(v)

# # Add Cramér's V results to the DataFrame
# chi2_results['Cramér\'s V'] = cramers_v_values

# # Set threshold for Cramér's V (e.g., 0.1 for weak association, 0.3 for moderate, etc.)
# cramers_v_threshold = 0.25

# # Filter features based on p-value < 0.05 (Chi-square) and strong association (Cramér's V)
# important_features = chi2_results[(chi2_results['p-value'] < 0.05) & (chi2_results['Cramér\'s V'] >= cramers_v_threshold)]

# # Features to consider removing (p-value < 0.05 but weak association)
# remove_features = chi2_results[(chi2_results['p-value'] < 0.05) & (chi2_results['Cramér\'s V'] < cramers_v_threshold)]

# # Features with p-value >= 0.05 (Chi-square) can generally be discarded
# not_important_features = chi2_results[chi2_results['p-value'] >= 0.05]

# # Print results
# print("Important Features (p-value < 0.05 & Cramér's V >= 0.25):")
# print(important_features[['Column', 'Chi2', 'p-value', 'Cramér\'s V']])

# print("\nFeatures to Consider Removing (p-value < 0.05 but weak Cramér's V < 0.25):")
# print(remove_features[['Column', 'Chi2', 'p-value', 'Cramér\'s V']])

# print("\nNot Important Features (p-value >= 0.05):")
# print(not_important_features[['Column', 'Chi2', 'p-value']])


#### 3.1.2 Mutual Information

In [67]:
# import pandas as pd
# from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
# from sklearn.preprocessing import LabelEncoder

# # Assuming X_train_cat is your categorical feature set and y_train is the target
# # For classification problems:
# mi = mutual_info_classif(X_train_cat, y,discrete_features=True)

# # Create a DataFrame with the results
# mi_results = pd.DataFrame({
#     'Feature': X_train_cat.columns,
#     'Mutual Information': mi
# })

# # Sort the features by their mutual information score
# mi_results = mi_results.sort_values(by='Mutual Information', ascending=False)

# # Display the top features based on MI
# print("Mutual Information Results:")
# mi_results


> For the categorical features, we should keep those that meet the following criteria:
1. Chi-square test: p-value < 0.05
2. Cramér's V > 0.1
3. Mutual information: above 0.1

#### 3.1.3 Drop categorical

In [68]:
# Categorical features removed from all three splits
# (presumably the ones that failed the filter criteria listed above; the tests themselves are commented out)
cat_features_to_drop = [
    'COVID-19 Indicator_binary',
    'Alternative Dispute Resolution_binary',
    'Carrier Type_Self-insured Private Entity',
    'Accident Date_Season_Winter',
    'Carrier Type_Special Funds',
    'Accident Date_Season_Spring',
    'Accident Date_Season_Summer',
]

# Remove the same columns from train, validation and test so the frames stay aligned
X_train_cat_filtered = X_train_cat.drop(cat_features_to_drop, axis='columns')
X_val_cat_filtered = X_val_cat.drop(cat_features_to_drop, axis='columns')
X_test_cat_filtered = X_test_cat.drop(cat_features_to_drop, axis='columns')

# Show what is left
print("Remaining features in X_train_cat:", list(X_train_cat_filtered.columns))


Remaining features in X_train_cat: ['Attorney/Representative_binary', 'Carrier Type_Self-insured Public Entity', 'Carrier Type_State Insurance Fund', 'C-3 Date_nabinary', 'First Hearing Date_nabinary']


### 3.2 Numerical: Filter Methods

#### 3.2.1 Univariate

In [69]:
# X_train_num_scaled.var().sort_values(ascending=False)

#### 3.2.2 Spearman Correlation

In [70]:
# # Flatten the correlation matrix and reset the index
# correlation_pairs = cor_spearman.unstack().reset_index()

# # Rename columns for clarity
# correlation_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']

# # Filter the table for correlations > 0.8 or < -0.8 and exclude self-correlations (diagonal)
# strong_correlations = correlation_pairs[
#     ((correlation_pairs['Correlation'] > 0.8) | (correlation_pairs['Correlation'] < -0.8)) &
#     (correlation_pairs['Feature_1'] != correlation_pairs['Feature_2'])
# ]

# # Drop duplicate pairs (keep unique pairs only)
# strong_correlations = strong_correlations.sort_values(by='Correlation', ascending=False).drop_duplicates(subset=['Correlation'])

# # Display the table
# strong_correlations


#### 3.2.3 Mutual Information

In [71]:
# from sklearn.feature_selection import mutual_info_classif

# # Compute mutual information between the (scaled) numerical features and the target
# mutual_info = mutual_info_classif(X_train_num_scaled, y_train_encoded)

# # Display features sorted by mutual information
# mi_results = pd.DataFrame({
#     'Feature': X_train_num_scaled.columns,
#     'Mutual Information': mutual_info
# }).sort_values(by='Mutual Information', ascending=False)

# mi_results

#### 3.2.4 Drop numerical

In [72]:
# Numerical features removed from all three scaled splits
num_features_to_drop = [
    'Industry Code_encoded_1. CANCELLED',
    'Days_between_C-2 Date_Accident Date_log',
]

# Remove the same columns from train, validation and test so the frames stay aligned
X_train_num_scaled_filtered = X_train_num_scaled.drop(num_features_to_drop, axis='columns')
X_val_num_scaled_filtered = X_val_num_scaled.drop(num_features_to_drop, axis='columns')
X_test_num_scaled_filtered = X_test_num_scaled.drop(num_features_to_drop, axis='columns')

# Show what is left
print("Remaining features in X_train_num_scaled:", list(X_train_num_scaled_filtered.columns))


Remaining features in X_train_num_scaled: ['Age at Injury', 'IME-4 Count', 'Number of Dependents', 'Days_between_Assembly Date_Accident Date_log', 'Average Weekly Wage_log', 'Industry Code_encoded_5. PPD SCH LOSS', 'Industry Code_encoded_2. NON-COMP', 'Industry Code_encoded_3. MED ONLY', 'Industry Code_encoded_4. TEMPORARY', 'Industry Code_encoded_8. DEATH', 'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD', 'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Cause of Injury Code_encoded_2. NON-COMP', 'WCIO Cause of Injury Code_encoded_3. MED ONLY', 'WCIO Cause of Injury Code_encoded_4. TEMPORARY', 'WCIO Cause of Injury Code_encoded_1. CANCELLED', 'WCIO Cause of Injury Code_encoded_8. DEATH', 'WCIO Cause of Injury Code_encoded_6. PPD NSL', 'WCIO Cause of Injury Code_encoded_7. PTD', 'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Nature of Injury Code_encoded_2. NON-COMP', 'WCIO Nature of Injury Code_encoded_3. MED ONLY', 'WCIO Nature of Injury Code_

### 3.3 Combine the filtered datasets

In [73]:
# Rebuild one feature frame per split: filtered categorical columns first,
# then the filtered, scaled numerical columns (aligned on the row index).
X_train = pd.concat([X_train_cat_filtered, X_train_num_scaled_filtered], axis='columns')
X_val = pd.concat([X_val_cat_filtered, X_val_num_scaled_filtered], axis='columns')
X_test = pd.concat([X_test_cat_filtered, X_test_num_scaled_filtered], axis='columns')

# Sanity check: rows x columns of the combined training frame
print("Shape of combined X_train:", X_train.shape)


Shape of combined X_train: (396097, 41)


### 3.4 Random forest, LASSO, Ridge

#### 3.4.1 Random Forest

In [74]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest (fixed seed) on the combined training features
# and the integer-encoded target.
rf_model = RandomForestClassifier(n_estimators=100, random_state=47).fit(X_train, y_train_encoded)

# One importance value per column of X_train, in column order
importances = rf_model.feature_importances_

# Column positions ordered from most to least important
ascending_positions = importances.argsort()
important_indices = ascending_positions[::-1]

print("Feature importances:", importances)
print("Sorted feature indices:", important_indices)


Feature importances: [0.04505945 0.00655995 0.00784613 0.02553499 0.04005868 0.0687418
 0.06254693 0.0402515  0.06924674 0.27647899 0.01427637 0.01065158
 0.01257681 0.01111369 0.01016763 0.01037132 0.00998132 0.0152133
 0.01543345 0.01523031 0.01531065 0.01520027 0.01109206 0.01363941
 0.01052917 0.00792599 0.01489035 0.00824583 0.0121568  0.00843617
 0.00550532 0.00667489 0.00570451 0.01538164 0.01191221 0.01384076
 0.01364452 0.01204579 0.00924222 0.01241826 0.00886222]
Sorted feature indices: [ 9  8  5  6  0  7  4  3 18 33 20 19 17 21 26 10 35 36 23 12 39 28 37 34
 13 22 11 24 15 14 16 38 40 29 27 25  2 31  1 32 30]


In [75]:
import numpy as np

# Running total of the importances, taken from the most to the least important feature
cumulative_importance = np.cumsum(importances[important_indices])

# First position at which the running total reaches the 90% cut-off
# (argmax on a boolean array returns the index of the first True)
threshold = 0.90
reached_cutoff = cumulative_importance >= threshold
index_90 = reached_cutoff.argmax()

# Positions are 0-based, so the feature count is the position plus one
num_features_90 = index_90 + 1

print(f"Number of features explaining 90% of cumulative importance: {num_features_90}")

# Keep the names of that many top-ranked features
top_positions = important_indices[:num_features_90]
random_forest_features = X_train.columns[top_positions]
print(f"Selected features that explain 90% of cumulative importance: {random_forest_features}")


Number of features explaining 90% of cumulative importance: 29
Selected features that explain 90% of cumulative importance: Index(['Average Weekly Wage_log',
       'Days_between_Assembly Date_Accident Date_log', 'Age at Injury',
       'IME-4 Count', 'Attorney/Representative_binary', 'Number of Dependents',
       'First Hearing Date_nabinary', 'C-3 Date_nabinary',
       'WCIO Cause of Injury Code_encoded_2. NON-COMP',
       'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Nature of Injury Code_encoded_2. NON-COMP',
       'Industry Code_encoded_5. PPD SCH LOSS',
       'WCIO Part Of Body Code_encoded_3. MED ONLY',
       'WCIO Part Of Body Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'Industry Code

#### 3.4.2 Lasso

In [76]:
from sklearn.linear_model import LassoCV

# Fit a Lasso whose alpha is chosen by 5-fold cross-validation.
# NOTE(review): LassoCV is a regressor, so the integer class codes are treated
# as a continuous target here — confirm this is intended.
lasso = LassoCV(cv=5).fit(X_train, y_train_encoded)

# A feature counts as selected when its coefficient is not exactly zero
nonzero_coefficient = lasso.coef_ != 0
lasso_features = X_train.columns[nonzero_coefficient]
print("Selected features by Lasso:", lasso_features)

Selected features by Lasso: Index(['Attorney/Representative_binary',
       'Carrier Type_Self-insured Public Entity',
       'Carrier Type_State Insurance Fund', 'C-3 Date_nabinary',
       'First Hearing Date_nabinary', 'Age at Injury', 'IME-4 Count',
       'Days_between_Assembly Date_Accident Date_log',
       'Average Weekly Wage_log', 'Industry Code_encoded_5. PPD SCH LOSS',
       'Industry Code_encoded_4. TEMPORARY', 'Industry Code_encoded_8. DEATH',
       'Industry Code_encoded_6. PPD NSL',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_2. NON-COMP',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'WCIO Nature of Injury Code_encoded_2. NON-COMP',
       'WCIO Nature of Injury Code_encoded_3. MED ONLY',
       'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
       'WCIO Nature of Injury Code_enc

#### 3.4.3 Ridge

In [77]:
from sklearn.linear_model import RidgeCV
import numpy as np

# Fit a Ridge model with 5-fold cross-validation on the training data.
# NOTE(review): RidgeCV is a regressor, so the integer class codes are treated
# as a continuous target here — confirm this is intended.
ridge_model = RidgeCV(cv=5).fit(X_train, y_train_encoded)

# One coefficient per column of X_train
coefficients = ridge_model.coef_

# Keep the features whose absolute coefficient exceeds 0.001.
# This rebinds `threshold`, which the random-forest cell set to 0.90.
threshold = 0.001
large_enough = np.absolute(coefficients) > threshold
ridge_features = X_train.columns[large_enough]

print(f"Selected features using Ridge regression: {ridge_features}")


Selected features using Ridge regression: Index(['Attorney/Representative_binary',
       'Carrier Type_Self-insured Public Entity',
       'Carrier Type_State Insurance Fund', 'C-3 Date_nabinary',
       'First Hearing Date_nabinary', 'Age at Injury', 'IME-4 Count',
       'Days_between_Assembly Date_Accident Date_log',
       'Average Weekly Wage_log', 'Industry Code_encoded_5. PPD SCH LOSS',
       'Industry Code_encoded_2. NON-COMP',
       'Industry Code_encoded_3. MED ONLY',
       'Industry Code_encoded_4. TEMPORARY', 'Industry Code_encoded_8. DEATH',
       'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_2. NON-COMP',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Cause of Injury Code_encoded_8. DEATH',
       'WCIO Cause of

#### 3.4.4 Combined results

> VOTING: AT LEAST 2 OUT OF 3

In [78]:
# Turn each selector's output into a set so the vote can use set algebra.
rf_set = set(random_forest_features)
lasso_set = set(lasso_features)
ridge_set = set(ridge_features)

# Majority vote: a feature is kept when at least two of the three selectors chose it,
# i.e. it belongs to the union of the three pairwise intersections.
intersection = (lasso_set & ridge_set) | (lasso_set & rf_set) | (ridge_set & rf_set)

# Sort instead of calling list(): the iteration order of a set of strings changes
# between interpreter sessions (hash randomisation), which would give the selected
# columns a different order after every kernel restart and make downstream model
# results non-reproducible.
selected_features = sorted(intersection)
print(f"Features selected by at least two models: {selected_features}")

Features selected by at least two models: ['Carrier Type_Self-insured Public Entity', 'WCIO Nature of Injury Code_encoded_2. NON-COMP', 'Industry Code_encoded_4. TEMPORARY', 'WCIO Cause of Injury Code_encoded_6. PPD NSL', 'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS', 'WCIO Nature of Injury Code_encoded_4. TEMPORARY', 'Days_between_Assembly Date_Accident Date_log', 'WCIO Cause of Injury Code_encoded_3. MED ONLY', 'WCIO Part Of Body Code_encoded_4. TEMPORARY', 'Industry Code_encoded_2. NON-COMP', 'WCIO Part Of Body Code_encoded_6. PPD NSL', 'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Nature of Injury Code_encoded_3. MED ONLY', 'WCIO Cause of Injury Code_encoded_7. PTD', 'Age at Injury', 'WCIO Cause of Injury Code_encoded_1. CANCELLED', 'WCIO Part Of Body Code_encoded_2. NON-COMP', 'First Hearing Date_nabinary', 'WCIO Part Of Body Code_encoded_3. MED ONLY', 'WCIO Cause of Injury Code_encoded_2. NON-COMP', 'WCIO Nature of Injury Code_encoded_6. PPD NSL', 'Carrier Type_St

### 3.5 Final features

In [79]:
# Restrict the train, validation and test frames to the voted feature subset,
# applying the same column list to all three so they stay aligned.
X_train, X_val, X_test = (
    split[selected_features] for split in (X_train, X_val, X_test)
)

## 4. Model building

> Note: `y` is not 1-D.

In [27]:
# y_train = y_train_encoded
# y_val = y_val_encoded

In [None]:
# NOTE(review): every line in this cell is commented out, so nothing here executes.
# If re-enabled, it would fit seven baseline classifiers on X_train / y_train,
# predict on X_val, and for each model record accuracy and macro-averaged F1 in
# `results`, print a per-class classification report, and finally print a summary.
# It reads y_train / y_val directly; the markdown note above this section says y is
# not 1-D, and the `y_train = y_train_encoded` reassignment in the preceding cell is
# also commented out -- TODO confirm which target should be used before re-enabling.
# from sklearn.metrics import accuracy_score, classification_report, f1_score
# from sklearn.model_selection import train_test_split

# # Define the models
# models = {
#     "MLPClassifier": MLPClassifier(max_iter=500),
#     "RandomForest": RandomForestClassifier(),
#     "LogisticRegression": LogisticRegression(max_iter=500),
#     "KNeighborsClassifier": KNeighborsClassifier(),
#     "DecisionTree": DecisionTreeClassifier(),
#     "GaussianNB": GaussianNB(),
#     "RidgeClassifier": RidgeClassifier()
# }

# # Dictionary to store evaluation results
# results = {}

# # Iterate over each model
# for model_name, model in models.items():
#     print(f"Training {model_name}...")
#     # Fit the model on the training data
#     model.fit(X_train, y_train)
    
#     # Predict on the validation data
#     y_pred = model.predict(X_val)
    
#     # Evaluate the model
#     accuracy = accuracy_score(y_val, y_pred)
#     f1_macro = f1_score(y_val, y_pred, average='macro')  # Macro-averaged F1 score
    
#     # Store results
#     results[model_name] = {
#         "accuracy": accuracy,
#         "f1_macro": f1_macro
#     }
    
#     # Print individual model results
#     print(f"{model_name} Accuracy: {accuracy:.4f}")
#     print(f"{model_name} Macro F1 Score: {f1_macro:.4f}")
#     print(classification_report(y_val, y_pred))  # Detailed report including precision, recall, and F1 score per class
#     print("-" * 50)

# # Display the summary of results
# print("\nModel Evaluation Results Summary:")
# for model_name, metrics in results.items():
#     print(f"{model_name}: Accuracy = {metrics['accuracy']:.4f}, Macro F1 Score = {metrics['f1_macro']:.4f}")
