## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Scaling
from sklearn.preprocessing import MinMaxScaler

 #Correlation Heatmap
from matplotlib.colors import LinearSegmentedColormap

#Statistical Test
from scipy import stats
from sklearn.impute import SimpleImputer

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import Ridge

pd.set_option('display.max_columns', None)

In [2]:
# You can download the data in the source that is linked above the table of contents

# Read in the data
X_val_encoded = pd.read_csv('../project_data/X_val_encoded.csv', delimiter=',', index_col=0)
X_train_encoded = pd.read_csv('../project_data/X_train_encoded.csv', delimiter=',', index_col=0)

y_train = pd.read_csv('../project_data/y_train.csv',delimiter=',', index_col=0)
y_val= pd.read_csv('../project_data/y_val.csv', delimiter=',', index_col=0)

X_test_encoded = pd.read_csv('../project_data/X_test_encoded.csv',index_col=0)

## 1. Target encoding

In [3]:
claim_injury_type_mapping = {
    '4. TEMPORARY': 4,
    '2. NON-COMP': 2,
    '5. PPD SCH LOSS': 5,
    '3. MED ONLY': 3,
    '6. PPD NSL': 6,
    '1. CANCELLED': 1,
    '8. DEATH': 8,
    '7. PTD': 7
}

y_train_encoded = y_train['Claim Injury Type'].map(claim_injury_type_mapping)
y_val_encoded = y_val['Claim Injury Type'].map(claim_injury_type_mapping)

> Separate columns in numerical and categorical

In [4]:
num_columns = ['Age at Injury', 'Average Weekly Wage', 'Birth Year', 'IME-4 Count', 'Number of Dependents', 'Days_between_Acc_Assembyl']

cat_columns = ['Alternative Dispute Resolution', 'Attorney/Representative', 'C-2 Date', 'C-3 Date', 'COVID-19 Indicator', 'Gender','First Hearing Date',
               'CarrierGroup_Self-insured Private Entity', 'CarrierGroup_Self-insured Public Entity', 'CarrierGroup_Special Funds',
               'CarrierGroup_State Insurance Fund', 'Industry Code_encoded_5. PPD SCH LOSS', 'Industry Code_encoded_2. NON-COMP', 
               'Industry Code_encoded_3. MED ONLY', 'Industry Code_encoded_4. TEMPORARY', 'Industry Code_encoded_1. CANCELLED', 
               'Industry Code_encoded_8. DEATH', 'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD',
               'Injury_Cause_Category_encoded_5. PPD SCH LOSS', 'Injury_Cause_Category_encoded_2. NON-COMP', 
               'Injury_Cause_Category_encoded_3. MED ONLY', 'Injury_Cause_Category_encoded_4. TEMPORARY', 
               'Injury_Cause_Category_encoded_1. CANCELLED', 'Injury_Cause_Category_encoded_8. DEATH','Injury_Cause_Category_encoded_6. PPD NSL',
                'Injury_Cause_Category_encoded_7. PTD', 'Nature_Injury_Occupational', 'Nature_Injury_Specific', 'Nature_Injury_Unknown',
                'Part_Body_Category_encoded_5. PPD SCH LOSS', 'Part_Body_Category_encoded_2. NON-COMP', 'Part_Body_Category_encoded_3. MED ONLY',
                'Part_Body_Category_encoded_4. TEMPORARY', 'Part_Body_Category_encoded_1. CANCELLED', 'Part_Body_Category_encoded_8. DEATH',
                'Part_Body_Category_encoded_6. PPD NSL', 'Part_Body_Category_encoded_7. PTD']


# Create subsets
X_train_num = X_train_encoded[num_columns]
X_train_cat = X_train_encoded[cat_columns]

X_val_num = X_val_encoded[num_columns]
X_val_cat = X_val_encoded[cat_columns]

X_test_num=X_test_encoded[num_columns]
X_test_cat=X_test_encoded[cat_columns]

## 2. Data scaling

### 2.1 Normalization

In [5]:
scaler = MinMaxScaler().fit(X_train_num)
X_train_num_scaled = scaler.transform(X_train_num)
print("Parameters fitted:")
for feature, min_val, max_val in zip(X_train_num.columns, scaler.data_min_, scaler.data_max_):
    print(f"Variable: {feature} | Min: {min_val} | Max: {max_val}")

# Convert the array to a pandas dataframe
X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns = X_train_num.columns).set_index(X_train_encoded.index)
X_train_num_scaled.describe().round(2)

Parameters fitted:
Variable: Age at Injury | Min: 12.0 | Max: 88.0
Variable: Average Weekly Wage | Min: 17.41 | Max: 2284.46
Variable: Birth Year | Min: 1931.0 | Max: 2010.0
Variable: IME-4 Count | Min: 0.0 | Max: 39.0
Variable: Number of Dependents | Min: 0.0 | Max: 6.0
Variable: Days_between_Acc_Assembyl | Min: 0.0 | Max: 433.0


Unnamed: 0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Days_between_Acc_Assembyl
count,381339.0,381339.0,381339.0,381339.0,381339.0,381339.0
mean,0.4,0.5,0.6,0.02,0.5,0.06
std,0.18,0.17,0.17,0.05,0.33,0.12
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,0.36,0.46,0.0,0.17,0.01
50%,0.39,0.51,0.61,0.0,0.5,0.02
75%,0.55,0.56,0.75,0.0,0.83,0.05
max,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
X_val_num_scaled = scaler.transform(X_val_num)
X_val_num_scaled = pd.DataFrame(X_val_num_scaled, columns = X_val_num.columns).set_index(X_val_encoded.index)
X_val_num_scaled.describe().round(2)

Unnamed: 0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Days_between_Acc_Assembyl
count,163404.0,163404.0,163404.0,163404.0,163404.0,163404.0
mean,0.4,0.5,0.6,0.02,0.5,0.06
std,0.18,0.17,0.17,0.05,0.33,0.12
min,-0.01,-0.0,0.0,0.0,0.0,0.0
25%,0.25,0.36,0.46,0.0,0.17,0.01
50%,0.39,0.51,0.61,0.0,0.5,0.02
75%,0.55,0.56,0.75,0.0,0.83,0.05
max,1.0,1.0,1.01,0.87,1.0,1.0


In [7]:
X_test_num_scaled = scaler.transform(X_test_num)
X_test_num_scaled = pd.DataFrame(X_test_num_scaled, columns = X_test_num.columns).set_index(X_test_encoded.index)
X_test_num_scaled.describe().round(2)

Unnamed: 0,Age at Injury,Average Weekly Wage,Birth Year,IME-4 Count,Number of Dependents,Days_between_Acc_Assembyl
count,387975.0,387975.0,387975.0,387975.0,387975.0,387975.0
mean,0.39,0.52,0.63,0.0,0.5,0.15
std,0.19,1.52,0.18,0.02,0.33,1.28
min,-0.16,-0.01,-0.41,0.0,0.0,-0.74
25%,0.24,0.42,0.48,0.0,0.17,0.01
50%,0.37,0.51,0.65,0.0,0.5,0.02
75%,0.54,0.54,0.78,0.0,0.83,0.05
max,1.34,860.28,1.18,0.46,1.0,48.09


### 2.2 Standarization

> Ver melhor

In [8]:
# from sklearn.preprocessing import StandardScaler

# std_scaler = StandardScaler().fit(X_train_num)
# X_train_scaled_std= scaler.transform(X_train_num)
# X_train_scaled_std = pd.DataFrame(X_train_scaled_std, columns = X_train_num.columns).set_index(X_train_encoded.index)

# X_val_scaled_std = scaler.transform(X_val_num)
# X_val_scaled_std = pd.DataFrame(X_val_scaled_std, columns = X_val_num.columns).set_index(X_val_encoded.index)


## 3. Feature Selection

> Numerical only kept IME-4 Count

> Retain all in categorical

In [None]:
# Remerge the numerical and categorical features
X_train = pd.concat([X_train_num[['IME-4 Count']], X_train_cat['']], axis=1)
X_val = pd.concat([X_val_num[['IME-4 Count']], X_val_cat], axis=1)
X_test =pd.concat([X_test_num[['IME-4 Count']], X_test_cat], axis=1)

## 4. Model building

> Não está em 1-D o y

In [10]:
# Define K-Fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Define Repeated K-Fold cross-validation
repeated_kfold = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_train, y_train, kfold, repeated_kfold):
    print(f"Evaluating model: {model}")
    
    # Evaluate using k-fold cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    print("K-Fold Accuracy:", scores.mean())

    # Fit the model and predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    
    print("Metrics on Training Data:")
    print("Accuracy:", accuracy_score(y_train, y_pred))
    print("Precision:", precision_score(y_train, y_pred, average='weighted'))
    print("Recall:", recall_score(y_train, y_pred, average='weighted'))
    print("F1 Score:", f1_score(y_train, y_pred, average='weighted'))


In [12]:
y_train = y_train_encoded
y_val = y_val_encoded

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Define models including the new ones
models = {
    # "MLPClassifier": MLPClassifier(max_iter=500),
    # "RandomForest": RandomForestClassifier(),
    # "LogisticRegression": LogisticRegression(max_iter=500), #aumentar numero iter
    # "KNeighborsClassifier": KNeighborsClassifier(),
    # "DecisionTree": DecisionTreeClassifier(),
    # "GaussianNB": GaussianNB(),
    # "RidgeClassifier": RidgeClassifier(),
    
    # Add XGBoost, LightGBM, and CatBoost
    # "XGBoost": XGBClassifier(eval_metric='mlogloss', use_label_encoder=False),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(silent=True)
}

# Evaluate each model using K-Fold and Repeated K-Fold cross-validation
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    evaluate_model(model, X_train, y_train, kfold, repeated_kfold)


Evaluating LightGBM...
Evaluating model: LGBMClassifier()
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.110388 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 305071, number of used features: 39
[LightGBM] [Info] Start training from score -3.927120
[LightGBM] [Info] Start training from score -0.648927
[LightGBM] [Info] Start training from score -2.110898
[LightGBM] [Info] Start training from score -1.382705
[LightGBM] [Info] Start training from score -2.555660
[LightGBM] [Info] Start training from score -4.941678
[LightGBM] [Info] Start training from score -8.821637
[LightGBM] [Info] Start training from score -7.216654
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.096444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 390
[LightGBM] [Info] Number o

> From this we weill decide on which models to pick.