In [105]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.linear_model import LassoCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, make_scorer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [106]:
# Read the raw training and test tables from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

In [107]:
# Index the training rows by 'Claim Identifier', drop rows whose column values
# duplicate an earlier row, and discard the injury-nature description column.
train = (
    train
    .set_index('Claim Identifier')
    .drop_duplicates()
    .drop(columns='OIICS Nature of Injury Description')
)

In [108]:
# Negative body-part codes are collapsed to 0. The vectorised clip gives the same
# result as the former row-wise lambda: missing values stay missing.
train['WCIO Part Of Body Code'] = train['WCIO Part Of Body Code'].clip(lower=0)

# Parse the date columns in both tables; values that cannot be parsed become NaT.
date_cols = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date']
for col in date_cols:
    train[col] = pd.to_datetime(train[col], errors='coerce')
    test[col] = pd.to_datetime(test[col], errors='coerce')

# Store the count-like training columns as pandas' nullable integer dtype so
# missing entries are kept as <NA>.
int_cols = ['Age at Injury', 'Birth Year', 'IME-4 Count', 'Number of Dependents']
for col in int_cols:
    train[col] = train[col].astype('Int64')

In [109]:
# Rows without a target label cannot be used for supervised training.
train = train.dropna(subset=['Claim Injury Type'])

In [110]:
# A missing 'IME-4 Count' is replaced by 0; no other column is touched.
train = train.fillna({'IME-4 Count': 0})

In [111]:
# Code columns are identifiers rather than quantities, so they are stored as
# objects and later handled as categorical features.
float_to_object = [
    'Industry Code',
    'WCIO Cause of Injury Code',
    'WCIO Nature of Injury Code',
    'WCIO Part Of Body Code',
]
train = train.astype({code_col: 'object' for code_col in float_to_object})

In [112]:
# Columns excluded from modelling.
train = train.drop(
    columns=[
        'Birth Year',
        'Age at Injury',
        'Number of Dependents',
        'WCIO Cause of Injury Code',
    ]
)

In [113]:
# Split the columns by dtype before anything is imputed.
train_num = list(train.select_dtypes(include=np.number).columns)
train_cat = list(train.select_dtypes(exclude=np.number).columns)

# Numeric columns are filled with the column mean, every other column with its
# most frequent value.
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

for imputer, cols in ((num_imputer, train_num), (cat_imputer, train_cat)):
    filled = imputer.fit_transform(train[cols])
    # fit_transform returns a bare array; wrap it so column labels and the row
    # index line up with `train` on assignment.
    train[cols] = pd.DataFrame(filled, columns=cols, index=train.index)

In [114]:
def identify_outliers_iqr_column(df, column):
    """Flag the values of ``df[column]`` that fall outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Label of the column to inspect.

    Returns
    -------
    pandas.Series
        Boolean mask with the same index as ``df``; True where the value is
        below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    too_low = values < q1 - whisker
    too_high = values > q3 + whisker
    return too_low | too_high

In [115]:
wage_col = 'Average Weekly Wage'

# Rows with a wage of exactly 0 are left out of the IQR computation and are
# never dropped.
not_voluntary_train = train[wage_col].ne(0)

# The outlier mask only covers the non-zero-wage rows; `&` aligns it with the
# full-length mask on the index before the rows are filtered.
outliers = identify_outliers_iqr_column(train.loc[not_voluntary_train], wage_col)
rows_to_drop = not_voluntary_train & outliers
train_cleaned = train.loc[~rows_to_drop]

In [116]:
# Separate the target column from the feature columns.
target_col = 'Claim Injury Type'
y = train_cleaned[target_col]
X = train_cleaned.drop(columns=target_col)

In [117]:
# Numeric and non-numeric feature blocks; select_dtypes keeps X's row index.
X_num = X.select_dtypes(include=np.number)
X_cat = X.select_dtypes(exclude=np.number)

In [118]:
def _scaled_frame(scaler):
    """Fit ``scaler`` on X_num and return the scaled values as a DataFrame
    with X_num's columns and X's row index."""
    scaler.fit(X_num)
    scaled = scaler.transform(X_num)  # plain array, labels are re-attached below
    return pd.DataFrame(scaled, columns=X_num.columns, index=X.index)


# Min-max scaling to the default [0, 1] range.
min_max = MinMaxScaler()
X_num_scaled_min_max = _scaled_frame(min_max)

# Min-max scaling to [-1, 1].
min_max2 = MinMaxScaler(feature_range=(-1, 1))
X_num_scaled_min_max2 = _scaled_frame(min_max2)

# Standardisation.
standard_scaler = StandardScaler()
X_num_scaled_standard = _scaled_frame(standard_scaler)

# Robust scaling.
robust_scaler = RobustScaler()
X_num_scaled_robust = _scaled_frame(robust_scaler)

In [119]:
# Every non-numeric feature is turned into its string form before encoding.
X_cat = X_cat.astype(str)

# Features: one ordinal code per distinct string, column by column.
enc1 = OrdinalEncoder().fit(X_cat)
X_cat_encoded = pd.DataFrame(
    enc1.transform(X_cat),
    columns=X_cat.columns,
    index=X.index,
)

# Target: class labels mapped to integers.
enc2 = LabelEncoder()
y_encoded = enc2.fit_transform(y)


In [120]:
#X_minmax = pd.concat([X_num_scaled_min_max, X_cat_encoded], axis=1)
#X_minmax2 = pd.concat([X_num_scaled_min_max2, X_cat_encoded], axis=1)
#X_standard = pd.concat([X_num_scaled_standard, X_cat_encoded], axis=1)
# Feature matrix used by the models: robust-scaled numeric columns next to the
# ordinal-encoded categorical columns.
X_robust = pd.concat([X_num_scaled_robust, X_cat_encoded], axis=1)
# The encoded target is a bare array. Give it the feature index so that rows of
# X_robust and y_encoded_df match by label as well as by position.
y_encoded_df = pd.DataFrame(
    y_encoded,
    columns=['Claim Injury Type'],
    index=X_robust.index,
)

In [121]:
def run_model(X,y, model):
    """Fit ``model`` on ``(X, y)`` and return whatever ``model.fit`` returns."""
    fitted = model.fit(X, y)
    return fitted

In [122]:
def evaluate_model(X,y, model):
    """Return the macro-averaged F1 score of ``model``'s predictions for ``X``
    measured against the true labels ``y``."""
    predictions = model.predict(X)
    return f1_score(y, predictions, average='macro')

In [123]:
def avg_f1_score(X,y,model, method=None):
    """Fit ``model`` and report its macro-F1 on training and held-out data.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    y : pandas.Series or pandas.DataFrame
        Target, positionally aligned with ``X``.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is refitted for every split.
    method : cross-validation splitter or None, default None
        ``None`` evaluates one stratified 70/30 hold-out split (random_state=0).
        Otherwise every ``(train, test)`` index pair produced by
        ``method.split(X, y)`` is evaluated. ``y`` is always passed so that
        stratified splitters of any kind work; splitters that do not need it
        ignore it.

    Returns
    -------
    tuple of float
        ``(mean_train_f1, mean_test_f1)``; the same two values are printed.
    """
    if method is None:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.3,
            random_state=0,
            stratify=y,
            shuffle=True,
        )
        folds = [(X_train, X_test, y_train, y_test)]
    else:
        folds = (
            (X.iloc[train_index], X.iloc[test_index],
             y.iloc[train_index], y.iloc[test_index])
            for train_index, test_index in method.split(X, y)
        )

    score_train = []
    score_test = []
    for X_train, X_test, y_train, y_test in folds:
        model = run_model(X_train, y_train, model)
        score_train.append(evaluate_model(X_train, y_train, model))
        score_test.append(evaluate_model(X_test, y_test, model))

    # With a single hold-out split the mean is just that split's score.
    mean_train = np.mean(score_train)
    mean_test = np.mean(score_test)
    print('Train:', mean_train)
    print('Test:', mean_test)
    return mean_train, mean_test

In [124]:
# Splitters that can be passed as `method` to avg_f1_score.
kf = KFold(n_splits=10)  # more splits mean more model fits, hence slower runs
# RepeatedKFold reshuffles on every repetition; the seed keeps the folds
# identical from one notebook run to the next.
rkf = RepeatedKFold(n_splits=6, n_repeats=2, random_state=0)
skf = StratifiedKFold(n_splits=10)
normal_split = None  # None selects the single hold-out split in avg_f1_score

In [125]:
# Baseline: decision tree with default hyper-parameters, scored on the single
# stratified hold-out split. An earlier unseeded run gave macro-F1 of about
# 0.99 (train) / 0.39 (test).
print('Decision Tree')
dt = DecisionTreeClassifier(random_state=0)  # seeded so reruns build the same tree
avg_f1_score(X_robust, y_encoded_df, dt)

Decision Tree
Train: 0.9999743737183131
Test: 0.3935725793952795


In [126]:
from xgboost import XGBClassifier

# Stratified 70/30 hold-out split with the same settings avg_f1_score uses.
# A recorded run of this cell gave macro-F1 of about 0.64 (train) / 0.45 (test).
X_train, X_test, y_train, y_test = train_test_split(
    X_robust, y_encoded_df,
    test_size=0.3,
    random_state=0,
    shuffle=True,
    stratify=y_encoded_df,
)

xgb_params = dict(
    n_estimators=250,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=2,
    tree_method='hist',
    enable_categorical=True,
    objective='multi:softprob',
    num_class=8,
    eval_metric=['mlogloss', 'merror'],
    use_label_encoder=False,
)
xg = XGBClassifier(**xgb_params)

# The evaluation set is the training split itself, and no early-stopping option
# is passed, so all n_estimators rounds are requested.
eval_set = [(X_train, y_train)]
xg.fit(X_train, y_train, eval_set=eval_set, verbose=False)

value_train = evaluate_model(X_train, y_train, xg)
value_test = evaluate_model(X_test, y_test, xg)
print('Train:', value_train)
print('Test:', value_test)

Train: 0.6468499867648003
Test: 0.4588112705230834


In [127]:
# Disabled experiment: the cell body is a bare string literal, so none of the
# code inside it runs; the notebook only echoes the string as the cell output.
# If re-enabled it would score an MLPClassifier with the repeated K-fold splitter.
""" print('MLP')
mlp = MLPClassifier()
avg_f1_score(X_robust, y_encoded_df, mlp, rkf) """

" print('MLP')\nmlp = MLPClassifier()\navg_f1_score(X_robust, y_encoded_df, mlp, rkf) "

In [128]:
# Disabled code: the cell body is a bare string literal, so nothing inside it is
# executed and none of the *_cat / *_num split variables are created.
# The text describes two stratified 70/30 splits with the same seed, one of the
# encoded categorical features and one of the min-max-scaled numeric features.
""" X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X_cat_encoded,y_encoded_df, test_size = 0.3, 
                                                  random_state = 0, 
                                                  stratify = y_encoded_df, 
                                                  shuffle = True)

X_train_num, X_test_num, y_train_num, y_test_num = train_test_split(X_num_scaled_min_max,y_encoded_df, test_size = 0.3, 
                                                  random_state = 0, 
                                                  stratify = y_encoded_df, 
                                                  shuffle = True) """

' X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X_cat_encoded,y_encoded_df, test_size = 0.3, \n                                                  random_state = 0, \n                                                  stratify = y_encoded_df, \n                                                  shuffle = True)\n\nX_train_num, X_test_num, y_train_num, y_test_num = train_test_split(X_num_scaled_min_max,y_encoded_df, test_size = 0.3, \n                                                  random_state = 0, \n                                                  stratify = y_encoded_df, \n                                                  shuffle = True) '

In [129]:
# Disabled experiment: the cell body is a bare string literal, so nothing inside
# it is executed. The text describes fitting CategoricalNB on the categorical
# split and GaussianNB on the numeric split, averaging the two predicted
# probability matrices, taking the argmax as the prediction and scoring it with
# macro-F1 (the inline note records about 0.37 train / 0.30 test).
# It uses X_train_cat, X_train_num, etc., which are only assigned inside the
# disabled split cell just before it, so both cells must be re-enabled together.
""" print('Naive Bayes') #0.37/0.30
cnb = CategoricalNB(alpha= 0.1)
gnb = GaussianNB(var_smoothing=1e-7)
model_cat = run_model(X_train_cat, y_train_cat, cnb)
model_num = run_model(X_train_num, y_train_num, gnb)
# Obter probabilidades de previsão
prob_cat_train = model_cat.predict_proba(X_train_cat)
prob_num_train = model_num.predict_proba(X_train_num)
prob_cat_test = model_cat.predict_proba(X_test_cat)
prob_num_test = model_num.predict_proba(X_test_num)
# Combinar probabilidades (média)
prob_combined_train = (prob_cat_train + prob_num_train) / 2
prob_combined_test = (prob_cat_test + prob_num_test) / 2
# Predizer classe final
y_pred_combined_train = np.argmax(prob_combined_train, axis=1)
y_pred_combined_test = np.argmax(prob_combined_test, axis=1)
# Avaliar o modelo combinado
print('Train:', f1_score(y_train_num, y_pred_combined_train, average='macro'))# y_test_num ou y_test_cat são os mesmos
print('Test:', f1_score(y_test_num, y_pred_combined_test, average='macro')) """

" print('Naive Bayes') #0.37/0.30\ncnb = CategoricalNB(alpha= 0.1)\ngnb = GaussianNB(var_smoothing=1e-7)\nmodel_cat = run_model(X_train_cat, y_train_cat, cnb)\nmodel_num = run_model(X_train_num, y_train_num, gnb)\n# Obter probabilidades de previsão\nprob_cat_train = model_cat.predict_proba(X_train_cat)\nprob_num_train = model_num.predict_proba(X_train_num)\nprob_cat_test = model_cat.predict_proba(X_test_cat)\nprob_num_test = model_num.predict_proba(X_test_num)\n# Combinar probabilidades (média)\nprob_combined_train = (prob_cat_train + prob_num_train) / 2\nprob_combined_test = (prob_cat_test + prob_num_test) / 2\n# Predizer classe final\ny_pred_combined_train = np.argmax(prob_combined_train, axis=1)\ny_pred_combined_test = np.argmax(prob_combined_test, axis=1)\n# Avaliar o modelo combinado\nprint('Train:', f1_score(y_train_num, y_pred_combined_train, average='macro'))# y_test_num ou y_test_cat são os mesmos\nprint('Test:', f1_score(y_test_num, y_pred_combined_test, average='macro')) "