In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.linear_model import LassoCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, make_scorer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [3]:
train.set_index('Claim Identifier', inplace=True)
train.drop_duplicates(inplace=True)
train.drop('OIICS Nature of Injury Description', axis=1, inplace=True)

#train.drop(columns=['Birth Year', 'Age at Injury', 'Number of Dependents', 'WCIO Cause of Injury Code'], inplace=True)

In [4]:
train_num = train.select_dtypes(include=np.number).columns.tolist()
train_cat = train.select_dtypes(exclude=np.number).columns.tolist()

# Numerical columns: Impute with mean
num_imputer = SimpleImputer(strategy="mean")
train[train_num] = pd.DataFrame(
    num_imputer.fit_transform(train[train_num]),
    columns=train_num,
    index=train.index
)

# Categorical columns: Impute with most frequent (mode)
cat_imputer = SimpleImputer(strategy="most_frequent")
train[train_cat] = pd.DataFrame(
    cat_imputer.fit_transform(train[train_cat]),
    columns=train_cat,
    index=train.index
)

In [5]:
X = train.drop('Claim Injury Type', axis=1)
y = train['Claim Injury Type']

In [6]:
X_num = X.select_dtypes(include=np.number).set_index(X.index)
X_cat = X.select_dtypes(exclude=np.number).set_index(X.index)

In [7]:
min_max = MinMaxScaler()
min_max.fit(X_num) #fit to training data
X_num_scaled_min_max = min_max.transform(X_num) # this will return an array
X_num_scaled_min_max = pd.DataFrame(X_num_scaled_min_max, columns = X_num.columns).set_index(X.index) # Convert the array to a pandas dataframe

min_max2 = MinMaxScaler(feature_range=(-1, 1))
min_max2.fit(X_num) #fit to training data
X_num_scaled_min_max2 = min_max2.transform(X_num) # this will return an array
X_num_scaled_min_max2 = pd.DataFrame(X_num_scaled_min_max2, columns = X_num.columns).set_index(X.index) # Convert the array to a pandas dataframe

# StandardScaler
standard_scaler = StandardScaler()
standard_scaler.fit(X_num) # fit to training data
X_num_scaled_standard = standard_scaler.transform(X_num) # this will return an array
X_num_scaled_standard = pd.DataFrame(X_num_scaled_standard, columns=X_num.columns).set_index(X.index) # Convert the array to a pandas dataframe

# RobustScaler
robust_scaler = RobustScaler()
robust_scaler.fit(X_num) # fit to training data
X_num_scaled_robust = robust_scaler.transform(X_num) # this will return an array
X_num_scaled_robust = pd.DataFrame(X_num_scaled_robust, columns=X_num.columns).set_index(X.index) # Convert the array to a pandas dataframe

In [8]:
X_cat = X_cat.astype(str)

enc1 = OrdinalEncoder() #encoder for features
enc2 = LabelEncoder() #encoder for labels
enc1.fit(X_cat)
X_cat_encoded = pd.DataFrame(enc1.transform(X_cat), columns = X_cat.columns).set_index(X.index)
y_encoded = enc2.fit_transform(y)

In [9]:
X_minmax = pd.concat([X_num_scaled_min_max, X_cat_encoded], axis=1)
X_minmax2 = pd.concat([X_num_scaled_min_max2, X_cat_encoded], axis=1)
X_standard = pd.concat([X_num_scaled_standard, X_cat_encoded], axis=1)
X_robust = pd.concat([X_num_scaled_robust, X_cat_encoded], axis=1)
y_encoded_df = pd.DataFrame(y_encoded, columns=['Claim Injury Type'])

In [10]:
def run_model(X,y, model):
    return model.fit(X,y)

In [11]:
def evaluate_model(X,y, model):
    y_pred = model.predict(X)
    return f1_score(y, y_pred, average='macro')

In [12]:
def avg_f1_score(X,y,model, method=None):
    score_train = []
    score_test = []
    if isinstance(model, CategoricalNB) and method is not None: #CategoricalNB does not support kfold, we need to stratify
        return
    if method is None:
        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, 
                                                  random_state = 0, 
                                                  stratify = y, 
                                                  shuffle = True)
        model = run_model(X_train, y_train, model)
        value_train = evaluate_model(X_train, y_train, model)
        value_test = evaluate_model(X_test, y_test, model)
        print('Train:', value_train)
        print('Test:', value_test)
    elif isinstance(method, StratifiedKFold):
        for train_index, test_index in method.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            model = run_model(X_train, y_train, model)
            value_train = evaluate_model(X_train, y_train, model)
            value_test = evaluate_model(X_test, y_test, model)
            score_train.append(value_train)
            score_test.append(value_test)

        print('Train:', np.mean(score_train))
        print('Test:', np.mean(score_test))
    else:
        for train_index, test_index in method.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            model = run_model(X_train, y_train, model)
            value_train = evaluate_model(X_train, y_train, model)
            value_test = evaluate_model(X_test, y_test, model)
            score_train.append(value_train)
            score_test.append(value_test)

        print('Train:', np.mean(score_train))
        print('Test:', np.mean(score_test))

In [13]:
kf = KFold(n_splits=10) #if the splits are too many, poor efficiency
rkf = RepeatedKFold(n_splits=6, n_repeats=2)  
skf = StratifiedKFold(n_splits=10)  
normal_split = None

In [14]:
dt = DecisionTreeClassifier()
log = LogisticRegression()
cnb = CategoricalNB()
gnb = GaussianNB(var_smoothing=0.0001)
knn = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=19) #kd_tree is faster for large datasets and n_neighbors is the best value for this dataset
mlp = MLPClassifier()

#start changing hyperparameters

models = [dt, log, gnb, mlp] #knn
scaler = ['minmax', 'standard', 'robust']
data_scaled = [X_minmax, X_minmax2, X_standard, X_robust]
for data, s in zip(data_scaled, scaler):
    print("Scaler: ", s)
    for model in models:
        print("Model: ", model)
        for method in [rkf, normal_split]:
            print("Method: ", method)
            avg_f1_score(data, y_encoded_df, model, method)

Scaler:  minmax
Model:  DecisionTreeClassifier()
Method:  RepeatedKFold(n_repeats=2, n_splits=6, random_state=None)
Train: 0.9999958208355446
Test: 0.38330202502938454
Method:  None
Train: 1.0
Test: 0.3815760263088144
Model:  LogisticRegression()
Method:  RepeatedKFold(n_repeats=2, n_splits=6, random_state=None)
Train: 0.10398471934623876
Test: 0.10397681538161746
Method:  None
Train: 0.10529113225531689
Test: 0.10559662374073271
Model:  GaussianNB(var_smoothing=0.0001)
Method:  RepeatedKFold(n_repeats=2, n_splits=6, random_state=None)
Train: 0.18179852576293878
Test: 0.18157608473218756
Method:  None
Train: 0.1825618165171516
Test: 0.17991583799629762
Model:  MLPClassifier()
Method:  RepeatedKFold(n_repeats=2, n_splits=6, random_state=None)
Train: 0.27024997767728587
Test: 0.2695480323951222
Method:  None
Train: 0.24170397868077245
Test: 0.2392474912379419
Scaler:  standard
Model:  DecisionTreeClassifier()
Method:  RepeatedKFold(n_repeats=2, n_splits=6, random_state=None)
Train: 0.999

In [None]:
#não consigo usar o StratifiedKFold com o CategoricalNB, dá erro e não sei como resolver
#imputing
#scaling
#encoding

#Decision Tree overfits a lot, from 0.99 to 0.38
#Logistic Regression is more stable, don't overfit but as poor results, from 0.10 to 0.10
#using all columns or doing the feature selection doesn't change the results

#starting using Kfold, repeated Kfold and normal split
#no better results, Decision Tree still overfits a lot, Logistic Regression still poor results

#adding CategoricalNB, GaussianNB and KNeighborsClassifier
#CategoricalNB 0.32
#GaussianNB 0.18
#KNeighborsClassifier have just 0.23

#imputing with KnnImputer takes too long, similar to using KNeighborsClassifier
#maybe just using the imputer when it makes sense

#final results:
#the best is Decision Tree, 0.38
#Logistic Regression is the worst, 0.10
#CategoricalNB is 0.32 not so good as Decision Tree but don't overfit too much
#GaussianNB is 0.18
#MLPClassifier is 0.33

#the best results is using Decision Tree with any scaler and any method
#CategoricalNB and MPL Classifier with roubust scaler presents good results and don't overfit