In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.linear_model import LassoCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test CSV files from the working directory.
train, test = (
    pd.read_csv(csv_name) for csv_name in ('train_data.csv', 'test_data.csv')
)

In [3]:
# Index the rows by claim id, remove rows whose column values are exact
# duplicates, and discard the 'OIICS Nature of Injury Description' column.
train = (
    train
    .set_index('Claim Identifier')
    .drop_duplicates()
    .drop(columns='OIICS Nature of Injury Description')
)

# Extra column pruning, currently disabled:
# train = train.drop(columns=['Birth Year', 'Age at Injury', 'Number of Dependents', 'WCIO Cause of Injury Code'])

In [4]:
# Column names split by dtype: numeric columns vs. every other column.
train_num = list(train.select_dtypes(include=np.number).columns)
train_cat = list(train.select_dtypes(exclude=np.number).columns)

In [5]:
# The label must never be imputed: filling a missing 'Claim Injury Type' with the
# column mode (or mean) would fabricate training labels. Rows without a label are
# dropped instead, and the label column is excluded from both imputers below.
target_col = 'Claim Injury Type'
train = train.dropna(subset=[target_col]).copy()
num_features = [col for col in train_num if col != target_col]
cat_features = [col for col in train_cat if col != target_col]

# NOTE(review): both imputers are fitted on the whole training frame, i.e. before
# any cross-validation split, so validation rows contribute to the fill values.

# Numerical feature columns: impute with the column mean
num_imputer = SimpleImputer(strategy="mean")
train[num_features] = pd.DataFrame(
    num_imputer.fit_transform(train[num_features]),
    columns=num_features,
    index=train.index
)

# Categorical feature columns: impute with the most frequent value (mode)
cat_imputer = SimpleImputer(strategy="most_frequent")
train[cat_features] = pd.DataFrame(
    cat_imputer.fit_transform(train[cat_features]),
    columns=cat_features,
    index=train.index
)

In [6]:
# Separate the label column from the predictor columns.
target_name = 'Claim Injury Type'
y = train[target_name]
X = train.drop(columns=[target_name])

In [7]:
# Partition the predictors by dtype. select_dtypes keeps X's row index,
# so the resulting frames are already indexed like X.
X_num = X.select_dtypes(include=np.number)
X_cat = X.select_dtypes(exclude=np.number)

In [8]:
# Min-max scale the numeric predictors. The scaler returns a plain array, so the
# result is wrapped back into a DataFrame carrying X_num's columns and X's index.
# NOTE(review): the scaler is fitted on all rows, before any cross-validation split.
scaler = MinMaxScaler()
X_num_scaled = pd.DataFrame(
    scaler.fit_transform(X_num),
    columns=X_num.columns,
    index=X.index,
)

In [9]:
# Cast every categorical predictor to str so the encoder sees one uniform type.
X_cat = X_cat.astype(str)

enc1 = OrdinalEncoder()  # encoder for the categorical features
enc2 = LabelEncoder()    # encoder for the target labels

# Encode the features and rebuild a DataFrame aligned with X.
X_cat_encoded = pd.DataFrame(
    enc1.fit_transform(X_cat),
    columns=X_cat.columns,
    index=X.index,
)
y_encoded = enc2.fit_transform(y)

In [10]:
# Reassemble the model matrix: scaled numeric columns followed by the encoded categoricals.
X = pd.concat([X_num_scaled, X_cat_encoded], axis=1)
# Give the encoded labels the same row index as X. Without index=..., y would get a
# fresh 0..n-1 RangeIndex and any label-based alignment of X and y would mismatch.
y = pd.DataFrame(y_encoded, columns=['Claim Injury Type'], index=X.index)

In [11]:
# Disabled hold-out split, kept as a bare string literal so it does not execute.
# It refers to X_combined and y_encoded_df, which no cell shown here defines.
""" X_train, X_val, y_train, y_val = train_test_split(X_combined, y_encoded_df, test_size=0.25, stratify = y_encoded_df, random_state=5) """

' X_train, X_val, y_train, y_val = train_test_split(X_combined, y_encoded_df, test_size=0.25, stratify = y_encoded_df, random_state=5) '

In [12]:
# Disabled baseline fit (string literal, not executed); X_train / y_train would
# come from the hold-out split, which is itself disabled.
""" model = MLPClassifier().fit(X_train, y_train) """

' model = MLPClassifier().fit(X_train, y_train) '

In [13]:
# Disabled evaluation of the baseline model on the hold-out split
# (macro F1 and model.score); string literal, not executed.
""" pred_val = model.predict(X_val)
print('F1 Score:' ,f1_score(y_val, pred_val, average='macro'))
print('Mean Accuracy:' ,model.score(X_val, y_val)) """

" pred_val = model.predict(X_val)\nprint('F1 Score:' ,f1_score(y_val, pred_val, average='macro'))\nprint('Mean Accuracy:' ,model.score(X_val, y_val)) "

In [14]:
import time


def avg_score(model, n_splits=10):
    """
    Cross-validate ``model`` on the notebook-level ``X`` / ``y`` and summarise the folds.

    The data is split with an unshuffled StratifiedKFold. For every fold the model
    is re-fitted on the training part, the fit is timed, and the macro-averaged F1
    score is computed on both the training and the validation part.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        It is fitted in place; after the call it holds the fit of the last fold.
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    tuple of four str
        ``(fit time, train F1, validation F1, iterations)``, each formatted as
        ``'<mean>+/-<std>'``. The iterations entry is ``'nan+/-nan'`` when the
        estimator has no ``n_iter_`` attribute.
    """
    skf = StratifiedKFold(n_splits=n_splits)
    # y is stored as a single-column DataFrame; flatten it once so that fit()
    # receives 1-D labels rather than a column vector.
    labels = np.ravel(y)

    # per-fold results
    score_train = []
    score_val = []
    timer = []
    n_iter = []

    for train_index, val_index in skf.split(X, labels):
        # rows assigned to each partition of the current fold
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = labels[train_index], labels[val_index]

        # time only the fit itself
        begin = time.perf_counter()
        model.fit(X_train, y_train)
        end = time.perf_counter()

        # macro F1 on the training and on the validation partition
        score_train.append(f1_score(y_train, model.predict(X_train), average='macro'))
        score_val.append(f1_score(y_val, model.predict(X_val), average='macro'))
        timer.append(end - begin)
        # Not every estimator reports an iteration count; record NaN instead of
        # failing with AttributeError for those that do not.
        n_iter.append(getattr(model, 'n_iter_', np.nan))

    # mean and standard deviation of each measure (time, F1 scores, iterations)
    avg_time = round(np.mean(timer), 3)
    avg_train = round(np.mean(score_train), 3)
    avg_val = round(np.mean(score_val), 3)
    std_time = round(np.std(timer), 2)
    std_train = round(np.std(score_train), 2)
    std_val = round(np.std(score_val), 2)
    avg_iter = round(np.mean(n_iter), 1)
    std_iter = round(np.std(n_iter), 1)

    return (
        str(avg_time) + '+/-' + str(std_time),
        str(avg_train) + '+/-' + str(std_train),
        str(avg_val) + '+/-' + str(std_val),
        str(avg_iter) + '+/-' + str(std_iter),
    )

def show_results(df, *args):
    """
    Fill ``df`` with the avg_score summary of every model passed in ``args``.

    The four strings returned by avg_score for the i-th model are written, by
    position, into the i-th row of ``df``. The same ``df`` object is returned.
    """
    for row_pos, model in enumerate(args):
        # unpack explicitly so the row always receives exactly four values
        fit_time, avg_train, avg_val, avg_iter = avg_score(model)
        df.iloc[row_pos] = fit_time, avg_train, avg_val, avg_iter
    return df

In [15]:
# hidden_layer_sizes is written as a real one-element tuple: "(1)" is just the int 1.
# random_state fixes the weight initialisation and batch shuffling so the reported
# scores can be reproduced between runs.
model_simple = MLPClassifier(hidden_layer_sizes=(1,), random_state=5)
#model_medium = MLPClassifier(hidden_layer_sizes=(8,), random_state=5)
#model_complex = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100), random_state=5)

In [16]:
# Empty results table: one row per model, one column per measure returned by avg_score.
df = pd.DataFrame(
    index=['Simple'],
    columns=['Time', 'Train', 'Validation', 'Iterations'],
)
show_results(df, model_simple)

Unnamed: 0,Time,Train,Validation,Iterations
Simple,27.591+/-2.28,0.084+/-0.0,0.084+/-0.0,17.1+/-0.3


In [17]:
# Disabled experiment (string literal, not executed): an MLP with max_iter = 20.
# Note that "(8)" is the int 8, not a one-element tuple.
""" model_maxiter_20 = MLPClassifier(max_iter = 20, hidden_layer_sizes=(8)) """

' model_maxiter_20 = MLPClassifier(max_iter = 20, hidden_layer_sizes=(8)) '