In [156]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [157]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [158]:
train.set_index('Claim Identifier', inplace=True)
train.drop_duplicates(inplace=True)
train.drop('OIICS Nature of Injury Description', axis=1, inplace=True)

train.drop(columns=['Birth Year', 'Age at Injury', 'Number of Dependents', 'WCIO Cause of Injury Code'], inplace=True)

In [159]:
train_num = train.select_dtypes(include=np.number).columns.tolist()
train_cat = train.select_dtypes(exclude=np.number).columns.tolist()

In [160]:
# Numerical columns: Impute with mean
num_imputer = SimpleImputer(strategy="mean")
train[train_num] = pd.DataFrame(
    num_imputer.fit_transform(train[train_num]),
    columns=train_num,
    index=train.index
)

# Categorical columns: Impute with most frequent (mode)
cat_imputer = SimpleImputer(strategy="most_frequent")
train[train_cat] = pd.DataFrame(
    cat_imputer.fit_transform(train[train_cat]),
    columns=train_cat,
    index=train.index
)

In [161]:
X = train.drop(columns='Claim Injury Type')
y = train['Claim Injury Type']

In [162]:
X_num = X.select_dtypes(include=np.number).set_index(X.index)
X_cat = X.select_dtypes(exclude=np.number).set_index(X.index)

In [163]:
scaler = MinMaxScaler()
scaler.fit(X_num) #fit to training data
X_num_scaled = scaler.transform(X_num) # this will return an array
X_num_scaled = pd.DataFrame(X_num_scaled, columns = X_num.columns).set_index(X.index) # Convert the array to a pandas dataframe

In [164]:
label_encoders = {}
for col in X_cat.columns:
    le = LabelEncoder()
    le.fit(X_cat[col].astype(str))
    X_cat[col] = le.transform(X_cat[col].astype(str))
    label_encoders[col] = le

In [165]:
ordinal_encoder = OrdinalEncoder()
y_encoded = ordinal_encoder.fit_transform(y.values.reshape(-1, 1))

In [166]:
X_combined = pd.concat([X_num_scaled, X_cat], axis=1)
y_encoded_df = pd.DataFrame(y_encoded, columns=['Claim Injury Type'])

In [167]:
def run_model_LR(X,y):
    model = LogisticRegression().fit(X,y)
    return model

In [168]:
def evaluate_model(X,y, model):
    y_pred = model.predict(X)
    return f1_score(y, y_pred, average='macro')

In [169]:
def avg_score_LR(method,X,y):
    score_train = []
    score_test = []
    for train_index, test_index in method.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model = run_model_LR(X_train, y_train)
        value_train = evaluate_model(X_train, y_train, model)
        value_test = evaluate_model(X_test,y_test, model)
        score_train.append(value_train)
        score_test.append(value_test)

    print('Train:', np.mean(score_train))
    print('Test:', np.mean(score_test))

In [170]:
kf = KFold(n_splits=10) #if the splits are too many, poor efficiency

In [171]:
avg_score_LR(kf, X_combined, y_encoded_df)

Train: 0.10491548496550632
Test: 0.09816413934303775


In [172]:
rkf = RepeatedKFold(n_splits=6, n_repeats=2)
avg_score_LR(rkf, X_combined, y_encoded_df)

Train: 0.10389641879963367
Test: 0.10390441940548605


In [173]:
""" loo = LeaveOneOut()
avg_score_LR(loo, X_combined, y_encoded_df) """

' loo = LeaveOneOut()\navg_score_LR(loo, X_combined, y_encoded_df) '

In [174]:
def run_model_DT(X,y):
    model = DecisionTreeClassifier().fit(X,y)
    return model

In [175]:
def evaluate_model_DT(X,y, model):
    y_pred = model.predict(X)
    return f1_score(y, y_pred, average='macro')

In [176]:
# DO IT
def avg_score_DT(method,X,y):
    score_train = []
    score_test = []
    for train_index, test_index in method.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model = run_model_DT(X_train, y_train)
        value_train = evaluate_model_DT(X_train, y_train, model)
        value_test = evaluate_model_DT(X_test,y_test, model)
        score_train.append(value_train)
        score_test.append(value_test)

    print('Train:', np.mean(score_train))
    print('Test:', np.mean(score_test))

In [177]:
kf2 = KFold(n_splits=10) #if the splits are too many, poor efficiency
avg_score_DT(kf2, X_combined, y_encoded_df)

Train: 0.9999715872424831
Test: 0.3564713888449257


In [178]:
rkf2 = RepeatedKFold(n_splits=6, n_repeats=2)
avg_score_DT(rkf2, X_combined, y_encoded_df)

Train: 0.9999761794181835
Test: 0.3867963587253302


In [179]:
""" loo2 = LeaveOneOut()
avg_score_DT(loo2, X_combined, y_encoded_df) """

' loo2 = LeaveOneOut()\navg_score_DT(loo2, X_combined, y_encoded_df) '