In [150]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.linear_model import LassoCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, make_scorer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [151]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [152]:
train.set_index('Claim Identifier', inplace=True)
train.drop_duplicates(inplace=True)
train.drop('OIICS Nature of Injury Description', axis=1, inplace=True)

In [153]:
train['WCIO Part Of Body Code'] = train['WCIO Part Of Body Code'].apply(lambda x: 0 if x < 0 else x)
## IN DATE
date_cols = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date'] 
for col in date_cols:
    train[col] = pd.to_datetime(train[col], errors='coerce')
    test[col] = pd.to_datetime(test[col], errors='coerce')
    
# IN INT
int_cols = ['Age at Injury', 'Birth Year', 'IME-4 Count', 'Number of Dependents']
for col in int_cols:
    train[col] = train[col].astype('Int64')

In [154]:
train.dropna(subset=['Claim Injury Type'], inplace=True)

In [155]:
train['IME-4 Count'] = train['IME-4 Count'].fillna(0)

In [156]:
float_to_object = ['Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']
train[float_to_object] = train[float_to_object].astype('object')

In [157]:
train.drop(columns=['Birth Year', 'Age at Injury', 'Number of Dependents', 'WCIO Cause of Injury Code'], inplace=True)

In [158]:
pd.set_option('display.max_columns', None)

In [159]:
train.head(2)

Unnamed: 0_level_0,Accident Date,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
5393875,2019-12-30,N,2020-01-01,N,0.0,2019-12-31,NaT,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,NaT,M,0,44.0,RETAIL TRADE,I,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662,0.0,Not Work Related
5393091,2019-08-30,N,2020-01-01,Y,1745.93,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,4. TEMPORARY,WYOMING,N,ROCHESTER,2020-02-21,F,4,23.0,CONSTRUCTION,I,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569,1.0,Not Work Related


In [160]:
train = train.drop(columns=train.filter(like='Date').columns)

In [161]:
X = train.drop('Claim Injury Type', axis=1)
y = train['Claim Injury Type']  

In [162]:
kf = KFold(n_splits=10) #if the splits are too many, poor efficiency
rkf = RepeatedKFold(n_splits=6, n_repeats=2)  
skf = StratifiedKFold(n_splits=10)  
normal_split = None

In [163]:
def split_data(X, y, method=None):
    if method is None:
        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, 
                                                random_state = 0, 
                                                stratify = y, 
                                                shuffle = True)
    elif isinstance(method, StratifiedKFold):
        for train_index, test_index in method.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        for train_index, test_index in method.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    X_train_num = X_train.select_dtypes(include=np.number)
    X_test_num = X_test.select_dtypes(include=np.number)
    X_train_cat = X_train.select_dtypes(exclude=np.number)
    X_test_cat = X_test.select_dtypes(exclude=np.number)

    return X_train_num, X_test_num, X_train_cat, X_test_cat, y_train, y_test

In [164]:
def imputing(X_train_num, X_test_num, X_train_cat, X_test_cat):
    # Numéricos: Imputação com média
    num_imputer = SimpleImputer(strategy="mean")
    X_train_num = pd.DataFrame(num_imputer.fit_transform(X_train_num), columns=X_train_num.columns)
    X_test_num = pd.DataFrame(num_imputer.transform(X_test_num), columns=X_test_num.columns)

    # Categóricos: Imputação com moda
    cat_imputer = SimpleImputer(strategy="most_frequent")
    X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train_cat), columns=X_train_cat.columns)
    X_test_cat = pd.DataFrame(cat_imputer.transform(X_test_cat), columns=X_test_cat.columns)

    return X_train_num, X_test_num, X_train_cat, X_test_cat

In [165]:
# Function to calculate IQR and identify outliers for a specific column
def identify_outliers_iqr_column(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
    return outliers

In [166]:
def outliers(X_train_num, y_train):
    not_voluntary = X_train_num['Average Weekly Wage'] != 0
    not_voluntary_df = X_train_num[not_voluntary]
    
    outliers_mask = identify_outliers_iqr_column(not_voluntary_df, 'Average Weekly Wage')
    outliers_indices = not_voluntary_df[outliers_mask].index

    X_train_num = X_train_num.drop(index=outliers_indices, errors='ignore')
    y_train = y_train.drop(index=outliers_indices, errors='ignore')
    
    return X_train_num, y_train

In [167]:
min_max = MinMaxScaler()
min_max2 = MinMaxScaler(feature_range=(-1, 1))
standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

In [168]:
def scaling(X_train_num, X_test_num, scaler):
    scaler.fit(X_train_num)  # Ajusta o escalonador aos dados de treino
    X_train_num_scaled = pd.DataFrame(scaler.transform(X_train_num), columns=X_train_num.columns)
    X_test_num_scaled = pd.DataFrame(scaler.transform(X_test_num), columns=X_test_num.columns)

    return X_train_num_scaled, X_test_num_scaled

In [169]:
oneHot = OneHotEncoder()
ordinal = OrdinalEncoder()
label = LabelEncoder()

In [170]:
def encoding_independent(X_train_cat, X_test_cat, encoder):
    X_train_cat = X_train_cat.astype(str)
    X_test_cat = X_test_cat.astype(str)
    
    encoder.fit(X_train_cat)
    X_train_cat_encoded = pd.DataFrame(
        encoder.transform(X_train_cat), columns=encoder.get_feature_names_out()
    )
    X_test_cat_encoded = pd.DataFrame(
        encoder.transform(X_test_cat), columns=encoder.get_feature_names_out()
    )

    return X_train_cat_encoded, X_test_cat_encoded

In [171]:
def encoding_dependent(y_train, y_test, encoder):
    encoder.fit(y_train)
    y_train_encoded = pd.Series(encoder.transform(y_train))
    y_test_encoded = pd.Series(encoder.transform(y_test))

    return y_train_encoded, y_test_encoded

In [172]:
def run_model(X,y, model):
    return model.fit(X,y)

In [173]:
def evaluate_model(X,y, model):
    y_pred = model.predict(X)
    return f1_score(y, y_pred, average='macro')

In [174]:
def pipeline(X, y, method, scaler, encoder_independent, encoder_dependent, model):
    X_train_num, X_test_num, X_train_cat, X_test_cat, y_train, y_test = split_data(X, y, method)
    print("Split data OK.")
    X_train_num, X_test_num, X_train_cat, X_test_cat = imputing(X_train_num, X_test_num, X_train_cat, X_test_cat)
    print("Imputing OK.")
    X_train_num, y_train = outliers(X_train_num, y_train)
    print("Outliers OK.")
    X_train_num_scaled, X_test_num_scaled = scaling(X_train_num, X_test_num, scaler)
    print("Scaling OK.")
    X_train_cat_encoded, X_test_cat_encoded = encoding_independent(X_train_cat, X_test_cat, encoder_independent)
    print("Encoding independent OK.")
    y_train_encoded, y_test_encoded = encoding_dependent(y_train, y_test, encoder_dependent)
    print("Encoding dependent OK.")

    X_train = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis=1)
    X_test = pd.concat([X_test_num_scaled, X_test_cat_encoded], axis=1)
    print("Concatenating OK.")

    model = run_model(X_train, y_train_encoded, model)
    print("Model OK.")
    f1 = evaluate_model(X_test, y_test_encoded, model)
    print("Evaluation OK.")

    return f1

In [175]:
print('Decision Tree')#0.99/0.39
dt = DecisionTreeClassifier()
result = pipeline(X, y, skf, robust_scaler, ordinal, label, dt)
print(f"F1 Score: {result}")

Decision Tree
Split data OK.
Imputing OK.
Outliers OK.
Scaling OK.


ValueError: Found unknown categories ['TRAVELERS CASUALTY AND SURETY', 'NATIONAL FUEL GAS SUPP CORP', 'SECURITY INS CO OF HARTFORD', 'MID-HUDSON VALLEY STAFFCO, LL', 'BOCES DISTRICT OF ORANGE &', 'NATIONWIDE MUTUAL INSURANCE', 'PENN MILLERS INSURANCE CO', 'GREENPORT UFSD', 'GEORGIA PACIFIC CORPORATION', 'NATIONAL FUEL GAS DIST CORP', 'WHEATLAND CHILI CENTRAL', 'DUNDEE CENTRAL SCHOOL DIST', 'NISKAYUNA TOWN OF', 'KEENE CENTRAL SCHOOL DIST', 'KESHEQUA CSD', 'GENERAL BROWN CSD', 'MOUNTAIN VALLEY INDEMNITY CO', 'NATIONWIDE ASSURANCE COMPANY', 'POWER AUTHORITY OF THE STATE', 'HERMON-DEKALB CENTRAL', 'CHAZY CENTRAL SCHOOL DISTRICT', 'MIDWEST EMPLOYERS CAS. CO.', 'EDMESTON CENTRAL', 'GALWAY CENTRAL SCHOOL DIST', 'PHELPS-CLIFTON SPRINGS CENTRAL', 'MARATHON CENTRAL SCHOOL DIST', 'GREENE COUNTY', 'CAMBRIDGE CENTRAL SCHOOL', 'BROOKFIELD CENTRAL SCHOOL'] in column 2 during transform

In [None]:
""" from xgboost import XGBClassifier #0.64/0.45
X_train, X_test, y_train, y_test = train_test_split(X_robust,y_encoded_df, test_size = 0.3, 
                                                  random_state = 0, 
                                                  stratify = y_encoded_df, 
                                                  shuffle = True)
xg = XGBClassifier(
            n_estimators=250,
            learning_rate=0.1,
            max_depth=6,
            random_state=42,
            n_jobs=2,
            tree_method='hist',
            enable_categorical=True,
            objective='multi:softprob',
            num_class=8,
            eval_metric=['mlogloss', 'merror'],
            use_label_encoder=False
        )

# Train with early stopping
eval_set = [(X_train, y_train)]
xg.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=False
)
value_train = evaluate_model(X_train, y_train, xg)
value_test = evaluate_model(X_test, y_test, xg)
print('Train:', value_train)
print('Test:', value_test) """

" from xgboost import XGBClassifier #0.64/0.45\nX_train, X_test, y_train, y_test = train_test_split(X_robust,y_encoded_df, test_size = 0.3, \n                                                  random_state = 0, \n                                                  stratify = y_encoded_df, \n                                                  shuffle = True)\nxg = XGBClassifier(\n            n_estimators=250,\n            learning_rate=0.1,\n            max_depth=6,\n            random_state=42,\n            n_jobs=2,\n            tree_method='hist',\n            enable_categorical=True,\n            objective='multi:softprob',\n            num_class=8,\n            eval_metric=['mlogloss', 'merror'],\n            use_label_encoder=False\n        )\n\n# Train with early stopping\neval_set = [(X_train, y_train)]\nxg.fit(\n    X_train, y_train,\n    eval_set=eval_set,\n    verbose=False\n)\nvalue_train = evaluate_model(X_train, y_train, xg)\nvalue_test = evaluate_model(X_test, y_test, xg)\npri

In [None]:
""" print('MLP')
mlp = MLPClassifier()
avg_f1_score(X_robust, y_encoded_df, mlp, rkf) """

" print('MLP')\nmlp = MLPClassifier()\navg_f1_score(X_robust, y_encoded_df, mlp, rkf) "

In [None]:
""" print('Naive Bayes') #0.37/0.32
cnb = CategoricalNB(alpha= 0.1)
gnb = GaussianNB(var_smoothing=1e-7)
model_cat = run_model(X_train_cat_encoded, y_train_encoded, cnb)
model_num = run_model(X_train_num_scaled, y_train_encoded, gnb)
# Obter probabilidades de previsão
prob_cat_train = model_cat.predict_proba(X_train_cat_encoded)
prob_num_train = model_num.predict_proba(X_train_num_scaled)
prob_cat_test = model_cat.predict_proba(X_test_cat_encoded)
prob_num_test = model_num.predict_proba(X_test_num_scaled)
# Combinar probabilidades (média)
prob_combined_train = (prob_cat_train + prob_num_train) / 2
prob_combined_test = (prob_cat_test + prob_num_test) / 2
#prob_combined_train = (0.7 * prob_cat_train + 0.3 * prob_num_train)
#prob_combined_test = (0.7 * prob_cat_test + 0.3 * prob_num_test)
# Predizer classe final
y_pred_combined_train = np.argmax(prob_combined_train, axis=1)
y_pred_combined_test = np.argmax(prob_combined_test, axis=1)
# Avaliar o modelo combinado
print('Train:', f1_score(y_train_encoded, y_pred_combined_train, average='macro'))# y_test_num ou y_test_cat são os mesmos
print('Test:', f1_score(y_test_encoded, y_pred_combined_test, average='macro')) """

Naive Bayes


IndexError: index 5 is out of bounds for axis 1 with size 5