In [113]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold

from scipy.stats import chi2_contingency
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

from sklearn.decomposition import PCA

from sklearn.linear_model import LassoCV
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [114]:
pd.set_option('display.max_columns', None)

## Explore the dataset

In [115]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [116]:
train.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Identifier,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
0,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,5393875,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,,M,,44.0,RETAIL TRADE,I,,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
1,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,5393091,4. TEMPORARY,WYOMING,N,ROCHESTER,2020-02-21,F,4.0,23.0,CONSTRUCTION,I,,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
2,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,1A. PRIVATE,5393889,4. TEMPORARY,ORANGE,N,ALBANY,,M,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMEN...,II,,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
3,,,,2020-01-01,,,,,,,,957648180,,,,,,,,,,,,,,,,,,,,,
4,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,2A. SIF,5393887,2. NON-COMP,DUTCHESS,N,ALBANY,,M,,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,II,,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0


In [117]:
test.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Identifier,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents
0,2022-12-24,19,N,2023-01-02,N,,2003.0,2023-01-02,,INDEMNITY INSURANCE CO OF,1A. PRIVATE,6165911,BRONX,N,NYC,,M,,48.0,TRANSPORTATION AND WAREHOUSING,IV,,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,54.0,LOWER LEG,10466,1
1,2022-11-20,19,N,2023-01-02,N,,2003.0,2023-01-02,,A I U INSURANCE COMPANY,1A. PRIVATE,6166141,QUEENS,N,NYC,,F,,45.0,RETAIL TRADE,IV,,75.0,FALLING OR FLYING OBJECT,10.0,CONTUSION,10.0,MULTIPLE HEAD INJURY,11691,1
2,2022-12-26,59,N,2023-01-02,N,0.0,1963.0,2022-12-31,,AMGUARD INSURANCE COMPANY,1A. PRIVATE,6165907,WESTCHESTER,N,NYC,,F,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMEN...,III,,68.0,STATIONARY OBJECT,49.0,SPRAIN OR TEAR,62.0,BUTTOCKS,10604,0
3,2022-12-28,55,N,2023-01-02,N,0.0,0.0,2023-01-02,,INDEMNITY INS. OF N AMERICA,1A. PRIVATE,6166047,QUEENS,N,NYC,,F,,48.0,TRANSPORTATION AND WAREHOUSING,IV,,25.0,FROM DIFFERENT LEVEL (ELEVATION),10.0,CONTUSION,53.0,KNEE,11411,6
4,2022-12-20,25,N,2023-01-02,N,0.0,1997.0,2022-12-31,,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,6166102,KINGS,N,NYC,,M,,55.0,MANAGEMENT OF COMPANIES AND ENTERPRISES,IV,,79.0,OBJECT BEING LIFTED OR HANDLED,40.0,LACERATION,37.0,THUMB,11212,5


## Modify the data

In [118]:
train.set_index('Claim Identifier', inplace=True)
test.set_index('Claim Identifier', inplace=True)
train.drop_duplicates(inplace=True)

In [119]:
train.dropna(subset=['Claim Injury Type'], inplace=True)

column with all nan

In [120]:
train.drop(columns=['OIICS Nature of Injury Description', 'WCB Decision', 'Carrier Name'], inplace=True)
test.drop(columns=['OIICS Nature of Injury Description', 'Carrier Name'], inplace=True)

In [121]:
train['WCIO Part Of Body Code'] = train['WCIO Part Of Body Code'].apply(lambda x: 0 if x < 0 else x)
test['WCIO Part Of Body Code'] = test['WCIO Part Of Body Code'].apply(lambda x: 0 if x < 0 else x)
## IN DATE
date_cols = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date'] 
for col in date_cols:
    # Convert to datetime
    train[col] = pd.to_datetime(train[col], errors='coerce')
    test[col] = pd.to_datetime(test[col], errors='coerce')
    
    # Extract year, month, and day
    train[f'{col}_Year'] = train[col].dt.year
    train[f'{col}_Month'] = train[col].dt.month
    train[f'{col}_Day'] = train[col].dt.day
    
    test[f'{col}_Year'] = test[col].dt.year
    test[f'{col}_Month'] = test[col].dt.month
    test[f'{col}_Day'] = test[col].dt.day
train.drop(columns=date_cols, inplace=True)
test.drop(columns=date_cols, inplace=True)
    
# IN INT
def to_int(train):
    int_cols = ['Age at Injury', 'Birth Year', 'IME-4 Count', 'Number of Dependents']
    for col in int_cols:
        train[col] = train[col].astype('int64')
    return train

# IN OBJECT
float_to_object = ['Industry Code', 'WCIO Cause of Injury Code', 'WCIO Nature of Injury Code', 'WCIO Part Of Body Code']
train[float_to_object] = train[float_to_object].astype('object')

In [122]:
# classifiy them as Non-US residents
train['Zip Code'] = train['Zip Code'].apply(
    lambda x: x[:2] if isinstance(x, str) and len(x) == 5 and x.isdigit() else ('Non-US Resident' if pd.notna(x) else np.nan)
)
test['Zip Code'] = test['Zip Code'].apply(
    lambda x: x[:2] if isinstance(x, str) and len(x) == 5 and x.isdigit() else ('Non-US Resident' if pd.notna(x) else np.nan)
)

In [123]:
#zip codes that start with 1 come from NY state - where the data set is based
# we decide to divide those that are from NY from those that even though are US residents, are not from NY
train['Zip Code'] = np.where(
    (train['Zip Code'] != 'Unknown') & 
    (train['Zip Code'] != 'Non-US Resident') & 
    train['Zip Code'].notna() & 
    train['Zip Code'].str.startswith('1'), 
    'NY Resident', 
    np.where(
        (train['Zip Code'] != 'Unknown') & 
        (train['Zip Code'] != 'Non-US Resident') & 
        train['Zip Code'].notna(), 
        'non-NY US Residents', 
        train['Zip Code']
    )
)
test['Zip Code'] = np.where(
    (test['Zip Code'] != 'Unknown') & 
    (test['Zip Code'] != 'Non-US Resident') & 
    test['Zip Code'].notna() & 
    test['Zip Code'].str.startswith('1'), 
    'NY Resident', 
    np.where(
        (test['Zip Code'] != 'Unknown') & 
        (test['Zip Code'] != 'Non-US Resident') & 
        test['Zip Code'].notna(), 
        'non-NY US Residents', 
        test['Zip Code']
    )
)


print(train['Zip Code'].value_counts())
print() 
print('NaN:', train['Zip Code'].isna().sum())

Zip Code
NY Resident            503921
non-NY US Residents     26093
Non-US Resident         15374
Name: count, dtype: int64

NaN: 28637


### Spliting the data

In [124]:
def split_data(X, y, method=None):
    splits = []
    if method is None:
        X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, 
                                                random_state = 0, 
                                                stratify = y, 
                                                shuffle = True)
        splits.append((X_train, X_test, y_train, y_test))
    elif isinstance(method, StratifiedKFold):
        for train_index, test_index in method.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            splits.append((X_train, X_test, y_train, y_test))
    else:
        for train_index, test_index in method.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            splits.append((X_train, X_test, y_train, y_test))

    processed_splits = []
    for X_train, X_test, y_train, y_test in splits:
        X_train_num = X_train.select_dtypes(include=np.number)
        X_test_num = X_test.select_dtypes(include=np.number)
        X_train_cat = X_train.select_dtypes(exclude=np.number)
        X_test_cat = X_test.select_dtypes(exclude=np.number)
        processed_splits.append((X_train_num, X_test_num, X_train_cat, X_test_cat, y_train, y_test))

    return processed_splits

### Impute missing values

In [125]:
def imputing(X_train_num, X_test_num, X_train_cat, X_test_cat):
    #Using median for numerical data
    num_imputer = SimpleImputer(strategy="median")
    X_train_num = pd.DataFrame(num_imputer.fit_transform(X_train_num), columns=X_train_num.columns)
    X_test_num = pd.DataFrame(num_imputer.transform(X_test_num), columns=X_test_num.columns)

    #Using most frequent for categorical data
    cat_imputer = SimpleImputer(strategy="most_frequent")
    X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train_cat), columns=X_train_cat.columns)
    X_test_cat = pd.DataFrame(cat_imputer.transform(X_test_cat), columns=X_test_cat.columns)

    return X_train_num, X_test_num, X_train_cat, X_test_cat

In [126]:
# Function to calculate IQR and identify outliers for a specific column
def identify_outliers_iqr_column(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
    return outliers

In [127]:
def outliers(X_train_num, y_train):
    not_voluntary = X_train_num['Average Weekly Wage'] != 0
    not_voluntary_df = X_train_num[not_voluntary]
    
    outliers_mask = identify_outliers_iqr_column(not_voluntary_df, 'Average Weekly Wage')
    outliers_indices = not_voluntary_df[outliers_mask].index

    X_train_num = X_train_num.drop(index=outliers_indices, errors='ignore')
    y_train = y_train.drop(index=outliers_indices, errors='ignore')
    
    return X_train_num, y_train

### Scaling

In [128]:
def scaling(X_train, X_test, scaler):
    scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns).set_index(X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns).set_index(X_test.index)

    return X_train_scaled, X_test_scaled

### Feature Selection

if var == 0 then drop

In [129]:
def variance(X_train, threshold, return_variances=False):
    variances = X_train.var()
    low_variance_cols = variances[variances == threshold].index.tolist()
    if return_variances:
        return low_variance_cols, variances.to_dict()
    return low_variance_cols

spearman correlation

In [130]:
def high_correlated_vars(X_train, threshold):
    cor_spearman = X_train.corr(method='spearman')
    correlated_pairs = []
    for i in range(len(cor_spearman.columns)):
        for j in range(i):
            correlation = cor_spearman.iloc[i, j]
            if abs(correlation) >= threshold:
                correlated_pairs.append({
                    "feature_1": cor_spearman.columns[i],
                    "feature_2": cor_spearman.columns[j],
                    "correlation": correlation
                })
    return correlated_pairs

chi square

In [131]:
def test_independence(x,y,alpha=0.05):        
    dfObserved = pd.crosstab(y,x) 
    if dfObserved.empty:
        print(f"Skipping column {x.name} due to empty observed table.")
        return None
    if x.nunique() <= 1:
        print(f"Skipping column {x.name} as it has <= 1 unique value.")
        return None
    chi2, p, dof, expected = stats.chi2_contingency(dfObserved.values)
    is_important = p < alpha
    result = {
        "feature": x.name,
        "p_value": p,
        "chi2_stat": chi2,
        "is_important": is_important
    }
    return result

In [132]:
def chi_square(X_train, y, alpha=0.05):
    if X_train.empty or y.empty:
        raise ValueError("X_train or y is empty.")
    if len(y.unique()) < 2:
        raise ValueError("y must have at least two unique classes.")
    results = []
    for var in X_train.columns:
        test_result = test_independence(X_train[var], y, alpha)
        if test_result is None:
            print("Deu none")
        results.append(test_result)
    
    results_df = pd.DataFrame(results)
    not_important_features = results_df[~results_df["is_important"]]["feature"].tolist()
    
    return results_df, not_important_features

relation with the dependent variable

In [133]:
# def bar_charts_categorical(df, feature, target):
#     cont_tab = pd.crosstab(df[feature], df[target], margins=True)
#     categories = cont_tab.index[:-1]
#     target_categories = cont_tab.columns[:-1]
    
#     fig = plt.figure(figsize=(15, 5))
    
#     plt.subplot(121)
#     bottom = np.zeros(len(categories))
#     colors = plt.cm.tab20.colors  # Use a colormap for different colors
#     bars = []
#     for i, target_cat in enumerate(target_categories):
#         bar = plt.bar(categories, cont_tab.iloc[:-1, i].values, 0.55, bottom=bottom, color=colors[i % len(colors)])
#         bars.append(bar[0])
#         bottom += cont_tab.iloc[:-1, i].values
#     plt.legend(bars, [f'$y_i={cat}$' for cat in target_categories])
#     plt.title("Frequency bar chart")
#     plt.xlabel(feature)
#     plt.ylabel("$Frequency$")

#     # auxiliary data for 122
#     obs_pct = np.array([np.divide(cont_tab.iloc[:-1, i].values, cont_tab.iloc[:-1, -1].values) for i in range(len(target_categories))])
    
#     plt.subplot(122)
#     bottom = np.zeros(len(categories))
#     bars = []
#     for i, target_cat in enumerate(target_categories):
#         bar = plt.bar(categories, obs_pct[i], 0.55, bottom=bottom, color=colors[i % len(colors)])
#         bars.append(bar[0])
#         bottom += obs_pct[i]
#     plt.legend(bars, [f'$y_i={cat}$' for cat in target_categories])
#     plt.title("Proportion bar chart")
#     plt.xlabel(feature)
#     plt.ylabel("$p$")

#     plt.show()

# def plot_and_test_correlation(df, target):
#     for feature in df.select_dtypes(include='object').columns:
#         print(f"Generating bar charts for {feature}...")
#         bar_charts_categorical(df, feature, target)

# plot_and_test_correlation(train, 'Claim Injury Type')

rfe

In [134]:
def select_optimal_features_rfe(X_train, y_train, X_val, y_val, model, scoring_function=None):
    if scoring_function is None:
        scoring_function = lambda model, X, y: model.score(X, y)

    nof_list=np.arange(1, X_train.shape[1]+1)
    high_score = 0
    nof = 0
    train_score_list = []
    val_score_list = []

    for n in nof_list:
        rfe = RFE(estimator=model, n_features_to_select=n)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_val_rfe = rfe.transform(X_val)
        model.fit(X_train_rfe, y_train)

        # Storing results on training data
        train_score = scoring_function(model, X_train_rfe, y_train)
        train_score_list.append(train_score)

        # Storing results on validation data
        val_score = scoring_function(model, X_val_rfe, y_val)
        val_score_list.append(val_score)

        # Check best score
        if val_score >= high_score:
            high_score = val_score
            nof = n

    # Fit RFE with the optimal number of features
    rfe = RFE(estimator=model, n_features_to_select=nof)
    rfe.fit(X_train, y_train)
    selected_features = X_train.columns[rfe.support_].tolist()

    return selected_features, train_score_list, val_score_list

embedded methods

In [135]:
def select_best_features_embedded(X_train, y_train, model, threshold=None):
    # Fit the model
    model.fit(X_train, y_train)
    
    # Get the coefficients or feature importances
    if hasattr(model, 'coef_'):
        if model.coef_.ndim > 1:
            coef = pd.Series(model.coef_.mean(axis=0), index=X_train.columns)
        else:
            coef = pd.Series(model.coef_, index=X_train.columns)
    elif hasattr(model, 'feature_importances_'):
        coef = pd.Series(model.feature_importances_, index=X_train.columns)
    else:
        raise ValueError("The model does not have coef_ or feature_importances_ attributes")
    
    if threshold is not None:
        selected_features = coef[coef.abs() > threshold].index.tolist()
    else:
        selected_features = coef[coef != 0].index.tolist()
    
    return selected_features, coef[selected_features]

### Reducing Cardinality

In [136]:
def reduce_cardinality(df, threshold=10, other_label='Other'):
    for col in df.select_dtypes(include='object').columns:
        value_counts = df[col].value_counts()
        frequent_values = value_counts[value_counts > threshold].index
        df[col] = df[col].apply(lambda x: x if x in frequent_values else other_label)
    return df

### Encoding

In [137]:
def encoding_independent(X_train, X_test, encoder):
    X_train = X_train.astype(str)
    X_test = X_test.astype(str)
    
    encoder.fit(X_train)
    X_train_encoded = encoder.transform(X_train) 
    X_test_encoded = encoder.transform(X_test)

    if isinstance(encoder, OneHotEncoder):
        feature_names = encoder.get_feature_names_out(X_train.columns)
        X_train_encoded = pd.DataFrame(X_train_encoded, columns=feature_names, index=X_train.index)
        X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names, index=X_test.index)
    else:
        X_train_encoded = pd.DataFrame(X_train_encoded, columns=X_train.columns, index=X_train.index)
        X_test_encoded = pd.DataFrame(X_test_encoded, columns=X_test.columns, index=X_test.index)
    
    return X_train_encoded, X_test_encoded

In [138]:
def encoding_dependent(y_train, y_test, encoder):
    encoder.fit(y_train)
    y_train_encoded = pd.Series(encoder.transform(y_train))
    y_test_encoded = pd.Series(encoder.transform(y_test))

    return y_train_encoded, y_test_encoded

### Balancing Classes

In [139]:
def balance_data(X, y, method='oversample'):
    if method == 'oversample':
        sampler = RandomOverSampler(random_state=42)
    elif method == 'undersample':
        sampler = RandomUnderSampler(random_state=42)
    elif method == 'smote':
        sampler = SMOTEENN(random_state=42)
    else:
        raise ValueError("Method should be 'oversample', 'undersample', or 'smote'")
    
    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled

### PCA

In [140]:
def apply_pca(X_train, X_test, n_components):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    pca_feat_names = [f'PC{i}' for i in range(n_components)]

    X_train_pca = pd.DataFrame(X_train_pca, index=X_train.index, columns=pca_feat_names)
    X_test_pca = pd.DataFrame(X_test_pca, index=X_test.index, columns=pca_feat_names)
    
    return X_train_pca, X_test_pca, pca_feat_names

## Modelling and Evaluating

In [141]:
def run_model(X,y, model):
    return model.fit(X, y)

In [142]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

def evaluate_model(X, y, model, is_classification=True):
    predictions = model.predict(X)
    if is_classification:
        return classification_report(y, predictions)
    else:
        mse = mean_squared_error(y, predictions)
        r2 = r2_score(y, predictions)
        return {
            'mean_squared_error': mse,
            'r2_score': r2
        }

In [143]:
""" def pipeline(X, y, method, scaler, encoder_independent, encoder_dependent, model): #, balance_method
    splits = split_data(X, y, method)
    print("Split data OK.")

    # Initialize results storage for each split
    results = {}

    for i, (X_train_num, X_test_num, X_train_cat, X_test_cat, y_train, y_test) in enumerate(splits):
        X_train_num, X_test_num, X_train_cat, X_test_cat = imputing(X_train_num, X_test_num, X_train_cat, X_test_cat)
        print(f"Imputing OK for split {i + 1}.")

        X_train_num = to_int(X_train_num)
        X_test_num = to_int(X_test_num)
        
        # X_train_num, y_train = outliers(X_train_num, y_train)
        
        X_train_num_scaled, X_test_num_scaled = scaling(X_train_num, X_test_num, scaler)
        print(f"Scaling OK for split {i + 1}.")
    
        # unique_counts = X_train_cat.nunique()
        # X_train_cat.drop(columns=unique_counts[unique_counts == 1].index, inplace=True)
        # X_test_cat.drop(columns=unique_counts[unique_counts == 1].index, inplace=True)

        X_train_cat = reduce_cardinality(X_train_cat)
        X_test_cat = reduce_cardinality(X_test_cat)
        print(f"Reducing cardinality OK for split {i + 1}.")
        
        X_train_cat_encoded, X_test_cat_encoded = encoding_independent(X_train_cat, X_test_cat, encoder_independent)
        print(f"Encoding independent OK for split {i + 1}.")
        
        y_train_encoded, y_test_encoded = encoding_dependent(y_train, y_test, encoder_dependent)
        print(f"Encoding dependent OK for split {i + 1}.")
        
        X_train = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis=1)
        X_test = pd.concat([X_test_num_scaled, X_test_cat_encoded], axis=1)
        print(f"Concatenating OK for split {i + 1}.")

        # if balance_method in ['oversample', 'undersample', 'smote']:
        #     X_train, y_train_encoded = balance_data(X_train, y_train_encoded, method=balance_method)
        #     print(f"Balancing OK for split {i + 1}.")
        
        model = run_model(X_train, y_train_encoded, model)
        print(f"Model fitted OK for split {i + 1}.")

        is_classification = not isinstance(model, LassoCV)
        
        results[f"split_{i + 1}"] = evaluate_model(X_test, y_test_encoded, model, is_classification)
        print(f"Evaluation OK for split {i + 1}.")
        
    
    return results """

' def pipeline(X, y, method, scaler, encoder_independent, encoder_dependent, model): #, balance_method\n    splits = split_data(X, y, method)\n    print("Split data OK.")\n\n    # Initialize results storage for each split\n    results = {}\n\n    for i, (X_train_num, X_test_num, X_train_cat, X_test_cat, y_train, y_test) in enumerate(splits):\n        X_train_num, X_test_num, X_train_cat, X_test_cat = imputing(X_train_num, X_test_num, X_train_cat, X_test_cat)\n        print(f"Imputing OK for split {i + 1}.")\n\n        X_train_num = to_int(X_train_num)\n        X_test_num = to_int(X_test_num)\n        \n        # X_train_num, y_train = outliers(X_train_num, y_train)\n        \n        X_train_num_scaled, X_test_num_scaled = scaling(X_train_num, X_test_num, scaler)\n        print(f"Scaling OK for split {i + 1}.")\n    \n        # unique_counts = X_train_cat.nunique()\n        # X_train_cat.drop(columns=unique_counts[unique_counts == 1].index, inplace=True)\n        # X_test_cat.drop(colu

In [144]:
""" X = train.drop('Claim Injury Type', axis=1)
y = train['Claim Injury Type']

# Define configurations
cv_methods = [None] #, KFold(n_splits=10), RepeatedKFold(n_splits=6, n_repeats=2), StratifiedKFold(n_splits=10)
scalers = [StandardScaler()] #, MinMaxScaler()
encoders = [OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore')] # , OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
#balance_methods = ['oversample', 'undersample', 'smote']
models = [
    ("DecisionTree", DecisionTreeClassifier()),
    ("LogisticRegression", LogisticRegression(class_weight='balanced', max_iter=1000)),
    ("Lasso", LassoCV())
]

# Results storage
results = []

# Iterate through combinations
for scaler in scalers:
    for encoder in encoders:
        for model_name, model in models:
            for cv_method in cv_methods:
                #for balance_method in balance_methods:
                    if model_name in ["LogisticRegression", "Lasso"] and isinstance(encoder, OrdinalEncoder):
                        continue  # Skip OrdinalEncoder for LogisticRegression and Lasso
                    # if model_name == "LogisticRegression":
                    #     balance_method = None  # LogisticRegression does not need balancing
                    
                    # Apply pipeline steps (e.g., split, scale, encode, model)
                    try:
                        pipeline_result = pipeline(X, y, cv_method, scaler, encoder, LabelEncoder(), model) #, balance_method
                        results.append({
                            "scaler": scaler.__class__.__name__,
                            "encoder": encoder.__class__.__name__,
                            "model": model_name,
                            "cv_method": "TrainTestSplit" if cv_method is None else cv_method.__class__.__name__,
                            #"balance_method": balance_method if balance_method else 'none',
                            "results": pipeline_result
                        })
                        print("========================================")
                        print(f"Scaler: {scaler.__class__.__name__}")
                        print(f"Encoder: {encoder.__class__.__name__}")
                        print(f"Model: {model_name}")
                        print(f"Cross-Validation Method: {'TrainTestSplit' if cv_method is None else cv_method.__class__.__name__}")
                        #print(f"Balancing Method: {balance_method if balance_method else 'none'}")
                        print("Results:")

                        for split, split_result in pipeline_result.items():
                            print(f"  {split}:")
                            if isinstance(split_result, dict):  # For multiple metrics per fold
                                for metric, value in split_result.items():
                                    print(f"    {metric.upper()}:")
                                    if isinstance(value, str):  # Classification reports
                                        print(value)
                                    else:  # Other metrics (e.g., accuracy, F1-score)
                                        print(f"      Value: {value}")
                            elif isinstance(split_result, str):  # If it's just a string, like a report
                                print(split_result)
                            else:  # Unexpected format
                                print(f"    Unexpected format: {split_result}")
                        print("========================================")

                    except Exception as e:
                        print(f"Error with {scaler}, {encoder}, {model_name}, {cv_method}: {e}") #, {balance_method}
 """



In [145]:
normal_split = None
kf = KFold(n_splits=10) #if the splits are too many, poor efficiency
rkf = RepeatedKFold(n_splits=6, n_repeats=2)  
#loo = LeaveOneOut() not good due the size of the dataset
skf = StratifiedKFold(n_splits=10)  #good for imbalanced datasets

min_max = MinMaxScaler()
min_max2 = MinMaxScaler(feature_range=(-1, 1))
standard = StandardScaler()
robust = RobustScaler()

oneHot = OneHotEncoder(sparse_output=False, drop="first")
ordinal = OrdinalEncoder()
label = LabelEncoder()

dt = DecisionTreeClassifier()#0.99/0.39 not use StratifiedKFold
#svc = SVC() to expensive for the dataset
lasso = LassoCV() #0.74/0.41
log_reg = LogisticRegression() #0.78/0.37

In [156]:
# Função para combinar GaussianNB e CategoricalNB
def hybrid_naive_bayes(X_num_train, X_cat_train, y_train, X_num_test, X_cat_test):
    # Treinar GaussianNB para dados numéricos
    gnb = GaussianNB()
    gnb.fit(X_num_train, y_train)
    
    # Treinar CategoricalNB para dados categóricos
    cnb = CategoricalNB()
    cnb.fit(X_cat_train, y_train)

    # Predizer probabilidades com ambos os modelos
    prob_gnb = gnb.predict_proba(X_num_test)
    prob_cnb = cnb.predict_proba(X_cat_test)

    # Combinar probabilidades multiplicando-as (assumindo independência)
    combined_prob = prob_gnb * prob_cnb

    # Retornar a classe com maior probabilidade combinada
    return np.argmax(combined_prob, axis=1)

In [None]:
X = train.drop('Claim Injury Type', axis=1)
y = train['Claim Injury Type']

# Perform PCA once on the entire training data
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_full_num = X_train_full.select_dtypes(include=np.number)
X_test_full_num = X_test_full.select_dtypes(include=np.number)
X_train_full_cat = X_train_full.select_dtypes(exclude=np.number)
X_test_full_cat = X_test_full.select_dtypes(exclude=np.number)
    
# Preprocess the full training data
X_train_full_num, X_test_full_num, X_train_full_cat, X_test_full_cat = imputing(X_train_full_num, X_test_full_num, X_train_full_cat, X_test_full_cat)
X_train_full_num = to_int(X_train_full_num)
X_test_full_num = to_int(X_test_full_num)
X_train_full_num_scaled, X_test_full_num_scaled = scaling(X_train_full_num, X_test_full_num, StandardScaler())
X_train_full_cat = reduce_cardinality(X_train_full_cat)
X_test_full_cat = reduce_cardinality(X_test_full_cat)
X_train_full_cat_encoded, X_test_full_cat_encoded = encoding_independent(X_train_full_cat, X_test_full_cat, OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore'))
y_train_full_encoded, y_test_full_encoded = encoding_dependent(y_train_full, y_test_full, LabelEncoder())
X_train_full = pd.concat([X_train_full_num_scaled, X_train_full_cat_encoded], axis=1)
X_test_full = pd.concat([X_test_full_num_scaled, X_test_full_cat_encoded], axis=1)

""" # Perform PCA
pca = PCA()
pca_feat = pca.fit_transform(X_train_full)

# Get PCA output as table
explained_variance = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance_ratio = np.cumsum(explained_variance_ratio)

# Combine into a dataframe
pca_results = pd.DataFrame(
    {
        "Eigenvalue": explained_variance,
        "Difference": np.insert(np.diff(explained_variance), 0, 0),
        "Proportion": explained_variance_ratio,
        "Cumulative": cumulative_explained_variance_ratio
    },
    index=range(1, pca.n_components_ + 1)
)

print(pca_results)

# figure and axes
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(200, 10))

# draw plots

ax1.plot(explained_variance, # PLOT THE EIGENVALUES (EXPLAINED VARIANCE)
        marker=".", markersize=12)

ax2.plot(explained_variance_ratio,  # PLOT THE EXPLAINED VARIANCE RATIO
        marker=".", markersize=12, label="Proportion")

ax2.plot(cumulative_explained_variance_ratio,  # PLOT THE CUMULATIVE EXPLAINED VARIANCE RATIO
        marker=".", markersize=12, linestyle="--", label="Cumulative")

# customizations
ax2.legend()
ax1.set_title("Scree Plot", fontsize=14)
ax2.set_title("Variance Explained", fontsize=14)
ax1.set_ylabel("Eigenvalue")
ax2.set_ylabel("Proportion")
ax1.set_xlabel("Components")
ax2.set_xlabel("Components")
ax1.set_xticks(range(0, pca.n_components_, 2))
ax1.set_xticklabels(range(1, pca.n_components_ + 1, 2))
ax2.set_xticks(range(0, pca.n_components_, 2))
ax2.set_xticklabels(range(1, pca.n_components_ + 1, 2))

plt.show() """

#choosing a pca with 30 components

X_train_pca, X_test_pca, pca_feat_names = apply_pca(X_train_full, X_test_full, n_components=30)
features = X_train_full.columns.tolist()

loadings = pd.DataFrame(
    np.dot(X_train_full.T, X_train_pca) / len(X_train_full),
    index=features,
    columns=pca_feat_names
)

def _color_red_or_green(val):
        if val < -0.45:
                color = 'background-color: #ffbdbd'
        elif val > 0.45:
                color = 'background-color: #b3ffcc'
        else:
                color = ''
        return color

# Interpreting each Principal Component
styled_loadings = loadings.style.applymap(_color_red_or_green)

# Display the styled loadings
#display(styled_loadings)

# Remove PCA components 14 to 29
components_to_remove = [f'PC{i}' for i in range(14, 30)]
X_train_pca.drop(columns=components_to_remove, inplace=True)
X_test_pca.drop(columns=components_to_remove, inplace=True)
pca_feat_names = [f'PC{i}' for i in range(1, 14)]  # Update pca_feat_names to reflect the remaining components

model = run_model(X_train_full, y_train_full_encoded, DecisionTreeClassifier(max_depth = 10))
model2 = run_model(X_train_full, y_train_full_encoded, DecisionTreeClassifier(max_depth = 20))
model3 = run_model(X_train_full, y_train_full_encoded, DecisionTreeClassifier(max_depth = 15))
model4 = run_model(X_train_full, y_train_full_encoded, DecisionTreeClassifier(min_samples_split = 100))
model5 = run_model(X_train_full, y_train_full_encoded, DecisionTreeClassifier(min_samples_split = 200)) 
model6 = run_model(X_train_full, y_train_full_encoded, DecisionTreeClassifier(min_samples_split = 500))
model7 = run_model(X_train_full, y_train_full_encoded, DecisionTreeClassifier(min_samples_split = 700))
#model_pca = run_model(X_train_pca, y_train_full_encoded, DecisionTreeClassifier(criterion = 'entropy'))

#is_classification = not isinstance(model, LassoCV)
        
results = evaluate_model(X_test_full, y_test_full_encoded, model)
results2 = evaluate_model(X_test_full, y_test_full_encoded, model2)
results3 = evaluate_model(X_test_full, y_test_full_encoded, model3)
results4 = evaluate_model(X_test_full, y_test_full_encoded, model4)
results5 = evaluate_model(X_test_full, y_test_full_encoded, model5)
results6 = evaluate_model(X_test_full, y_test_full_encoded, model6)
results7 = evaluate_model(X_test_full, y_test_full_encoded, model7)
#results_pca = evaluate_model(X_test_pca, y_test_full_encoded, model_pca)
print(results, results2, results3, results4, results5, results6, results7) #, results_pca
#print(results_pca)


""" # For Naive Bayes
# Realizar predição usando o modelo híbrido
predictions = hybrid_naive_bayes(
    X_train_full_num_scaled, X_train_full_cat_encoded, y_train_full_encoded,
    X_test_full_num_scaled, X_test_full_cat_encoded
)

# Função para avaliar o modelo
def evaluate_model_nb(y_true, y_pred, is_classification=True):
    if is_classification:
        return classification_report(y_true, y_pred)
    else:
        mse = mean_squared_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        return {
            'mean_squared_error': mse,
            'r2_score': r2
        }

# Avaliar o modelo (assumindo que "evaluate_model" retorna métricas relevantes)
results = evaluate_model_nb(y_test_full_encoded, predictions)
print(results) """


              precision    recall  f1-score   support

           0       0.64      0.51      0.57      2461
           1       0.85      0.98      0.91     58173
           2       0.47      0.05      0.08     13850
           3       0.72      0.91      0.80     29795
           4       0.67      0.43      0.53      9579
           5       0.17      0.00      0.00       835
           6       0.00      0.00      0.00        16
           7       0.43      0.10      0.17        96

    accuracy                           0.79    114805
   macro avg       0.49      0.37      0.38    114805
weighted avg       0.74      0.79      0.74    114805
               precision    recall  f1-score   support

           0       0.63      0.47      0.54      2461
           1       0.85      0.97      0.91     58173
           2       0.34      0.11      0.16     13850
           3       0.76      0.83      0.79     29795
           4       0.59      0.60      0.60      9579
           5       0.10 

' # For Naive Bayes\n# Realizar predição usando o modelo híbrido\npredictions = hybrid_naive_bayes(\n    X_train_full_num_scaled, X_train_full_cat_encoded, y_train_full_encoded,\n    X_test_full_num_scaled, X_test_full_cat_encoded\n)\n\n# Função para avaliar o modelo\ndef evaluate_model_nb(y_true, y_pred, is_classification=True):\n    if is_classification:\n        return classification_report(y_true, y_pred)\n    else:\n        mse = mean_squared_error(y_true, y_pred)\n        r2 = r2_score(y_true, y_pred)\n        return {\n            \'mean_squared_error\': mse,\n            \'r2_score\': r2\n        }\n\n# Avaliar o modelo (assumindo que "evaluate_model" retorna métricas relevantes)\nresults = evaluate_model_nb(y_test_full_encoded, predictions)\nprint(results) '

In [None]:
""" print('Naive Bayes') #0.37/0.32
cnb = CategoricalNB(alpha= 0.1)
gnb = GaussianNB(var_smoothing=1e-7)
model_cat = run_model(X_train_cat_encoded, y_train_encoded, cnb)
model_num = run_model(X_train_num_scaled, y_train_encoded, gnb)
# Obter probabilidades de previsão
prob_cat_train = model_cat.predict_proba(X_train_cat_encoded)
prob_num_train = model_num.predict_proba(X_train_num_scaled)
prob_cat_test = model_cat.predict_proba(X_test_cat_encoded)
prob_num_test = model_num.predict_proba(X_test_num_scaled)
# Combinar probabilidades (média)
prob_combined_train = (prob_cat_train + prob_num_train) / 2
prob_combined_test = (prob_cat_test + prob_num_test) / 2
#prob_combined_train = (0.7 * prob_cat_train + 0.3 * prob_num_train)
#prob_combined_test = (0.7 * prob_cat_test + 0.3 * prob_num_test)
# Predizer classe final
y_pred_combined_train = np.argmax(prob_combined_train, axis=1)
y_pred_combined_test = np.argmax(prob_combined_test, axis=1)
# Avaliar o modelo combinado
print('Train:', f1_score(y_train_encoded, y_pred_combined_train, average='macro'))# y_test_num ou y_test_cat são os mesmos
print('Test:', f1_score(y_test_encoded, y_pred_combined_test, average='macro')) """

" print('Naive Bayes') #0.37/0.32\ncnb = CategoricalNB(alpha= 0.1)\ngnb = GaussianNB(var_smoothing=1e-7)\nmodel_cat = run_model(X_train_cat_encoded, y_train_encoded, cnb)\nmodel_num = run_model(X_train_num_scaled, y_train_encoded, gnb)\n# Obter probabilidades de previsão\nprob_cat_train = model_cat.predict_proba(X_train_cat_encoded)\nprob_num_train = model_num.predict_proba(X_train_num_scaled)\nprob_cat_test = model_cat.predict_proba(X_test_cat_encoded)\nprob_num_test = model_num.predict_proba(X_test_num_scaled)\n# Combinar probabilidades (média)\nprob_combined_train = (prob_cat_train + prob_num_train) / 2\nprob_combined_test = (prob_cat_test + prob_num_test) / 2\n#prob_combined_train = (0.7 * prob_cat_train + 0.3 * prob_num_train)\n#prob_combined_test = (0.7 * prob_cat_test + 0.3 * prob_num_test)\n# Predizer classe final\ny_pred_combined_train = np.argmax(prob_combined_train, axis=1)\ny_pred_combined_test = np.argmax(prob_combined_test, axis=1)\n# Avaliar o modelo combinado\nprint('Tr

## Deploy

In [None]:
test

Unnamed: 0_level_0,Age at Injury,Alternative Dispute Resolution,Attorney/Representative,Average Weekly Wage,Birth Year,Carrier Type,County of Injury,COVID-19 Indicator,District Name,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents,Accident Date_Year,Accident Date_Month,Accident Date_Day,Assembly Date_Year,Assembly Date_Month,Assembly Date_Day,C-2 Date_Year,C-2 Date_Month,C-2 Date_Day,C-3 Date_Year,C-3 Date_Month,C-3 Date_Day,First Hearing Date_Year,First Hearing Date_Month,First Hearing Date_Day
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
6165911,19,N,N,,2003.0,1A. PRIVATE,BRONX,N,NYC,M,,48.0,TRANSPORTATION AND WAREHOUSING,IV,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,54.0,LOWER LEG,NY Resident,1,2022.0,12.0,24.0,2023,1,2,2023.0,1.0,2.0,,,,,,
6166141,19,N,N,,2003.0,1A. PRIVATE,QUEENS,N,NYC,F,,45.0,RETAIL TRADE,IV,75.0,FALLING OR FLYING OBJECT,10.0,CONTUSION,10.0,MULTIPLE HEAD INJURY,NY Resident,1,2022.0,11.0,20.0,2023,1,2,2023.0,1.0,2.0,,,,,,
6165907,59,N,N,0.0,1963.0,1A. PRIVATE,WESTCHESTER,N,NYC,F,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMEN...,III,68.0,STATIONARY OBJECT,49.0,SPRAIN OR TEAR,62.0,BUTTOCKS,NY Resident,0,2022.0,12.0,26.0,2023,1,2,2022.0,12.0,31.0,,,,,,
6166047,55,N,N,0.0,0.0,1A. PRIVATE,QUEENS,N,NYC,F,,48.0,TRANSPORTATION AND WAREHOUSING,IV,25.0,FROM DIFFERENT LEVEL (ELEVATION),10.0,CONTUSION,53.0,KNEE,NY Resident,6,2022.0,12.0,28.0,2023,1,2,2023.0,1.0,2.0,,,,,,
6166102,25,N,N,0.0,1997.0,1A. PRIVATE,KINGS,N,NYC,M,,55.0,MANAGEMENT OF COMPANIES AND ENTERPRISES,IV,79.0,OBJECT BEING LIFTED OR HANDLED,40.0,LACERATION,37.0,THUMB,NY Resident,5,2022.0,12.0,20.0,2023,1,2,2022.0,12.0,31.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553137,52,N,N,,1960.0,2A. SIF,JEFFERSON,N,SYRACUSE,M,,,,I,,,,,,,NY Resident,5,2012.0,9.0,12.0,2024,6,5,2012.0,10.0,23.0,,,,,,
6553119,59,N,Y,0.0,1965.0,3A. SELF PUBLIC,SUFFOLK,N,HAUPPAUGE,F,,,,IV,,,,,,,NY Resident,1,2024.0,5.0,22.0,2024,6,5,,,,2024.0,5.0,28.0,,,
6553542,45,N,Y,0.0,1979.0,2A. SIF,QUEENS,N,NYC,M,,,,IV,,,,,,,NY Resident,5,2024.0,5.0,6.0,2024,6,5,,,,,,,,,
6553455,42,N,Y,0.0,1981.0,4A. SELF PRIVATE,QUEENS,N,NYC,M,,,,IV,,,,,,,NY Resident,5,2024.0,2.0,24.0,2024,6,5,,,,2024.0,5.0,21.0,,,


In [None]:
test['Claim Injury Type'] = dt.predict(test)

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
test

In [None]:
#export test data predictions
#test['DrugPlant'].to_csv('Exercise1_predictions.csv')