## 1. Imports

In [394]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Scaling
from sklearn.preprocessing import MinMaxScaler

 #Correlation Heatmap
from matplotlib.colors import LinearSegmentedColormap

#Statistical Test
from scipy import stats
from sklearn.impute import SimpleImputer

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import Ridge

pd.set_option('display.max_columns', None)

In [395]:
# Read the dataset
X_train_encoded = pd.read_csv('../project_data/X_train_encoded.csv', delimiter=',', index_col=0)
X_val_encoded = pd.read_csv('../project_data/X_val_encoded.csv', delimiter=',', index_col=0)

y_train = pd.read_csv('../project_data/y_train.csv',delimiter=',', index_col=0)
y_val= pd.read_csv('../project_data/y_val.csv', delimiter=',', index_col=0)

X_test_encoded = pd.read_csv('../project_data/X_test_encoded.csv',index_col=0)

In [396]:
# Binary classification dataset
X_train_encoded_bin = pd.read_csv('../project_data/X_train_encoded_binary.csv', delimiter=',', index_col=0)
X_val_encoded_bin = pd.read_csv('../project_data/X_val_encoded_binary.csv', delimiter=',', index_col=0)

y_train_bin = pd.read_csv('../project_data/y_train_binary.csv',delimiter=',', index_col=0)
y_val_bin= pd.read_csv('../project_data/y_val_binary.csv', delimiter=',', index_col=0)

X_test_encoded_bin = pd.read_csv('../project_data/X_test_encoded_binary.csv',index_col=0)

## 2. Preparing the data

### 2.1 Encoding multiclass target

In [397]:
claim_injury_type_mapping = {
    '4. TEMPORARY': 0,
    '2. NON-COMP':7,
    '5. PPD SCH LOSS': 1,
    '3. MED ONLY': 2,
    '6. PPD NSL': 3,
    '1. CANCELLED': 4,
    '8. DEATH':5,
    '7. PTD': 6
}

y_train_encoded = y_train['Claim Injury Type'].map(claim_injury_type_mapping)
y_val_encoded = y_val['Claim Injury Type'].map(claim_injury_type_mapping)

In [398]:

# Ensure y_train is a Series (if it's a DataFrame with one column, convert it to Series)
y_train = y_train.squeeze()  # This will convert DataFrame with a single column to a Series

# Create a mask for rows where '2. NON-COMP' is not present
mask = y_train != '2. NON-COMP'

# Apply the mask to filter out '2. NON-COMP'
y_train_not_encoded = y_train[mask]

# Print the shape of the filtered target
print("Shape of y_train_not_encoded:", y_train_not_encoded.shape)

# Check the unique classes in the filtered target to confirm that '2. NON-COMP' is removed
print("Classes in y_train_not_encoded:", y_train_not_encoded.unique())


Shape of y_train_not_encoded: (193852,)
Classes in y_train_not_encoded: ['5. PPD SCH LOSS' '3. MED ONLY' '4. TEMPORARY' '1. CANCELLED' '8. DEATH'
 '6. PPD NSL' '7. PTD']


In [399]:
# Create a mask for rows where y_train_encoded is not equal to 7 (non-comp class)
mask_train = y_train_encoded != 7

# Filter X_train_encoded and y_train_encoded based on the mask
X_train_encoded = X_train_encoded[mask_train]
y_train_encoded = y_train_encoded[mask_train]

# Apply the same mask to y_val_encoded and X_val_encoded
mask_val = y_val_encoded != 7
X_val_encoded = X_val_encoded[mask_val]
y_val_encoded = y_val_encoded[mask_val]

# Verify the shapes of the new filtered datasets
print("Shape of X_train_encoded_filtered:", X_train_encoded.shape)
print("Shape of y_train_encoded_filtered:", y_train_encoded.shape)
print("Shape of X_val_encoded_filtered:", X_val_encoded.shape)
print("Shape of y_val_encoded_filtered:", y_val_encoded.shape)

# Check unique classes in the target to confirm "non comp" is removed
print("Classes in y_train_encoded_filtered:", y_train_encoded.unique())
print("Classes in y_val_encoded_filtered:", y_val_encoded.unique())  # Corrected this line


Shape of X_train_encoded_filtered: (193852, 59)
Shape of y_train_encoded_filtered: (193852,)
Shape of X_val_encoded_filtered: (84881, 59)
Shape of y_val_encoded_filtered: (84881,)
Classes in y_train_encoded_filtered: [1 2 0 4 5 3 6]
Classes in y_val_encoded_filtered: [2 4 0 1 3 6 5]


### 2.2 Separate Numerical and Categorical

#### 2.2.1  Binary separation

In [400]:
num_columns_bin = ['Age at Injury', 
                   'IME-4 Count', 
                   'Number of Dependents',

                   'Industry Code_encoded_0',
                   'Industry Code_encoded_1', 

                   'WCIO Cause of Injury Code_encoded_0',
                   'WCIO Cause of Injury Code_encoded_1',

                   'WCIO Nature of Injury Code_encoded_0',
                   'WCIO Nature of Injury Code_encoded_1',

                   'WCIO Part Of Body Code_encoded_0', 
                   'WCIO Part Of Body Code_encoded_1',

                   'Industry Code_freq', 
                   'WCIO Cause of Injury Code_freq',
                   'WCIO Nature of Injury Code_freq',
                   'WCIO Part Of Body Code_freq',
                   'Carrier Type Imputed_freq', 
                   'Carrier Name_freq',

                   'Accident Datemonth',
                   'Accident Date_Season_Spring', 
                   'Accident Date_Season_Summer',
                   'Accident Date_Season_Winter',

                   'Days_between_Assembly Date_Accident Date_log',
                   'Days_between_C-2 Date Imputed_Accident Date_log',
                   'Average Weekly Wage Imputed_log']

cat_columns_bin = ['Carrier Type_Self-insured Private Entity',
                   'Carrier Type_Self-insured Public Entity', 
                   'Carrier Type_Special Funds',
                   'Carrier Type_State Insurance Fund', 
                   'Carrier Type_nan',   
                   'C-3 Date_nabinary', 
                   'Average Weekly Wage_nabinary',
                   'First Hearing Date_nabinary',
                   'Alternative Dispute Resolution_binary',
                   'COVID-19 Indicator_binary',
                   'Attorney/Representative_binary']

# Create subsets
X_train_num_bin = X_train_encoded_bin[num_columns_bin]
X_train_cat_bin = X_train_encoded_bin[cat_columns_bin]

X_val_num_bin = X_val_encoded_bin[num_columns_bin]
X_val_cat_bin = X_val_encoded_bin[cat_columns_bin]

X_test_num_bin=X_test_encoded_bin[num_columns_bin]
X_test_cat_bin=X_test_encoded_bin[cat_columns_bin]

#### 2.2.2 Multiclass separation

> Drop the ones associated the the target class 2. NON-COMP

In [401]:
# Drop features associated with "non comp" in their name
columns_to_drop = [col for col in X_train_encoded.columns if "non-comp" in col.lower()]

# Drop the columns from the dataset
X_train_encoded_dropped = X_train_encoded.drop(columns=columns_to_drop)
X_val_encoded_dropped = X_val_encoded.drop(columns=columns_to_drop)  # If you have validation data
X_test_encoded_dropped = X_test_encoded.drop(columns=columns_to_drop)  # If you have test data

# Verify the remaining columns
print("Remaining columns after dropping 'non comp' features:")
print(len(X_train_encoded_dropped.columns))

Remaining columns after dropping 'non comp' features:
55


In [402]:
num_columns = ['Age at Injury', 
               'IME-4 Count', 
               'Number of Dependents',
               
               'Industry Code_encoded_5. PPD SCH LOSS',
               'Industry Code_encoded_3. MED ONLY',
               'Industry Code_encoded_4. TEMPORARY',
               'Industry Code_encoded_1. CANCELLED', 
               'Industry Code_encoded_8. DEATH',
               'Industry Code_encoded_6. PPD NSL', 
               'Industry Code_encoded_7. PTD',
               
               'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
               'WCIO Cause of Injury Code_encoded_3. MED ONLY',
               'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
               'WCIO Cause of Injury Code_encoded_1. CANCELLED',
               'WCIO Cause of Injury Code_encoded_8. DEATH',
               'WCIO Cause of Injury Code_encoded_6. PPD NSL',
               'WCIO Cause of Injury Code_encoded_7. PTD',
               
               'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS',
               'WCIO Nature of Injury Code_encoded_3. MED ONLY',
               'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
               'WCIO Nature of Injury Code_encoded_1. CANCELLED',
               'WCIO Nature of Injury Code_encoded_8. DEATH',
               'WCIO Nature of Injury Code_encoded_6. PPD NSL',
               'WCIO Nature of Injury Code_encoded_7. PTD',
               
               'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
               'WCIO Part Of Body Code_encoded_3. MED ONLY',
               'WCIO Part Of Body Code_encoded_4. TEMPORARY',
               'WCIO Part Of Body Code_encoded_1. CANCELLED',
               'WCIO Part Of Body Code_encoded_8. DEATH',
               'WCIO Part Of Body Code_encoded_6. PPD NSL',
               'WCIO Part Of Body Code_encoded_7. PTD',
               
               'Industry Code_freq',
               'WCIO Cause of Injury Code_freq', 
               'WCIO Nature of Injury Code_freq',
               'WCIO Part Of Body Code_freq', 
               'Carrier Type Imputed_freq',
               'Carrier Name_freq',
               
               'Accident Datemonth',
               'Accident Date_Season_Spring', 
               'Accident Date_Season_Summer',
               'Accident Date_Season_Winter',

               'Days_between_Assembly Date_Accident Date_log',
               'Days_between_C-2 Date Imputed_Accident Date_log',
               'Average Weekly Wage Imputed_log']

cat_columns =['Carrier Type_Self-insured Private Entity',
               'Carrier Type_Self-insured Public Entity', 
               'Carrier Type_Special Funds',
               'Carrier Type_State Insurance Fund', 
               'Carrier Type_nan',
               'C-3 Date_nabinary', 
               'Average Weekly Wage_nabinary',
               'First Hearing Date_nabinary',
               'Alternative Dispute Resolution_binary',
               'COVID-19 Indicator_binary',
               'Attorney/Representative_binary']


# Create subsets
X_train_num = X_train_encoded[num_columns]
X_train_cat = X_train_encoded[cat_columns]

X_val_num = X_val_encoded[num_columns]
X_val_cat = X_val_encoded[cat_columns]

X_test_num=X_test_encoded[num_columns]
X_test_cat=X_test_encoded[cat_columns]

## 3. Scaling

### 3.1 Scale non-binary

In [403]:
scaler = MinMaxScaler().fit(X_train_num)
X_train_num_scaled = scaler.transform(X_train_num)
# print("Parameters fitted:")
# for feature, min_val, max_val in zip(X_train_num.columns, scaler.data_min_, scaler.data_max_):
    # print(f"Variable: {feature} | Min: {min_val} | Max: {max_val}")

# Convert the array to a pandas dataframe
X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns = X_train_num.columns).set_index(X_train_encoded.index)
# X_train_num_scaled.describe().round(2)

In [404]:
X_val_num_scaled = scaler.transform(X_val_num)
X_val_num_scaled = pd.DataFrame(X_val_num_scaled, columns = X_val_num.columns).set_index(X_val_encoded.index)
# X_val_num_scaled.describe().round(2)

In [405]:
X_test_num_scaled = scaler.transform(X_test_num)
X_test_num_scaled = pd.DataFrame(X_test_num_scaled, columns = X_test_num.columns).set_index(X_test_encoded.index)
# X_test_num_scaled.describe().round(2)

### 3.2 Scale binary

In [406]:
scaler = MinMaxScaler().fit(X_train_num_bin)
X_train_num_scaled_bin = scaler.transform(X_train_num_bin)
# print("Parameters fitted:")
# for feature, min_val, max_val in zip(X_train_num.columns, scaler.data_min_, scaler.data_max_):
    # print(f"Variable: {feature} | Min: {min_val} | Max: {max_val}")

# Convert the array to a pandas dataframe
X_train_num_scaled_bin = pd.DataFrame(X_train_num_scaled_bin, columns = X_train_num_bin.columns).set_index(X_train_encoded_bin.index)
# X_train_num_scaled.describe().round(2)

In [407]:
X_val_num_scaled_bin = scaler.transform(X_val_num_bin)
X_val_num_scaled_bin = pd.DataFrame(X_val_num_scaled_bin, columns = X_val_num_bin.columns).set_index(X_val_encoded_bin.index)
# X_val_num_scaled.describe().round(2)

In [408]:
X_test_num_scaled_bin = scaler.transform(X_test_num_bin)
X_test_num_scaled_bin = pd.DataFrame(X_test_num_scaled_bin, columns = X_test_num_bin.columns).set_index(X_test_encoded_bin.index)
# X_test_num_scaled.describe().round(2)

## 4. Binary: Feature selection

><strong style="color:#6fa8dc">Add this to a separate python file</strong>


### 4.1 Filter Categorical Features

In [409]:
# squeeze the y_train_bin
y_train_bin = y_train_bin.squeeze()

#### 4.1.1 Chi-square

In [410]:
import pandas as pd  # For creating and handling DataFrames
import numpy as np  # For numerical operations
from sklearn.feature_selection import chi2  # For Chi-square test
from scipy.stats import chi2_contingency  # For Cramér's V calculation

In [411]:
def cramers_v(X, y):
    """
    Calculate Cramér's V for a categorical feature and a target.
    :param X: Categorical feature (Pandas Series or array-like)
    :param y: Target variable (Pandas Series or array-like)
    :return: Cramér's V value
    """
    # Create the contingency table
    df_contingency = pd.crosstab(X, y)

    # Perform chi-square test
    chi2, p, dof, expected = chi2_contingency(df_contingency.values)

    # Calculate Cramér's V
    n = df_contingency.sum().sum()  # Total number of observations
    min_dim = min(df_contingency.shape) - 1  # Min between number of rows and columns - 1
    cramers_v = np.sqrt(chi2 / (n * min_dim))  # Cramér's V formula

    return cramers_v


In [412]:
# Perform Chi-square test
chi2_values, p_values = chi2(X_train_cat_bin, y_train_bin)

# Create DataFrame for Chi-square results
chi2_results = pd.DataFrame({
    'Column': X_train_cat_bin.columns,
    'Chi2': chi2_values.round(5),
    'p-value': p_values.round(5)
})

# Calculate Cramér's V for binary target
cramers_v_values = []
for var in X_train_cat_bin.columns:
    v = cramers_v(X_train_cat_bin[var], y_train_bin)
    cramers_v_values.append(v)

# Add Cramér's V to DataFrame
chi2_results['Cramér\'s V'] = cramers_v_values

# Filter important features
chi2_important_features = chi2_results[(chi2_results['p-value'] < 0.05) & (chi2_results['Cramér\'s V'] >= 0.1)]

list_features_chi2_cramer = chi2_important_features['Column'].values

print(chi2_important_features)

                               Column         Chi2  p-value  Cramér's V
3   Carrier Type_State Insurance Fund   4215.72148      0.0    0.114906
5                   C-3 Date_nabinary  23953.69374      0.0    0.429801
6        Average Weekly Wage_nabinary  86212.24203      0.0    0.770968
7         First Hearing Date_nabinary  25813.45742      0.0    0.498054
10     Attorney/Representative_binary  78009.76607      0.0    0.535952


#### 4.1.2 Mutual Information

In [413]:
from sklearn.feature_selection import mutual_info_classif

# Calculate mutual information for categorical features
mi_scores = mutual_info_classif(X_train_cat_bin, y_train_bin, discrete_features=True)

# Create DataFrame of results
mi_results = pd.DataFrame({
    'Feature': X_train_cat_bin.columns,
    'Mutual Information': mi_scores
}).sort_values(by='Mutual Information', ascending=False)

mi_results


Unnamed: 0,Feature,Mutual Information
6,Average Weekly Wage_nabinary,0.370264
10,Attorney/Representative_binary,0.157423
7,First Hearing Date_nabinary,0.138322
5,C-3 Date_nabinary,0.096962
3,Carrier Type_State Insurance Fund,0.006636
8,Alternative Dispute Resolution_binary,0.002901
9,COVID-19 Indicator_binary,0.00282
0,Carrier Type_Self-insured Private Entity,0.000811
4,Carrier Type_nan,0.000742
1,Carrier Type_Self-insured Public Entity,0.000223


In [414]:

mi_important_features = mi_results[mi_results['Mutual Information'] > 0.05]

list_features_mi = mi_results[mi_results['Mutual Information'] > 0.05]['Feature'].values

print(mi_important_features)

                           Feature  Mutual Information
6     Average Weekly Wage_nabinary            0.370264
10  Attorney/Representative_binary            0.157423
7      First Hearing Date_nabinary            0.138322
5                C-3 Date_nabinary            0.096962


#### 4.1.3 Filtered Categorical Features

In [415]:
# Select features that appear in at least one of the important feature sets
selected_features = list(set(list_features_chi2_cramer) | set(list_features_mi)) 
selected_features

['Attorney/Representative_binary',
 'Carrier Type_State Insurance Fund',
 'First Hearing Date_nabinary',
 'C-3 Date_nabinary',
 'Average Weekly Wage_nabinary']

In [416]:
# Keep only selected important features in the datasets
X_train_cat_filtered_bin = X_train_cat_bin[selected_features]
X_val_cat_filtered_bin = X_val_cat_bin[selected_features]
X_test_cat_filtered_bin = X_test_cat_bin[selected_features]

### 4.2 Filter Numerical Features

#### 4.2.1 Univariate variance

In [417]:
X_train_num_scaled_bin.var().sort_values(ascending=False)

Accident Date_Season_Summer                        0.190598
Accident Date_Season_Winter                        0.190024
Accident Date_Season_Spring                        0.181228
Average Weekly Wage Imputed_log                    0.158004
Carrier Name_freq                                  0.142667
WCIO Nature of Injury Code_freq                    0.136735
Industry Code_freq                                 0.123001
Number of Dependents                               0.111300
Carrier Type Imputed_freq                          0.103530
Accident Datemonth                                 0.099293
WCIO Part Of Body Code_freq                        0.098325
Industry Code_encoded_1                            0.096085
Industry Code_encoded_0                            0.096085
WCIO Cause of Injury Code_freq                     0.080853
WCIO Part Of Body Code_encoded_1                   0.057635
WCIO Part Of Body Code_encoded_0                   0.057635
Age at Injury                           

> None with variance zero

#### 4.2.2 Spearman Correlation

In [418]:
# Step 1: Calculate the Spearman correlation matrix for numerical features
cor_spearman = X_train_num_scaled_bin.corr(method='spearman')

# Step 2: Flatten the correlation matrix and reset the index
correlation_pairs = cor_spearman.unstack().reset_index()

# Step 3: Rename columns for clarity
correlation_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']

# Step 4: Filter the table for correlations > 0.8 or < -0.8 and exclude self-correlations (diagonal)
strong_correlations = correlation_pairs[
    ((correlation_pairs['Correlation'] > 0.8) | (correlation_pairs['Correlation'] < -0.8)) & 
    (correlation_pairs['Feature_1'] != correlation_pairs['Feature_2'])
]

# Step 5: Remove duplicate pairs by keeping only one order
strong_correlations = strong_correlations[
    strong_correlations['Feature_1'] < strong_correlations['Feature_2']
]

# Step 6: Sort by correlation value
strong_correlations = strong_correlations.sort_values(by='Correlation', ascending=False)

# Step 7: Display the table
strong_correlations


Unnamed: 0,Feature_1,Feature_2,Correlation
526,Days_between_Assembly Date_Accident Date_log,Days_between_C-2 Date Imputed_Accident Date_log,0.954048
76,Industry Code_encoded_0,Industry Code_encoded_1,-1.0
126,WCIO Cause of Injury Code_encoded_0,WCIO Cause of Injury Code_encoded_1,-1.0
176,WCIO Nature of Injury Code_encoded_0,WCIO Nature of Injury Code_encoded_1,-1.0
226,WCIO Part Of Body Code_encoded_0,WCIO Part Of Body Code_encoded_1,-1.0


> To be expected since it's binary.

#### 4.2.3 Mutual Information

In [419]:
from sklearn.feature_selection import mutual_info_classif

# Compute mutual information between categorical features and target
mutual_info = mutual_info_classif(X_train_num_scaled_bin, y_train_bin)

# Display features sorted by mutual information
mi_results = pd.DataFrame({
    'Feature': X_train_num_scaled_bin.columns,
    'Mutual Information': mutual_info
}).sort_values(by='Mutual Information', ascending=False)

mi_results

Unnamed: 0,Feature,Mutual Information
23,Average Weekly Wage Imputed_log,0.383246
1,IME-4 Count,0.158779
13,WCIO Nature of Injury Code_freq,0.057567
7,WCIO Nature of Injury Code_encoded_0,0.05289
8,WCIO Nature of Injury Code_encoded_1,0.05237
16,Carrier Name_freq,0.039518
12,WCIO Cause of Injury Code_freq,0.03904
6,WCIO Cause of Injury Code_encoded_1,0.038371
5,WCIO Cause of Injury Code_encoded_0,0.037967
14,WCIO Part Of Body Code_freq,0.034047


In [420]:
# Assuming `mutual_info` is the result from `mutual_info_classif` function
mi_results = pd.DataFrame({
    'Feature': X_train_num_scaled_bin.columns,
    'Mutual Information': mutual_info
}).sort_values(by='Mutual Information', ascending=False)

# Filter features with Mutual Information > 0.05
mi_important_features = mi_results[mi_results['Mutual Information'] < 0.01]

# Display the important features
print(mi_important_features)


                                            Feature  Mutual Information
20                      Accident Date_Season_Winter            0.008403
0                                     Age at Injury            0.007947
18                      Accident Date_Season_Spring            0.007731
19                      Accident Date_Season_Summer            0.007437
2                              Number of Dependents            0.005856
17                               Accident Datemonth            0.004799
21     Days_between_Assembly Date_Accident Date_log            0.003908
22  Days_between_C-2 Date Imputed_Accident Date_log            0.003875


#### 4.2.3 Filtered Numerical Features

In [421]:
# List of features to drop from X_train_num_scaled
num_features_to_drop = [
    'Days_between_C-2 Date Imputed_Accident Date_log',
    'Industry Code_encoded_0',
    'WCIO Cause of Injury Code_encoded_0',
    'WCIO Nature of Injury Code_encoded_0',
    'WCIO Part Of Body Code_encoded_0',
    'Accident Date_Season_Spring',
    'Accident Date_Season_Winter',
    'Accident Date_Season_Summer',
    'Number of Dependents',
    'Accident Datemonth'
    ]

# Drop features
X_train_num_scaled_filtered_bin = X_train_num_scaled_bin.drop(columns=num_features_to_drop)
X_val_num_scaled_filtered_bin = X_val_num_scaled_bin.drop(columns=num_features_to_drop)
X_test_num_scaled_filtered_bin = X_test_num_scaled_bin.drop(columns=num_features_to_drop)

# Verify the remaining columns
print("Remaining features in X_train_num_scaled:", X_train_num_scaled_filtered_bin.columns)


Remaining features in X_train_num_scaled: Index(['Age at Injury', 'IME-4 Count', 'Industry Code_encoded_1',
       'WCIO Cause of Injury Code_encoded_1',
       'WCIO Nature of Injury Code_encoded_1',
       'WCIO Part Of Body Code_encoded_1', 'Industry Code_freq',
       'WCIO Cause of Injury Code_freq', 'WCIO Nature of Injury Code_freq',
       'WCIO Part Of Body Code_freq', 'Carrier Type Imputed_freq',
       'Carrier Name_freq', 'Days_between_Assembly Date_Accident Date_log',
       'Average Weekly Wage Imputed_log'],
      dtype='object')


### 4.3 Combine the Filtered Datasets

In [422]:
# Combine the filtered datasets
X_train_bin = pd.concat([X_train_cat_filtered_bin, X_train_num_scaled_filtered_bin], axis=1)
X_val_bin =  pd.concat([X_val_cat_filtered_bin, X_val_num_scaled_filtered_bin], axis=1)
X_test_bin =  pd.concat([X_test_cat_filtered_bin, X_test_num_scaled_filtered_bin], axis=1)


# Verify the shape of the combined dataset
print("Shape of combined X_train:", X_train_bin.shape)


Shape of combined X_train: (396097, 19)


> Use this for the binary, as we will be dropping later for multiclass in minority

### 4.4 Feature Selection with All Features

#### 4.4.1 RFE WITH Logistic Regression

> Use 50% of dataset

In [423]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score
# from sklearn.feature_selection import RFE
# import numpy as np

# # Initialize the Logistic Regression model
# log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# # Sample 1% of the data
# sample_size = int(0.5 * len(X_train_bin))  # 50% of the dataset
# X_train_sample = X_train_bin.sample(n=sample_size, random_state=42)
# y_train_sample = y_train_bin.sample(n=sample_size, random_state=42)

# # Loop over different numbers of features
# feature_counts = range(1, X_train_sample.shape[1] + 1)
# scores = []

# # Loop through each feature count and evaluate the model's performance
# for n in feature_counts:
#     rfe = RFE(log_reg_model, n_features_to_select=n)
#     rfe.fit(X_train_sample, y_train_sample)
    
#     # Get selected features
#     selected_features = X_train_sample.columns[rfe.support_]
    
#     # Evaluate performance using cross-validation with F1 score as the metric
#     score = cross_val_score(log_reg_model, X_train_sample[selected_features], y_train_sample, cv=5, scoring='f1_macro').mean()
#     scores.append(score)

# # Find the number of features that gives the highest F1 score
# best_n_features = feature_counts[np.argmax(scores)]
# print(f"Optimal number of features: {best_n_features}")


In [424]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Apply seaborn style for a cleaner look
# sns.set(style='whitegrid')

# # Define custom colors
# line_color = sns.color_palette("coolwarm", as_cmap=True)(0.2)  # Cool blue
# optimal_color = sns.color_palette("coolwarm", as_cmap=True)(0.8)  # Warm red

# # Plot the number of features against the F1 scores
# plt.figure(figsize=(10, 6))
# plt.plot(feature_counts, scores, marker='o', linestyle='-', color=line_color, label='F1 Score Curve', linewidth=2)

# # Highlight the optimal point
# plt.axvline(x=best_n_features, color=optimal_color, linestyle='--', label=f'Optimal: {best_n_features} features', linewidth=1.5)
# plt.scatter([best_n_features], [max(scores)], color=optimal_color, zorder=5, s=100, edgecolor='black')

# # Add labels, title, and legend
# plt.title('Number of Features vs F1 Score', fontsize=16, color="#333333")
# plt.xlabel('Number of Features', fontsize=12, color="#333333")
# plt.ylabel('F1 Score (macro)', fontsize=12, color="#333333")
# plt.legend(frameon=True, shadow=True, fontsize=12)
# plt.grid(alpha=0.3, linestyle='--')

# # Show the plot
# plt.tight_layout()
# plt.show()


In [425]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Assuming X_train_bin and y_train_bin are your features and target

# Step 1: Take a smaller sample (e.g., 10% of the data)
sample_size = int(0.5 * len(X_train_bin))  # 10% of the dataset
X_train_sample = X_train_bin.sample(n=sample_size, random_state=42)
y_train_sample = y_train_bin.sample(n=sample_size, random_state=42)

# Step 2: Initialize Logistic Regression
logistic_model = LogisticRegression(max_iter=1000, random_state=42)

# Step 3: Apply RFE to the sample data
rfe = RFE(logistic_model, n_features_to_select=11)  # Select top 8 features (adjust as needed)
rfe.fit(X_train_sample, y_train_sample)

# Step 4: Get the selected features
selected_features_rfe = X_train_sample.columns[rfe.support_]

# Print selected features
print("Selected Features after RFE with Logistic Regression:")
print(selected_features_rfe)

# Optionally, you can use the selected features to transform the dataset
X_train_selected_rfe = X_train_sample[selected_features_rfe]


Selected Features after RFE with Logistic Regression:
Index(['Attorney/Representative_binary', 'Carrier Type_State Insurance Fund',
       'First Hearing Date_nabinary', 'Average Weekly Wage_nabinary',
       'IME-4 Count', 'WCIO Cause of Injury Code_encoded_1',
       'WCIO Nature of Injury Code_encoded_1',
       'WCIO Part Of Body Code_encoded_1', 'WCIO Nature of Injury Code_freq',
       'Carrier Name_freq', 'Average Weekly Wage Imputed_log'],
      dtype='object')


#### 4.4.2 Lasso

In [426]:
from sklearn.linear_model import LassoCV
lasso = LassoCV(cv=5)  # Cross-validation to select optimal alpha
lasso.fit(X_train_bin, y_train_bin)
lasso_features = X_train_bin.columns[lasso.coef_ != 0]
print("Selected features by Lasso:", lasso_features)

Selected features by Lasso: Index(['Attorney/Representative_binary', 'Carrier Type_State Insurance Fund',
       'First Hearing Date_nabinary', 'C-3 Date_nabinary',
       'Average Weekly Wage_nabinary', 'Age at Injury', 'IME-4 Count',
       'Industry Code_encoded_1', 'WCIO Cause of Injury Code_encoded_1',
       'WCIO Nature of Injury Code_encoded_1',
       'WCIO Part Of Body Code_encoded_1', 'Industry Code_freq',
       'WCIO Cause of Injury Code_freq', 'WCIO Nature of Injury Code_freq',
       'WCIO Part Of Body Code_freq', 'Carrier Type Imputed_freq',
       'Carrier Name_freq', 'Days_between_Assembly Date_Accident Date_log',
       'Average Weekly Wage Imputed_log'],
      dtype='object')


#### 4.4.3 Ridge

In [427]:
from sklearn.linear_model import RidgeCV
import numpy as np

# Initialize Ridge regression model with cross-validation
ridge_model = RidgeCV(cv=5)

# Fit the model on the training data
ridge_model.fit(X_train_bin, y_train_bin)

# Get the feature coefficients
coefficients = ridge_model.coef_

# Set a threshold to select features (e.g., absolute coefficient > 0.01)
threshold = 0.01
ridge_features = X_train_bin.columns[np.abs(coefficients) > threshold]

print(f"Selected features using Ridge regression: {ridge_features}")


Selected features using Ridge regression: Index(['Attorney/Representative_binary', 'Carrier Type_State Insurance Fund',
       'First Hearing Date_nabinary', 'C-3 Date_nabinary',
       'Average Weekly Wage_nabinary', 'IME-4 Count',
       'Industry Code_encoded_1', 'WCIO Cause of Injury Code_encoded_1',
       'WCIO Nature of Injury Code_encoded_1',
       'WCIO Part Of Body Code_encoded_1', 'Industry Code_freq',
       'WCIO Cause of Injury Code_freq', 'WCIO Nature of Injury Code_freq',
       'Carrier Type Imputed_freq', 'Carrier Name_freq',
       'Days_between_Assembly Date_Accident Date_log',
       'Average Weekly Wage Imputed_log'],
      dtype='object')


#### 4.4.4 Random Forest Important Features

In [428]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=47)

# Fit the model on X_train and y_train_encoded
rf_model.fit(X_train_bin, y_train_bin)

# Get feature importances
importances = rf_model.feature_importances_

# Sort feature importances in descending order
important_indices = importances.argsort()[::-1]

# Print the feature importances
print("Feature importances:", importances)
print("Sorted feature indices:", important_indices)


Feature importances: [0.05881585 0.00220738 0.04942971 0.02412719 0.23847265 0.05811821
 0.08010243 0.01907469 0.02881951 0.01990763 0.02730619 0.01810478
 0.02355741 0.01296906 0.02333782 0.00760047 0.04093534 0.05613349
 0.21098017]
Sorted feature indices: [ 4 18  6  0  5 17  2 16  8 10  3 12 14  9  7 11 13 15  1]


In [429]:
import numpy as np

# Calculate cumulative sum of feature importances
cumulative_importance = np.cumsum(importances[important_indices])

# Find the index where cumulative importance exceeds 90%
threshold = 0.90
index_90 = np.argmax(cumulative_importance >= threshold)

# The number of features that explain 90% of the importance
num_features_90 = index_90 + 1  # Adding 1 since index starts at 0

print(f"Number of features explaining 90% of cumulative importance: {num_features_90}")

# Select the top features based on this number
random_forest_features = X_train_bin.columns[important_indices[:num_features_90]]
print(f"Selected features that explain 90% of cumulative importance: {random_forest_features}")


Number of features explaining 90% of cumulative importance: 13
Selected features that explain 90% of cumulative importance: Index(['Average Weekly Wage_nabinary', 'Average Weekly Wage Imputed_log',
       'IME-4 Count', 'Attorney/Representative_binary', 'Age at Injury',
       'Days_between_Assembly Date_Accident Date_log',
       'First Hearing Date_nabinary', 'Carrier Name_freq',
       'WCIO Cause of Injury Code_encoded_1',
       'WCIO Part Of Body Code_encoded_1', 'C-3 Date_nabinary',
       'WCIO Cause of Injury Code_freq', 'WCIO Part Of Body Code_freq'],
      dtype='object')


#### 4.4.5 Voting of Best Features

In [430]:
# Convert all feature sets to sets
rfe_set = set(selected_features_rfe)
lasso_set = set(lasso_features)
ridge_set = set(ridge_features)
rf_set = set(random_forest_features)

# Find features selected by at least three methods
final_features_binary_set = (
    (lasso_set & rfe_set & ridge_set) |
    (lasso_set & ridge_set & rf_set) |
    (ridge_set & rfe_set & rf_set)
)

# Convert the final features set to a list (optional, for easier use later)
final_features_binary = list(final_features_binary_set)

# Print the selected features
print(len(final_features_binary))
print("Features selected by at least three methods:")
final_features_binary


14
Features selected by at least three methods:


['First Hearing Date_nabinary',
 'WCIO Nature of Injury Code_freq',
 'WCIO Nature of Injury Code_encoded_1',
 'WCIO Part Of Body Code_encoded_1',
 'C-3 Date_nabinary',
 'WCIO Cause of Injury Code_freq',
 'Days_between_Assembly Date_Accident Date_log',
 'Average Weekly Wage_nabinary',
 'Attorney/Representative_binary',
 'IME-4 Count',
 'Carrier Name_freq',
 'Average Weekly Wage Imputed_log',
 'Carrier Type_State Insurance Fund',
 'WCIO Cause of Injury Code_encoded_1']

In [431]:
X_train_bin_filter = X_train_bin[final_features_binary]
X_val_bin_filter = X_val_bin[final_features_binary]
X_test_bin_filter = X_test_bin[final_features_binary]

## 5. Binary: Model

In [432]:
def custom_loss(y_pred, dtrain):
    """
    Custom loss function penalizing false negatives for class 0.
    Parameters:
    - y_pred: Predicted values (log-odds).
    - dtrain: DMatrix containing true labels.

    Returns:
    - grad: Gradient of the loss.
    - hess: Hessian of the loss.
    """
    y_true = dtrain.get_label()  # Extract true labels from DMatrix
    y_pred = 1 / (1 + np.exp(-y_pred))  # Convert log-odds to probabilities

    # Adjust weights for class 0
    weight = np.where(y_true == 0, 2.0, 1.0)  # Double penalty for class 0 FN

    grad = (y_pred - y_true) * weight  # Gradient
    hess = y_pred * (1 - y_pred) * weight  # Hessian
    return grad, hess


In [433]:
import xgboost as xgb

# Convert data to DMatrix format
dtrain = xgb.DMatrix(X_train_bin_filter, label=y_train_bin)
dval = xgb.DMatrix(X_val_bin_filter, label=y_val_bin)

# Parameters for XGBoost
params = {
    "objective": "binary:logistic",  # Use binary classification
    "eval_metric": "logloss",  # Standard evaluation metric
    "eta": 0.1,  # Learning rate
    "max_depth": 6,  # Maximum depth of trees
}

# Train the model with custom loss function
model = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    obj=custom_loss,  # Custom loss function
    evals=[(dval, "validation")],
    early_stopping_rounds=10,  # Stop if no improvement
)


[0]	validation-logloss:0.63551
[1]	validation-logloss:0.58788
[2]	validation-logloss:0.54808
[3]	validation-logloss:0.51451
[4]	validation-logloss:0.48577
[5]	validation-logloss:0.46122
[6]	validation-logloss:0.43997
[7]	validation-logloss:0.42150
[8]	validation-logloss:0.40544
[9]	validation-logloss:0.39131
[10]	validation-logloss:0.37892
[11]	validation-logloss:0.36803
[12]	validation-logloss:0.35837
[13]	validation-logloss:0.34977
[14]	validation-logloss:0.34221
[15]	validation-logloss:0.33547
[16]	validation-logloss:0.32947
[17]	validation-logloss:0.32397
[18]	validation-logloss:0.31915
[19]	validation-logloss:0.31477
[20]	validation-logloss:0.31099
[21]	validation-logloss:0.30748
[22]	validation-logloss:0.30421
[23]	validation-logloss:0.30148
[24]	validation-logloss:0.29898
[25]	validation-logloss:0.29681
[26]	validation-logloss:0.29479
[27]	validation-logloss:0.29285
[28]	validation-logloss:0.29125
[29]	validation-logloss:0.28962
[30]	validation-logloss:0.28827
[31]	validation-lo

In [434]:
# Predict probabilities
y_proba = model.predict(xgb.DMatrix(X_val_bin_filter))

# Convert probabilities to binary predictions using a threshold (default is 0.5)
threshold = 0.5
y_pred_binary = (y_proba > threshold).astype(int)

# Evaluate
from sklearn.metrics import classification_report, accuracy_score, f1_score

print("Accuracy:", accuracy_score(y_val_bin, y_pred_binary))
print("F1 Score:", f1_score(y_val_bin, y_pred_binary, average="macro"))
print(classification_report(y_val_bin, y_pred_binary))


Accuracy: 0.8938037166085947
F1 Score: 0.8936349625569049
              precision    recall  f1-score   support

           0       0.91      0.87      0.89     84881
           1       0.88      0.92      0.90     87319

    accuracy                           0.89    172200
   macro avg       0.90      0.89      0.89    172200
weighted avg       0.89      0.89      0.89    172200



In [435]:
import xgboost as xgb

binary_model = xgb.XGBClassifier(eval_metric="mlogloss")
binary_model.fit(X_train_bin_filter, y_train_bin)

# 3. Make predictions
y_pred_binary = binary_model.predict(X_val_bin_filter)  # Assuming X_test is available


# Evaluate the model
accuracy = accuracy_score(y_val_bin, y_pred_binary)
f1_macro = f1_score(y_val_bin, y_pred_binary, average='macro')  # Macro-averaged F1 score

# Print individual model results
print(f"XGBClassifier Accuracy: {accuracy:.4f}")
print(f"XGBClassifier Macro F1 Score: {f1_macro:.4f}")
print(classification_report(y_val_bin, y_pred_binary))  # Detailed report including precision, recall, and F1 score per class
print("-" * 50)

XGBClassifier Accuracy: 0.9011
XGBClassifier Macro F1 Score: 0.9005
              precision    recall  f1-score   support

           0       0.96      0.83      0.89     84881
           1       0.86      0.97      0.91     87319

    accuracy                           0.90    172200
   macro avg       0.91      0.90      0.90    172200
weighted avg       0.91      0.90      0.90    172200

--------------------------------------------------


## 6. Multiclass: Feature Selection

### 6.1 Filter Categorical Features

#### 6.1.1 Chi-square & Cramer's V

In [436]:
# Perform Chi-square test
chi2_values, p_values = chi2(X_train_cat, y_train_not_encoded)

# Create DataFrame for Chi-square results
chi2_results = pd.DataFrame({
    'Column': X_train_cat.columns,
    'Chi2': chi2_values.round(5),
    'p-value': p_values.round(5)
})

# Calculate Cramér's V for binary target
cramers_v_values = []
for var in X_train_cat.columns:
    v = cramers_v(X_train_cat[var], y_train_not_encoded)
    cramers_v_values.append(v)

# Add Cramér's V to DataFrame
chi2_results['Cramér\'s V'] = cramers_v_values

# Filter important features
chi2_important_features = chi2_results[(chi2_results['p-value'] < 0.05) & (chi2_results['Cramér\'s V'] >= 0.1)]

list_features_chi2_cramer = chi2_important_features['Column'].values

print(chi2_important_features)

                                     Column         Chi2  p-value  Cramér's V
1   Carrier Type_Self-insured Public Entity   5516.55597      0.0    0.191376
4                          Carrier Type_nan   2124.54066      0.0    0.104733
5                         C-3 Date_nabinary   8360.73548      0.0    0.284364
6              Average Weekly Wage_nabinary  60067.51252      0.0    0.644682
7               First Hearing Date_nabinary   7436.79142      0.0    0.280781
9                 COVID-19 Indicator_binary   2496.31072      0.0    0.115320
10           Attorney/Representative_binary  13651.29568      0.0    0.403996


#### 6.1.2 Mutual Information

In [437]:
from sklearn.feature_selection import mutual_info_classif

# Calculate mutual information for categorical features
mi_scores = mutual_info_classif(X_train_cat, y_train_not_encoded, discrete_features=True)

# Create DataFrame of results
mi_results = pd.DataFrame({
    'Feature': X_train_cat.columns,
    'Mutual Information': mi_scores
}).sort_values(by='Mutual Information', ascending=False)

mi_results

mi_important_features = mi_results[mi_results['Mutual Information'] > 0.05]

list_features_mi = mi_results[mi_results['Mutual Information'] > 0.05]['Feature'].values

print(mi_important_features)

                           Feature  Mutual Information
6     Average Weekly Wage_nabinary            0.203587
10  Attorney/Representative_binary            0.092214


#### 6.1.3 Filtered Categorical Features

In [438]:
# Select features that appear in at least one of the important feature sets
selected_features = list(set(list_features_chi2_cramer) | set(list_features_mi)) 
print(selected_features)

# Keep only selected important features in the datasets
X_train_cat_filtered = X_train_cat[selected_features]
X_val_cat_filtered = X_val_cat[selected_features]
X_test_cat_filtered = X_test_cat[selected_features]

['Attorney/Representative_binary', 'COVID-19 Indicator_binary', 'Carrier Type_nan', 'Carrier Type_Self-insured Public Entity', 'First Hearing Date_nabinary', 'C-3 Date_nabinary', 'Average Weekly Wage_nabinary']


In [439]:
X_val_cat_filtered

Unnamed: 0_level_0,Attorney/Representative_binary,COVID-19 Indicator_binary,Carrier Type_nan,Carrier Type_Self-insured Public Entity,First Hearing Date_nabinary,C-3 Date_nabinary,Average Weekly Wage_nabinary
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6092487,1,0,0.0,0.0,0,0,0
5748532,1,0,0.0,0.0,0,0,1
5865707,1,0,0.0,0.0,0,0,1
5452724,1,0,0.0,1.0,0,0,0
5532305,0,0,0.0,1.0,1,1,0
...,...,...,...,...,...,...,...
5459066,0,0,0.0,0.0,1,1,1
5849458,0,0,0.0,0.0,1,1,1
5917935,0,0,0.0,0.0,0,1,0
5656444,0,1,0.0,0.0,1,1,1


### 6.2 Filter Numerical

#### 6.2.1 Univariate variance

In [440]:
X_train_num_scaled.var().sort_values(ascending=False)

Accident Date_Season_Summer                           0.192556
Accident Date_Season_Winter                           0.189182
Accident Date_Season_Spring                           0.182443
Carrier Name_freq                                     0.165006
WCIO Nature of Injury Code_freq                       0.140806
Industry Code_encoded_5. PPD SCH LOSS                 0.125896
Industry Code_freq                                    0.122103
Number of Dependents                                  0.111038
Average Weekly Wage Imputed_log                       0.107678
WCIO Part Of Body Code_freq                           0.104240
Carrier Type Imputed_freq                             0.100288
Accident Datemonth                                    0.098247
WCIO Cause of Injury Code_freq                        0.083633
Industry Code_encoded_4. TEMPORARY                    0.068127
WCIO Part Of Body Code_encoded_5. PPD SCH LOSS        0.040425
WCIO Nature of Injury Code_encoded_4. TEMPORARY       0

> No features with variance zero. Don't drop any here.

#### 6.2.2 Spearman Correlation

In [441]:
# Step 1: Calculate the Spearman correlation matrix for numerical features
cor_spearman = X_train_num_scaled.corr(method='spearman')

# Step 2: Flatten the correlation matrix and reset the index
correlation_pairs = cor_spearman.unstack().reset_index()

# Step 3: Rename columns for clarity
correlation_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']

# Step 4: Filter the table for correlations > 0.8 or < -0.8 and exclude self-correlations (diagonal)
strong_correlations = correlation_pairs[
    ((correlation_pairs['Correlation'] > 0.9) | (correlation_pairs['Correlation'] < -0.9)) & 
    (correlation_pairs['Feature_1'] != correlation_pairs['Feature_2'])
]

# Step 5: Remove duplicate pairs by keeping only one order
strong_correlations = strong_correlations[
    strong_correlations['Feature_1'] < strong_correlations['Feature_2']
]

# Step 6: Sort by correlation value
strong_correlations = strong_correlations.sort_values(by='Correlation', ascending=False)

# Step 7: Display the table
strong_correlations


Unnamed: 0,Feature_1,Feature_2,Correlation
1846,Days_between_Assembly Date_Accident Date_log,Days_between_C-2 Date Imputed_Accident Date_log,0.930421


#### 6.2.3 Mutual Information

In [442]:
from sklearn.feature_selection import mutual_info_classif

# Compute mutual information between categorical features and target
mutual_info = mutual_info_classif(X_train_num_scaled, y_train_encoded)

# Display features sorted by mutual information
mi_results = pd.DataFrame({
    'Feature': X_train_num_scaled.columns,
    'Mutual Information': mutual_info
}).sort_values(by='Mutual Information', ascending=False)

mi_results

Unnamed: 0,Feature,Mutual Information
43,Average Weekly Wage Imputed_log,0.279048
24,WCIO Part Of Body Code_encoded_5. PPD SCH LOSS,0.14734
25,WCIO Part Of Body Code_encoded_3. MED ONLY,0.147236
29,WCIO Part Of Body Code_encoded_6. PPD NSL,0.145448
34,WCIO Part Of Body Code_freq,0.145205
26,WCIO Part Of Body Code_encoded_4. TEMPORARY,0.145204
27,WCIO Part Of Body Code_encoded_1. CANCELLED,0.1448
30,WCIO Part Of Body Code_encoded_7. PTD,0.130367
1,IME-4 Count,0.124622
33,WCIO Nature of Injury Code_freq,0.123134


In [443]:
# Assuming `mutual_info` is the result from `mutual_info_classif` function
mi_results = pd.DataFrame({
    'Feature': X_train_num_scaled.columns,
    'Mutual Information': mutual_info
}).sort_values(by='Mutual Information', ascending=False)

# Filter features with Mutual Information > 0.05
mi_important_features = mi_results[mi_results['Mutual Information'] > 0.01]

# Display the important features
print(mi_important_features)


                                              Feature  Mutual Information
43                    Average Weekly Wage Imputed_log            0.279048
24     WCIO Part Of Body Code_encoded_5. PPD SCH LOSS            0.147340
25         WCIO Part Of Body Code_encoded_3. MED ONLY            0.147236
29          WCIO Part Of Body Code_encoded_6. PPD NSL            0.145448
34                        WCIO Part Of Body Code_freq            0.145205
26        WCIO Part Of Body Code_encoded_4. TEMPORARY            0.145204
27        WCIO Part Of Body Code_encoded_1. CANCELLED            0.144800
30              WCIO Part Of Body Code_encoded_7. PTD            0.130367
1                                         IME-4 Count            0.124622
33                    WCIO Nature of Injury Code_freq            0.123134
17  WCIO Nature of Injury Code_encoded_5. PPD SCH ...            0.120770
18     WCIO Nature of Injury Code_encoded_3. MED ONLY            0.120672
19    WCIO Nature of Injury Code_encod

#### 6.2.3 Filtered Numerical Features

In [444]:
# List of features to drop from X_train_num_scaled
num_features_to_drop = [
    'Days_between_C-2 Date Imputed_Accident Date_log'
    ]

# Drop features
X_train_num_scaled_filtered = X_train_num_scaled.drop(columns=num_features_to_drop)
X_val_num_scaled_filtered = X_val_num_scaled.drop(columns=num_features_to_drop)
X_test_num_scaled_filtered = X_test_num_scaled.drop(columns=num_features_to_drop)

# Verify the remaining columns
print("Remaining features in X_train_num_scaled:", X_train_num_scaled_filtered.columns)
print(len(X_train_num_scaled_filtered.columns))


Remaining features in X_train_num_scaled: Index(['Age at Injury', 'IME-4 Count', 'Number of Dependents',
       'Industry Code_encoded_5. PPD SCH LOSS',
       'Industry Code_encoded_3. MED ONLY',
       'Industry Code_encoded_4. TEMPORARY',
       'Industry Code_encoded_1. CANCELLED', 'Industry Code_encoded_8. DEATH',
       'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Cause of Injury Code_encoded_8. DEATH',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'WCIO Cause of Injury Code_encoded_7. PTD',
       'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Nature of Injury Code_encoded_3. MED ONLY',
       'WCIO Nature of Injury Code_encoded_4. TEMPORARY',
       'WCIO Nature of Injury Code_enco

### 6.3 Combine the Filtered Datasets

In [445]:
# Combine the filtered datasets
X_train = pd.concat([X_train_cat_filtered, X_train_num_scaled_filtered], axis=1)
X_val =  pd.concat([X_val_cat_filtered, X_val_num_scaled_filtered], axis=1)
X_test =  pd.concat([X_test_cat_filtered, X_test_num_scaled_filtered], axis=1)


# Verify the shape of the combined dataset
print("Shape of combined X_train:", X_train.shape)
print("Shape of combined X_train:", X_val.shape)


Shape of combined X_train: (193852, 50)
Shape of combined X_train: (84881, 50)


### 6.4 Feature Selection All Features

#### 6.4.1 Lasso

In [446]:
from sklearn.linear_model import LassoCV
lasso = LassoCV(cv=5)  # Cross-validation to select optimal alpha
lasso.fit(X_train, y_train_encoded)
lasso_features = X_train.columns[lasso.coef_ != 0]
print("Selected features by Lasso:", lasso_features)

Selected features by Lasso: Index(['Attorney/Representative_binary', 'COVID-19 Indicator_binary',
       'Carrier Type_Self-insured Public Entity',
       'First Hearing Date_nabinary', 'C-3 Date_nabinary',
       'Average Weekly Wage_nabinary', 'Age at Injury', 'IME-4 Count',
       'Industry Code_encoded_5. PPD SCH LOSS',
       'Industry Code_encoded_3. MED ONLY',
       'Industry Code_encoded_4. TEMPORARY',
       'Industry Code_encoded_1. CANCELLED', 'Industry Code_encoded_8. DEATH',
       'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Nature of Injury Code_encoded_3. MED ONLY',
       'WCIO Nature of Injur

#### 6.4.2 Ridge

In [447]:
from sklearn.linear_model import RidgeCV
import numpy as np

# Initialize Ridge regression model with cross-validation
ridge_model = RidgeCV(cv=5)

# Fit the model on the training data
ridge_model.fit(X_train, y_train_encoded)

# Get the feature coefficients
coefficients = ridge_model.coef_

# Set a threshold to select features (e.g., absolute coefficient > 0.01)
threshold = 0.01
ridge_features = X_train.columns[np.abs(coefficients) > threshold]

print(f"Selected features using Ridge regression: {ridge_features}")


Selected features using Ridge regression: Index(['COVID-19 Indicator_binary', 'Carrier Type_nan',
       'Carrier Type_Self-insured Public Entity',
       'First Hearing Date_nabinary', 'C-3 Date_nabinary',
       'Average Weekly Wage_nabinary', 'Age at Injury', 'IME-4 Count',
       'Industry Code_encoded_5. PPD SCH LOSS',
       'Industry Code_encoded_3. MED ONLY',
       'Industry Code_encoded_4. TEMPORARY',
       'Industry Code_encoded_1. CANCELLED', 'Industry Code_encoded_8. DEATH',
       'Industry Code_encoded_6. PPD NSL', 'Industry Code_encoded_7. PTD',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Cause of Injury Code_encoded_8. DEATH',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'WCIO Cause of Injury Code_encoded_7. PTD',
       'WCIO Nature of Injury Code_encoded

#### 6.4.3 Random Forest

In [448]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=47)

# Fit the model on X_train and y_train_encoded
rf_model.fit(X_train, y_train_encoded)

# Get feature importances
importances = rf_model.feature_importances_

# Sort feature importances in descending order
important_indices = importances.argsort()[::-1]

# Print the feature importances
print("Feature importances:", importances)
print("Sorted feature indices:", important_indices)


Feature importances: [3.40106303e-02 4.79142767e-04 6.15614943e-05 7.22100892e-03
 1.52625120e-02 1.11759856e-02 6.50063325e-02 4.68472175e-02
 4.06466399e-02 2.81599897e-02 1.56135115e-02 1.69256114e-02
 1.24057944e-02 9.37291212e-03 9.28358779e-03 9.30350470e-03
 9.16291695e-03 1.97172799e-02 2.03410026e-02 2.19110914e-02
 2.21560659e-02 1.35423137e-02 1.64606135e-02 1.27586263e-02
 1.22464164e-02 1.35112783e-02 1.52768403e-02 1.27338842e-02
 7.75873101e-03 1.03733420e-02 8.47907947e-03 2.47260432e-02
 1.84067779e-02 1.88734467e-02 1.76564291e-02 1.18863586e-02
 1.54768152e-02 1.01227520e-02 1.06265222e-02 1.54788692e-02
 8.78695948e-03 1.18146677e-02 1.00902155e-02 3.39529383e-02
 3.06575493e-02 6.78751063e-03 7.12958257e-03 6.64386131e-03
 5.21828578e-02 1.50494419e-01]
Sorted feature indices: [49  6 48  7  8  0 43 44  9 31 20 19 18 17 33 32 34 11 22 10 39 36 26  4
 21 25 23 27 12 24 35 41  5 38 29 37 42 13 15 14 16 40 30 28  3 46 45 47
  1  2]


In [449]:
import numpy as np

# Calculate cumulative sum of feature importances
cumulative_importance = np.cumsum(importances[important_indices])

# Find the index where cumulative importance exceeds 90%
threshold = 0.90
index_90 = np.argmax(cumulative_importance >= threshold)

# The number of features that explain 90% of the importance
num_features_90 = index_90 + 1  # Adding 1 since index starts at 0

print(f"Number of features explaining 90% of cumulative importance: {num_features_90}")

# Select the top features based on this number
random_forest_features = X_train.columns[important_indices[:num_features_90]]
print(f"Selected features that explain 90% of cumulative importance: {random_forest_features}")


Number of features explaining 90% of cumulative importance: 37
Selected features that explain 90% of cumulative importance: Index(['Average Weekly Wage Imputed_log', 'Average Weekly Wage_nabinary',
       'Days_between_Assembly Date_Accident Date_log', 'Age at Injury',
       'IME-4 Count', 'Attorney/Representative_binary', 'Carrier Name_freq',
       'Accident Datemonth', 'Number of Dependents',
       'WCIO Part Of Body Code_encoded_5. PPD SCH LOSS',
       'WCIO Cause of Injury Code_encoded_1. CANCELLED',
       'WCIO Cause of Injury Code_encoded_4. TEMPORARY',
       'WCIO Cause of Injury Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS',
       'WCIO Part Of Body Code_encoded_4. TEMPORARY',
       'WCIO Part Of Body Code_encoded_3. MED ONLY',
       'WCIO Part Of Body Code_encoded_1. CANCELLED',
       'Industry Code_encoded_3. MED ONLY',
       'WCIO Cause of Injury Code_encoded_6. PPD NSL',
       'Industry Code_encoded_5. PPD SCH LOSS',
     

In [450]:
import pandas as pd
from xgboost import XGBClassifier

# Initialize the XGBoost Classifier without 'use_label_encoder'
xgb_model = XGBClassifier(n_estimators=100, random_state=47, eval_metric="mlogloss")

# Fit the model on X_train and y_train_encoded
xgb_model.fit(X_train, y_train_encoded)

# Get feature importances
importances = xgb_model.feature_importances_

# Create a DataFrame with feature names and their importances
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,  # Assumes X_train is a pandas DataFrame
    'Importance': importances
})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the sorted feature names and their importance
print(feature_importance_df)


                                              Feature  Importance
0                      Attorney/Representative_binary    0.182440
49                    Average Weekly Wage Imputed_log    0.106102
6                        Average Weekly Wage_nabinary    0.100935
18      WCIO Cause of Injury Code_encoded_3. MED ONLY    0.075978
31     WCIO Part Of Body Code_encoded_5. PPD SCH LOSS    0.063912
27    WCIO Nature of Injury Code_encoded_1. CANCELLED    0.048111
10              Industry Code_encoded_5. PPD SCH LOSS    0.042870
3             Carrier Type_Self-insured Public Entity    0.039795
4                         First Hearing Date_nabinary    0.037929
8                                         IME-4 Count    0.032051
25     WCIO Nature of Injury Code_encoded_3. MED ONLY    0.020884
26    WCIO Nature of Injury Code_encoded_4. TEMPORARY    0.017785
32         WCIO Part Of Body Code_encoded_3. MED ONLY    0.014614
24  WCIO Nature of Injury Code_encoded_5. PPD SCH ...    0.012353
20     WCI

In [451]:
import numpy as np
import pandas as pd

# Calculate cumulative sum of feature importances
cumulative_importance = np.cumsum(feature_importance_df['Importance'])

# Find the index where cumulative importance exceeds 90%
threshold = 0.90
index_90 = np.argmax(cumulative_importance >= threshold)

# The number of features that explain 90% of the importance
num_features_90 = index_90 + 1  # Adding 1 since index starts at 0

print(f"Number of features explaining 90% of cumulative importance: {num_features_90}")

# Select the top features based on this number
xgboost_features = feature_importance_df['Feature'].iloc[:num_features_90]
print(f"Selected features that explain 90% of cumulative importance:\n{xgboost_features}")


Number of features explaining 90% of cumulative importance: 26
Selected features that explain 90% of cumulative importance:
0                        Attorney/Representative_binary
49                      Average Weekly Wage Imputed_log
6                          Average Weekly Wage_nabinary
18        WCIO Cause of Injury Code_encoded_3. MED ONLY
31       WCIO Part Of Body Code_encoded_5. PPD SCH LOSS
27      WCIO Nature of Injury Code_encoded_1. CANCELLED
10                Industry Code_encoded_5. PPD SCH LOSS
3               Carrier Type_Self-insured Public Entity
4                           First Hearing Date_nabinary
8                                           IME-4 Count
25       WCIO Nature of Injury Code_encoded_3. MED ONLY
26      WCIO Nature of Injury Code_encoded_4. TEMPORARY
32           WCIO Part Of Body Code_encoded_3. MED ONLY
24    WCIO Nature of Injury Code_encoded_5. PPD SCH ...
20       WCIO Cause of Injury Code_encoded_1. CANCELLED
17    WCIO Cause of Injury Code_enco

#### 6.4.5 Voting for the Best Feature

In [452]:
# Convert all feature sets to sets
lasso_set = set(lasso_features)
ridge_set = set(ridge_features)
rf_set = set(random_forest_features)
xgb_set = set(xgboost_features)

# Find features selected by at least three methods
final_features_set = (
    (lasso_set & ridge_set & rf_set) |  # Features in all three: Lasso, Ridge, RF
    (lasso_set & ridge_set & xgb_set) |  # Features in Lasso, Ridge, XGBoost
    (lasso_set & rf_set & xgb_set) |     # Features in Lasso, RF, XGBoost
    (ridge_set & rf_set & xgb_set)      # Features in Ridge, RF, XGBoost
)

# Convert the final features set to a list (optional, for easier use later)
final_features = list(final_features_set)

# Print the selected features
print(len(final_features))
print("Features selected by at least three methods:")
print(final_features)


35
Features selected by at least three methods:
['Accident Datemonth', 'WCIO Part Of Body Code_freq', 'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS', 'Industry Code_encoded_5. PPD SCH LOSS', 'Industry Code_freq', 'C-3 Date_nabinary', 'WCIO Cause of Injury Code_freq', 'WCIO Cause of Injury Code_encoded_3. MED ONLY', 'WCIO Cause of Injury Code_encoded_6. PPD NSL', 'WCIO Nature of Injury Code_encoded_4. TEMPORARY', 'WCIO Part Of Body Code_encoded_3. MED ONLY', 'Carrier Name_freq', 'Average Weekly Wage Imputed_log', 'Industry Code_encoded_1. CANCELLED', 'COVID-19 Indicator_binary', 'Attorney/Representative_binary', 'WCIO Part Of Body Code_encoded_4. TEMPORARY', 'Carrier Type Imputed_freq', 'WCIO Cause of Injury Code_encoded_1. CANCELLED', 'Industry Code_encoded_4. TEMPORARY', 'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Nature of Injury Code_encoded_6. PPD NSL', 'WCIO Nature of Injury Code_encoded_3. MED ONLY', 'Carrier Type_Self-insured Public Entity', 'First Hearing D

In [453]:
import pandas as pd

# Assume the following sets represent the selected features from each method
lasso_set = set(lasso_features)
ridge_set = set(ridge_features)
rf_set = set(random_forest_features)
xgb_set = set(xgboost_features)

# Combine all unique features from all sets
all_features = list(set(lasso_set) | set(ridge_set) | set(rf_set) | set(xgb_set))

# Create a DataFrame to show the feature selection results
feature_selection_table = pd.DataFrame({
    'Feature': all_features,
    'Lasso': [feature in lasso_set for feature in all_features],
    'Ridge': [feature in ridge_set for feature in all_features],
    'Random Forest': [feature in rf_set for feature in all_features],
    'XGBoost': [feature in xgb_set for feature in all_features]
})

# Function to highlight the selected features
def highlight_selected(val):
    color = 'background-color: lightgreen' if val else ''
    return color

# Apply the styling
styled_table = feature_selection_table.style.applymap(highlight_selected, subset=['Lasso', 'Ridge', 'Random Forest', 'XGBoost'])

# Display the table with highlighted cells
styled_table


  styled_table = feature_selection_table.style.applymap(highlight_selected, subset=['Lasso', 'Ridge', 'Random Forest', 'XGBoost'])


Unnamed: 0,Feature,Lasso,Ridge,Random Forest,XGBoost
0,Accident Datemonth,True,True,True,False
1,WCIO Nature of Injury Code_encoded_8. DEATH,False,True,False,True
2,WCIO Cause of Injury Code_encoded_8. DEATH,False,True,True,False
3,Industry Code_encoded_5. PPD SCH LOSS,True,True,True,True
4,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,True,True,True,True
5,WCIO Part Of Body Code_freq,True,True,True,False
6,WCIO Nature of Injury Code_encoded_7. PTD,False,True,False,False
7,Industry Code_freq,True,True,True,False
8,C-3 Date_nabinary,True,True,True,True
9,WCIO Cause of Injury Code_freq,True,True,True,False


> Apply to datasets

In [454]:
import pandas as pd

# Assume the following sets represent the selected features from each method
lasso_set = set(lasso_features)
ridge_set = set(ridge_features)
rf_set = set(random_forest_features)
xgb_set = set(xgboost_features)

# Combine all unique features from all sets
all_features = list(set(lasso_set) | set(ridge_set) | set(rf_set) | set(xgb_set))

# Create a DataFrame to show the feature selection results
feature_selection_table = pd.DataFrame({
    'Feature': all_features,
    'Lasso': [feature in lasso_set for feature in all_features],
    'Ridge': [feature in ridge_set for feature in all_features],
    'Random Forest': [feature in rf_set for feature in all_features],
    'XGBoost': [feature in xgb_set for feature in all_features]
})

# Add a column to count how many methods selected each feature
feature_selection_table['Selected by 2 or more'] = (
    feature_selection_table[['Lasso', 'Ridge', 'Random Forest', 'XGBoost']].sum(axis=1) >= 3
)

# Function to highlight the selected features
# Function to highlight selected features for Lasso, Ridge, RF, and XGBoost
def highlight_selected(val):
    return 'background-color: #aed6f1' if val else ''

# Apply the styling with the updated 'map' method
styled_table = (
    feature_selection_table.style
    .map(highlight_selected, subset=['Lasso', 'Ridge', 'Random Forest', 'XGBoost'])
    .map(lambda val: 'background-color: #abebc6' if val else '', subset=['Selected by 2 or more'])
)

# Display the styled table
styled_table


# Display the table with highlighted cells
styled_table


Unnamed: 0,Feature,Lasso,Ridge,Random Forest,XGBoost,Selected by 2 or more
0,Accident Datemonth,True,True,True,False,True
1,WCIO Nature of Injury Code_encoded_8. DEATH,False,True,False,True,False
2,WCIO Cause of Injury Code_encoded_8. DEATH,False,True,True,False,False
3,Industry Code_encoded_5. PPD SCH LOSS,True,True,True,True,True
4,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,True,True,True,True,True
5,WCIO Part Of Body Code_freq,True,True,True,False,True
6,WCIO Nature of Injury Code_encoded_7. PTD,False,True,False,False,False
7,Industry Code_freq,True,True,True,False,True
8,C-3 Date_nabinary,True,True,True,True,True
9,WCIO Cause of Injury Code_freq,True,True,True,False,True


In [455]:
# Filter the DataFrame to get features selected by 4 methods (or any threshold)
selected_features = feature_selection_table[feature_selection_table['Selected by 2 or more']].Feature.tolist()

# Output the list
print("Features selected by 4 or more methods:")
print(selected_features)


Features selected by 4 or more methods:
['Accident Datemonth', 'Industry Code_encoded_5. PPD SCH LOSS', 'WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Part Of Body Code_freq', 'Industry Code_freq', 'C-3 Date_nabinary', 'WCIO Cause of Injury Code_freq', 'WCIO Cause of Injury Code_encoded_3. MED ONLY', 'WCIO Cause of Injury Code_encoded_6. PPD NSL', 'WCIO Nature of Injury Code_encoded_4. TEMPORARY', 'WCIO Part Of Body Code_encoded_3. MED ONLY', 'Carrier Name_freq', 'COVID-19 Indicator_binary', 'Average Weekly Wage Imputed_log', 'Industry Code_encoded_1. CANCELLED', 'Attorney/Representative_binary', 'WCIO Part Of Body Code_encoded_4. TEMPORARY', 'Carrier Type Imputed_freq', 'WCIO Cause of Injury Code_encoded_1. CANCELLED', 'Industry Code_encoded_4. TEMPORARY', 'WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS', 'WCIO Nature of Injury Code_encoded_6. PPD NSL', 'WCIO Nature of Injury Code_encoded_3. MED ONLY', 'Carrier Type_Self-insured Public Entity', 'First Hearing Date_nabi

In [456]:
X_train_filter = X_train[final_features]
X_val_filter = X_val[final_features]
X_test_filter = X_test[final_features]

### 7. Multiclass Model

In [457]:
X_val

Unnamed: 0_level_0,Attorney/Representative_binary,COVID-19 Indicator_binary,Carrier Type_nan,Carrier Type_Self-insured Public Entity,First Hearing Date_nabinary,C-3 Date_nabinary,Average Weekly Wage_nabinary,Age at Injury,IME-4 Count,Number of Dependents,Industry Code_encoded_5. PPD SCH LOSS,Industry Code_encoded_3. MED ONLY,Industry Code_encoded_4. TEMPORARY,Industry Code_encoded_1. CANCELLED,Industry Code_encoded_8. DEATH,Industry Code_encoded_6. PPD NSL,Industry Code_encoded_7. PTD,WCIO Cause of Injury Code_encoded_5. PPD SCH LOSS,WCIO Cause of Injury Code_encoded_3. MED ONLY,WCIO Cause of Injury Code_encoded_4. TEMPORARY,WCIO Cause of Injury Code_encoded_1. CANCELLED,WCIO Cause of Injury Code_encoded_8. DEATH,WCIO Cause of Injury Code_encoded_6. PPD NSL,WCIO Cause of Injury Code_encoded_7. PTD,WCIO Nature of Injury Code_encoded_5. PPD SCH LOSS,WCIO Nature of Injury Code_encoded_3. MED ONLY,WCIO Nature of Injury Code_encoded_4. TEMPORARY,WCIO Nature of Injury Code_encoded_1. CANCELLED,WCIO Nature of Injury Code_encoded_8. DEATH,WCIO Nature of Injury Code_encoded_6. PPD NSL,WCIO Nature of Injury Code_encoded_7. PTD,WCIO Part Of Body Code_encoded_5. PPD SCH LOSS,WCIO Part Of Body Code_encoded_3. MED ONLY,WCIO Part Of Body Code_encoded_4. TEMPORARY,WCIO Part Of Body Code_encoded_1. CANCELLED,WCIO Part Of Body Code_encoded_8. DEATH,WCIO Part Of Body Code_encoded_6. PPD NSL,WCIO Part Of Body Code_encoded_7. PTD,Industry Code_freq,WCIO Cause of Injury Code_freq,WCIO Nature of Injury Code_freq,WCIO Part Of Body Code_freq,Carrier Type Imputed_freq,Carrier Name_freq,Accident Datemonth,Accident Date_Season_Spring,Accident Date_Season_Summer,Accident Date_Season_Winter,Days_between_Assembly Date_Accident Date_log,Average Weekly Wage Imputed_log
Claim Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
6092487,1,0,0.0,0.0,0,0,0,0.246377,0.10,0.166667,0.325648,0.607705,0.887156,0.087878,0.311880,1.000000,0.177619,0.142540,0.829863,0.545164,0.030172,0.011075,0.500785,0.008305,0.190791,0.658129,0.569543,0.021734,0.000445,0.443558,0.002714,0.280192,0.789842,0.676447,0.031290,0.031275,0.235208,0.007241,0.319684,0.369088,1.000000,1.000000,0.185701,0.066229,0.727273,0.0,0.0,0.0,0.378197,0.645878
5748532,1,0,0.0,0.0,0,0,1,0.405797,0.00,0.666667,0.220886,0.452770,0.196958,0.183246,0.252957,0.478204,0.272116,0.001596,0.000000,0.000000,1.000000,0.000000,0.000000,0.005127,0.002344,0.000000,0.000000,1.000000,0.000000,0.000000,0.002133,0.003090,0.000000,0.000000,1.000000,0.000000,0.000000,0.017027,0.067017,0.298873,0.090889,0.212040,1.000000,0.002941,0.454545,0.0,1.0,0.0,0.187586,0.000000
5865707,1,0,0.0,0.0,0,0,1,0.144928,0.00,1.000000,0.050822,0.149968,0.166964,0.021219,0.117240,0.305365,0.201791,0.170538,0.648444,0.589521,0.021950,0.031958,0.306615,0.019172,0.324360,0.572409,0.874131,0.011247,0.004168,0.251867,0.011660,0.202964,0.607491,0.585281,0.015413,0.000000,0.031352,0.000000,0.375749,0.079749,0.149626,0.285806,1.000000,0.023387,0.909091,0.0,0.0,0.0,0.365712,0.000000
5452724,1,0,0.0,1.0,0,0,0,0.681159,0.25,0.833333,0.325648,0.607705,0.887156,0.087878,0.311880,1.000000,0.177619,0.224343,0.542353,0.452432,0.018585,0.003742,0.209599,0.008980,0.246658,0.603868,0.339123,0.017932,0.000368,0.256203,0.001870,0.280192,0.789842,0.676447,0.031290,0.031275,0.235208,0.007241,0.319684,0.682929,0.725605,1.000000,0.432260,0.074441,0.181818,1.0,0.0,0.0,0.103790,0.867564
5532305,0,0,0.0,1.0,1,1,0,0.811594,0.00,0.833333,1.000000,0.649200,0.266465,0.051450,0.185950,0.628388,0.282400,0.192797,0.579832,0.430519,0.013216,0.000000,0.183517,0.004880,0.324360,0.572409,0.874131,0.011247,0.004168,0.251867,0.011660,0.134445,0.475564,0.693979,0.008993,0.000000,0.084196,0.000000,0.809192,0.628303,0.149626,0.009919,0.432260,0.033967,0.545455,0.0,1.0,0.0,0.225217,0.771096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5459066,0,0,0.0,0.0,1,1,1,0.449275,0.00,0.666667,0.187309,0.332778,1.000000,0.289400,0.593106,0.636753,0.638027,0.171370,0.367137,0.312613,0.060649,0.012814,0.137062,0.019218,0.190791,0.658129,0.569543,0.021734,0.000445,0.443558,0.002714,0.218571,0.612923,0.499451,0.014605,0.009797,0.078458,0.000000,0.266453,0.478588,1.000000,0.176737,1.000000,0.003123,0.181818,1.0,0.0,0.0,0.301707,0.000000
5849458,0,0,0.0,0.0,1,1,1,0.594203,0.00,0.833333,1.000000,0.649200,0.266465,0.051450,0.185950,0.628388,0.282400,0.165853,0.554814,0.503899,0.019108,0.000256,0.276374,0.009200,0.190791,0.658129,0.569543,0.021734,0.000445,0.443558,0.002714,0.433780,0.587264,0.542123,0.019657,0.000000,0.062290,0.000000,0.809192,1.000000,1.000000,0.150690,0.391727,1.000000,0.818182,0.0,0.0,0.0,0.400421,0.000000
5917935,0,0,0.0,0.0,0,1,0,0.362319,0.05,0.333333,0.085646,0.254992,0.477819,0.038394,0.147631,0.670747,0.304921,0.561307,0.511765,0.380277,0.024615,0.000000,0.162627,0.002687,0.190791,0.658129,0.569543,0.021734,0.000445,0.443558,0.002714,0.389990,0.610614,0.536700,0.025559,0.000000,0.033250,0.000000,1.000000,0.570448,1.000000,0.282956,1.000000,0.155702,0.090909,0.0,0.0,1.0,0.187586,0.809847
5656444,0,1,0.0,0.0,1,1,1,0.144928,0.00,0.000000,0.085646,0.254992,0.477819,0.038394,0.147631,0.670747,0.304921,0.000000,0.344081,0.332308,0.022493,0.051847,0.025902,0.028536,0.000692,0.382417,0.404210,0.022238,0.057288,0.047463,0.011541,0.006914,0.463178,0.299182,0.019878,0.091744,0.019144,0.132347,1.000000,0.537219,0.167963,0.163504,1.000000,0.111493,0.000000,0.0,0.0,1.0,0.372084,0.000000


In [458]:
import xgboost as xgb

model = xgb.XGBClassifier(eval_metric="mlogloss")
model.fit(X_train_filter, y_train_encoded)

# 3. Make predictions
y_pred = model.predict(X_val_filter)  # Assuming X_test is available


# Evaluate the model
accuracy = accuracy_score(y_val_encoded, y_pred)
f1_macro = f1_score(y_val_encoded, y_pred, average='macro')  # Macro-averaged F1 score

# Print individual model results
print(f"XGBClassifier Accuracy: {accuracy:.4f}")
print(f"XGBClassifier Macro F1 Score: {f1_macro:.4f}")
print(classification_report(y_val_encoded, y_pred))  # Detailed report including precision, recall, and F1 score per class
print("-" * 50)

XGBClassifier Accuracy: 0.7404
XGBClassifier Macro F1 Score: 0.4779
              precision    recall  f1-score   support

           0       0.73      0.89      0.80     44551
           1       0.68      0.59      0.63     14484
           2       0.82      0.58      0.68     20672
           3       0.36      0.00      0.01      1263
           4       0.84      0.72      0.77      3741
           5       0.62      0.36      0.46       141
           6       0.00      0.00      0.00        29

    accuracy                           0.74     84881
   macro avg       0.58      0.45      0.48     84881
weighted avg       0.74      0.74      0.73     84881

--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
