<h1 style="text-align: center; color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 36px; text-shadow: 2px 2px #D1D1D1;">
    Model Optimization (MO) for Workers' Compensation Claims
</h1>
<hr style="border: 2px solid #4A90E2;">

<h2 style="text-align: center; color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 36px; text-shadow: 2px 2px #D1D1D1;">Required Imports</h2>

<hr style="border: 2px solid #4A90E2;">

<h3 style="color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 24px; text-shadow: 2px 2px #D1D1D1;">Package Descriptions</h3>
<ul style="font-family: 'Arial', sans-serif;">
    <li><strong>pandas</strong>: For data manipulation and analysis, enabling easy reading and handling of dataframes.</li>
    <li><strong>numpy</strong>: For efficient numerical operations and array manipulation.</li>
    <li><strong>matplotlib.pyplot</strong>: To create data visualizations and plots.</li>
    <li><strong>seaborn</strong>: For generating attractive and informative statistical visualizations.</li>
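    <li><strong>missingno</strong>: For visualizing and analyzing missing data, helping to better understand data quality.</li>
    <li><strong>scikit-learn</strong> (<code>train_test_split</code>): For splitting the data into training and validation sets.</li>
    <li><strong>imbalanced-learn</strong> (<code>SMOTE</code>): For oversampling the training data to address class imbalance.</li>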
</ul>


In [None]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore
import missingno as msng # type: ignore
import sys # type: ignore
import os # type: ignore

from scipy import stats # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from imblearn.over_sampling import SMOTE # type: ignore


sys.path.append(os.path.abspath("../utils"))
from meta_model_train import meta_model_rf, meta_model, meta_model_xgbc, meta_model_et
from neural_network import neural_network
from plots import plot_training_history, plot_confusion_matrix
from predicitons_csv import save_predictions_to_csv
from predictions_fun import load_predictions

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

<h2 style="text-align: center; color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 36px; text-shadow: 1px 1px #D1D1D1;">
    Data Loading
</h2>
<hr style="border: 1px solid #4A90E2;">

<p style="font-size: 18px; line-height: 1.6; font-family: 'Arial', sans-serif;">
    This section handles loading the dataset into the environment for further processing. Using <strong>pandas</strong>, we load the data into a structured dataframe, allowing for easy manipulation, exploration, and analysis throughout the project.
</p>


In [None]:
# Directory holding the post-feature-selection CSV exports.
path = "../data/"

# Read the train and test tables produced by the feature-selection step.
data_train_FS = pd.read_csv(os.path.join(path, "data_train_post_FS.csv"))
data_test_FS = pd.read_csv(os.path.join(path, "data_test_post_FS.csv"))

# Stack both tables under an outer index level ('train' / 'test') so each
# row keeps track of which file it came from.
combined_data = pd.concat(
    [data_train_FS, data_test_FS],
    keys=['train', 'test'],
)

<h2 style="text-align: center; color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 36px; text-shadow: 1px 1px #D1D1D1;">
    Model Selection
</h2>
<hr style="border: 1px solid #4A90E2;">

<p style="font-size: 18px; line-height: 1.6; font-family: 'Arial', sans-serif;">
    This section focuses on selecting the best-performing models for predicting the outcomes of workers' compensation claims. Various machine learning algorithms are evaluated based on their accuracy, interpretability, and suitability for the dataset, ensuring an optimal balance between predictive performance and computational efficiency.
</p>


In [None]:
# Separate the predictors from the target; the claim identifier is dropped
# together with the target so it is not used as a feature.
x_FS = data_train_FS.drop(['Claim Identifier', 'Claim Injury Type'], axis=1)
y_FS = data_train_FS['Claim Injury Type']

# Hold out 25% of the rows for validation. The seed is fixed so that the split
# (and everything derived from it: the SMOTE resampling and any cached
# out-of-fold predictions paired with the resampled target) is identical on
# every Restart & Run All.
X_train_FS, X_val_FS, y_train_FS, y_val_FS = train_test_split(
    x_FS, y_FS, test_size=0.25, random_state=42
)

In [None]:
# Oversample with SMOTE using the training split only; the validation split
# (X_val_FS, y_val_FS) is not passed to the sampler. random_state=42 fixes the
# sampler's seed.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_FS, y_train_FS)

In [None]:
def meta_model_run(model, rapid=True):
    """Return the out-of-fold and test predictions of one base model.

    Parameters
    ----------
    model : str
        Name of the base model: "CatBoost", "RandomForest", "XGBoost" or
        "ExtraTree".
    rapid : bool, default True
        If True, load previously saved prediction arrays from
        ``../predictions``. If False, retrain the base model through the
        matching ``meta_model*`` helper using the notebook-level
        ``X_train_resampled`` / ``y_train_resampled`` and the feature columns
        of ``data_test_FS``.

    Returns
    -------
    tuple
        ``(oof_predictions, test_predictions)``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. (Previously a pair of
        empty strings was returned, which only failed later and obscurely.)
    """
    # File-name suffix used by each model's cached prediction arrays.
    cache_suffixes = {
        "CatBoost": "",
        "RandomForest": "_rf",
        "XGBoost": "_xgbc",
        "ExtraTree": "_et",
    }

    # Validate first, before any file access or training is attempted.
    if model not in cache_suffixes:
        supported = ", ".join(cache_suffixes)
        raise ValueError(f"Unknown model {model!r}; expected one of: {supported}")

    if rapid:
        suffix = cache_suffixes[model]
        oof_predictions = load_predictions(f"../predictions/oof_predictions{suffix}.npy")
        test_predictions = load_predictions(f"../predictions/test_predictions{suffix}.npy")
        return oof_predictions, test_predictions

    # Training helper for each supported model name.
    trainers = {
        "CatBoost": meta_model,
        "RandomForest": meta_model_rf,
        "XGBoost": meta_model_xgbc,
        "ExtraTree": meta_model_et,
    }

    # The helpers receive the test features without the identifier/target columns.
    test_features = data_test_FS.drop(columns=['Claim Identifier', 'Claim Injury Type'])

    # The helpers also return the fitted models and F1 scores, which are not
    # needed here. The trailing 4 is presumably the number of folds — TODO confirm
    # against the helpers in meta_model_train.
    _, _, oof_predictions, test_predictions = trainers[model](
        X_train_resampled, y_train_resampled, test_features, 4
    )
    return oof_predictions, test_predictions

<h3 style="color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 28px; text-shadow: 2px 2px #D1D1D1;">CatBoost Classifier</h3>

In [None]:
# rapid=True: load the cached CatBoost out-of-fold/test predictions instead of retraining.
# NOTE(review): these arrays are later paired with y_train_resampled — confirm they were
# generated from the same training split and SMOTE resampling.
oof_predictions, test_predictions = meta_model_run("CatBoost", rapid=True)

In [None]:
# Build the second-stage network from the CatBoost out-of-fold predictions and the
# resampled target; neural_network also hands back the train/validation arrays and
# the two callbacks passed to fit() below.
# NOTE(review): the unpack order lists y_nn_val before y_nn_train — confirm it matches
# the order in which neural_network returns them.
nn_model, X_nn_train, X_nn_val, y_nn_val, y_nn_train, early_stopping, reduce_lr = neural_network(oof_predictions, y_train_resampled)

# Fit for at most 10 epochs in batches of 64, monitoring the validation arrays.
history = nn_model.fit(
    X_nn_train, y_nn_train,
    validation_data=(X_nn_val, y_nn_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

In [None]:
# Visualise the history object returned by nn_model.fit for the CatBoost-based network.
plot_training_history(history)

In [None]:
# Confusion matrix of the CatBoost-based network on its validation arrays.
# class_mapping translates integer class indices into claim-injury labels
# (presumably the same encoding applied to the target upstream — confirm).
# The title is Portuguese for "Confusion Matrix - Validation Set".
plot_confusion_matrix(model=nn_model, X_val=X_nn_val, y_val=y_nn_val,
    class_mapping={
        0: '2. NON-COMP',
        1: '4. TEMPORARY',
        2: '3. MED ONLY',
        3: '5. PPD SCH LOSS',
        4: '6. PPD NSL',
        5: '1. CANCELLED',
        6: '8. DEATH',
        7: '7. PTD'
    }, title="Matriz de Confusão - Conjunto de Validação"
)

In [None]:
# Export the network's predictions for the CatBoost test meta-features, identified by
# the test set's Claim Identifier column, to output_path. The file is read back in the
# following cell, where it is expected to contain a 'Claim Injury Type' column.
save_predictions_to_csv(
    model=nn_model,
    test_data=test_predictions,
    claim_ids=data_test_FS["Claim Identifier"],
    class_mapping={
        0: '2. NON-COMP',
        1: '4. TEMPORARY',
        2: '3. MED ONLY',
        3: '5. PPD SCH LOSS',
        4: '6. PPD NSL',
        5: '1. CANCELLED',
        6: '8. DEATH',
        7: '7. PTD'
    },
    output_path="../predictions/group_40_KFold_CatBoost_NN_predictions.csv"
)


In [None]:
# Reload the exported CatBoost + NN predictions and display how many claims
# were assigned to each 'Claim Injury Type' value.
predicitons_data = pd.read_csv('../predictions/group_40_KFold_CatBoost_NN_predictions.csv')
values = predicitons_data['Claim Injury Type'].value_counts()
values

<h3 style="color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 28px; text-shadow: 2px 2px #D1D1D1;">Random Forest Classifier</h3>

In [None]:
# rapid=False: retrain the Random Forest base model rather than loading cached .npy predictions.
oof_predictions_rf, test_predictions_rf = meta_model_run("RandomForest",rapid=False)

In [None]:
# Build the second-stage network from the Random Forest out-of-fold predictions and
# the resampled target. This rebinds nn_model, the train/validation arrays and the
# callbacks used by earlier sections.
# NOTE(review): the unpack order lists y_nn_val before y_nn_train — confirm it matches
# the order in which neural_network returns them.
nn_model, X_nn_train, X_nn_val, y_nn_val, y_nn_train, early_stopping, reduce_lr = neural_network(oof_predictions_rf, y_train_resampled)

# Fit for at most 10 epochs in batches of 64, monitoring the validation arrays.
history = nn_model.fit(
    X_nn_train, y_nn_train,
    validation_data=(X_nn_val, y_nn_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

In [None]:
# Visualise the history object returned by nn_model.fit for the Random Forest-based network.
plot_training_history(history)

In [None]:
# Confusion matrix of the Random Forest-based network on its validation arrays.
# class_mapping translates integer class indices into claim-injury labels
# (presumably the same encoding applied to the target upstream — confirm).
# The title is Portuguese for "Confusion Matrix - Validation Set".
plot_confusion_matrix(model=nn_model, X_val=X_nn_val, y_val=y_nn_val,
    class_mapping={
        0: '2. NON-COMP',
        1: '4. TEMPORARY',
        2: '3. MED ONLY',
        3: '5. PPD SCH LOSS',
        4: '6. PPD NSL',
        5: '1. CANCELLED',
        6: '8. DEATH',
        7: '7. PTD'
    }, title="Matriz de Confusão - Conjunto de Validação"
)

In [None]:
# Export the network's predictions for the Random Forest test meta-features, identified
# by the test set's Claim Identifier column, to output_path. The file is read back in the
# following cell, where it is expected to contain a 'Claim Injury Type' column.
save_predictions_to_csv(
    model=nn_model,
    test_data=test_predictions_rf,
    claim_ids=data_test_FS["Claim Identifier"],
    class_mapping={
        0: '2. NON-COMP',
        1: '4. TEMPORARY',
        2: '3. MED ONLY',
        3: '5. PPD SCH LOSS',
        4: '6. PPD NSL',
        5: '1. CANCELLED',
        6: '8. DEATH',
        7: '7. PTD'
    },
    output_path="../predictions/group_40_KFold_RF_NN_predictions.csv"
)


In [None]:
# Reload the exported Random Forest + NN predictions and display how many claims
# were assigned to each 'Claim Injury Type' value.
predicitons_data = pd.read_csv('../predictions/group_40_KFold_RF_NN_predictions.csv')
values = predicitons_data['Claim Injury Type'].value_counts()
values

<h3 style="color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 28px; text-shadow: 2px 2px #D1D1D1;">XGBoost Classifier</h3>

In [None]:
# rapid=False: retrain the XGBoost base model rather than loading cached .npy predictions.
oof_predictions_xgbc, test_predictions_xgbc = meta_model_run("XGBoost",rapid=False)

In [None]:
# Build the second-stage network from the XGBoost out-of-fold predictions and the
# resampled target. This rebinds nn_model, the train/validation arrays and the
# callbacks used by earlier sections.
# NOTE(review): the unpack order lists y_nn_val before y_nn_train — confirm it matches
# the order in which neural_network returns them.
nn_model, X_nn_train, X_nn_val, y_nn_val, y_nn_train, early_stopping, reduce_lr = neural_network(oof_predictions_xgbc, y_train_resampled)

# Fit for at most 10 epochs in batches of 64, monitoring the validation arrays.
history = nn_model.fit(
    X_nn_train, y_nn_train,
    validation_data=(X_nn_val, y_nn_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

In [None]:
# Visualise the history object returned by nn_model.fit for the XGBoost-based network.
plot_training_history(history)

In [None]:
# Confusion matrix of the XGBoost-based network on its validation arrays.
# class_mapping translates integer class indices into claim-injury labels
# (presumably the same encoding applied to the target upstream — confirm).
# The title is Portuguese for "Confusion Matrix - Validation Set".
plot_confusion_matrix(model=nn_model, X_val=X_nn_val, y_val=y_nn_val,
    class_mapping={
        0: '2. NON-COMP',
        1: '4. TEMPORARY',
        2: '3. MED ONLY',
        3: '5. PPD SCH LOSS',
        4: '6. PPD NSL',
        5: '1. CANCELLED',
        6: '8. DEATH',
        7: '7. PTD'
    }, title="Matriz de Confusão - Conjunto de Validação"
)

In [None]:
# Export the network's predictions for the XGBoost test meta-features, identified by
# the test set's Claim Identifier column, to output_path. The file is read back in the
# following cell, where it is expected to contain a 'Claim Injury Type' column.
save_predictions_to_csv(
    model=nn_model,
    test_data=test_predictions_xgbc,
    claim_ids=data_test_FS["Claim Identifier"],
    class_mapping={
        0: '2. NON-COMP',
        1: '4. TEMPORARY',
        2: '3. MED ONLY',
        3: '5. PPD SCH LOSS',
        4: '6. PPD NSL',
        5: '1. CANCELLED',
        6: '8. DEATH',
        7: '7. PTD'
    },
    output_path="../predictions/group_40_KFold_XGBC_NN_predictions.csv"
)


In [None]:
# Reload the exported XGBoost + NN predictions and display how many claims
# were assigned to each 'Claim Injury Type' value.
predicitons_data = pd.read_csv('../predictions/group_40_KFold_XGBC_NN_predictions.csv')
values = predicitons_data['Claim Injury Type'].value_counts()
values

<h3 style="color: #4A90E2; font-family: 'Arial', sans-serif; font-size: 28px; text-shadow: 2px 2px #D1D1D1;">Extra Tree Classifier</h3>

In [None]:
# rapid=False: retrain the ExtraTree base model rather than loading cached .npy predictions.
oof_predictions_et, test_predictions_et = meta_model_run("ExtraTree",rapid=False)

In [None]:
# Build the second-stage network from the ExtraTree out-of-fold predictions and the
# resampled target. This rebinds nn_model, the train/validation arrays and the
# callbacks used by earlier sections.
# NOTE(review): the unpack order lists y_nn_val before y_nn_train — confirm it matches
# the order in which neural_network returns them.
nn_model, X_nn_train, X_nn_val, y_nn_val, y_nn_train, early_stopping, reduce_lr = neural_network(oof_predictions_et, y_train_resampled)

# Fit for at most 10 epochs in batches of 64, monitoring the validation arrays.
history = nn_model.fit(
    X_nn_train, y_nn_train,
    validation_data=(X_nn_val, y_nn_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

In [None]:
# Visualise the history object returned by nn_model.fit for the ExtraTree-based network.
plot_training_history(history)

In [None]:
# Confusion matrix of the ExtraTree-based network on its validation arrays.
# class_mapping translates integer class indices into claim-injury labels
# (presumably the same encoding applied to the target upstream — confirm).
# The title is Portuguese for "Confusion Matrix - Validation Set".
plot_confusion_matrix(model=nn_model, X_val=X_nn_val, y_val=y_nn_val,
    class_mapping={
        0: '2. NON-COMP',
        1: '4. TEMPORARY',
        2: '3. MED ONLY',
        3: '5. PPD SCH LOSS',
        4: '6. PPD NSL',
        5: '1. CANCELLED',
        6: '8. DEATH',
        7: '7. PTD'
    }, title="Matriz de Confusão - Conjunto de Validação"
)

In [None]:
# Export the network's predictions for the ExtraTree test meta-features, identified by
# the test set's Claim Identifier column, to output_path. The file is read back in the
# following cell, where it is expected to contain a 'Claim Injury Type' column.
save_predictions_to_csv(
    model=nn_model,
    test_data=test_predictions_et,
    claim_ids=data_test_FS["Claim Identifier"],
    class_mapping={
        0: '2. NON-COMP',
        1: '4. TEMPORARY',
        2: '3. MED ONLY',
        3: '5. PPD SCH LOSS',
        4: '6. PPD NSL',
        5: '1. CANCELLED',
        6: '8. DEATH',
        7: '7. PTD'
    },
    output_path="../predictions/group_40_KFold_ExtraTree_NN_predictions.csv"
)


In [None]:
# Reload the exported ExtraTree + NN predictions and display how many claims
# were assigned to each 'Claim Injury Type' value.
predicitons_data = pd.read_csv('../predictions/group_40_KFold_ExtraTree_NN_predictions.csv')
values = predicitons_data['Claim Injury Type'].value_counts()
values