## 0. Imports and dataset

In [1]:
import pandas as pd
import numpy as np

from category_encoders import TargetEncoder

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno # for missing values
import sys
sys.path.append('../helper_functions')

# data partition
from sklearn.model_selection import train_test_split

# Import functions that are stored in the helper_functions directory. We do this to keep the notebook clean and easy to read
from helper_functions import *

from sklearn.pipeline import Pipeline

# Import custom_transformer for Incoherences
from incoherences_custom_transformers import (
    IncoCarrierType, 
    IncoWCIOBodyCode, 
    IncoZeroAWW, 
    IncoZeroBirthYEAR, 
    IncoZeroAgeAtInjury, 
    IncoFilterAgeAtInjury, 
    IncoDependents, 
    IncoCorrectAge, 
    IncoSwapAccidentDate, 
    IncoCovidIndicator
)

In [2]:
WCB_original = pd.read_csv('../project_data/train_data.csv', delimiter=',',dtype={'Zip Code': str})
X_test = pd.read_csv('../project_data/test_data.csv', delimiter=',',dtype={'Zip Code': str})

# show all columns
pd.set_option('display.max_columns', None)

WCB_original.head()

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,Carrier Type,Claim Identifier,Claim Injury Type,County of Injury,COVID-19 Indicator,District Name,First Hearing Date,Gender,IME-4 Count,Industry Code,Industry Code Description,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
0,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,1A. PRIVATE,5393875,2. NON-COMP,ST. LAWRENCE,N,SYRACUSE,,M,,44.0,RETAIL TRADE,I,,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
1,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,1A. PRIVATE,5393091,4. TEMPORARY,WYOMING,N,ROCHESTER,2020-02-21,F,4.0,23.0,CONSTRUCTION,I,,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
2,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,1A. PRIVATE,5393889,4. TEMPORARY,ORANGE,N,ALBANY,,M,,56.0,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMEN...,II,,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
3,,,,2020-01-01,,,,,,,,957648180,,,,,,,,,,,,,,,,,,,,,
4,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,2A. SIF,5393887,2. NON-COMP,DUTCHESS,N,ALBANY,,M,,62.0,HEALTH CARE AND SOCIAL ASSISTANCE,II,,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0


## 1. Initial preprocessing


> **1.** Made a copy from the original WCB </br>
> **2.** Drop duplicated Claim Identifier from WCB, no duplicate in X_Test.</br>
> **3.** Set Claim Identifier as Index.</br>
> **4.** Drop all the rows that have null value in Claim Injury Type.</br>
> **5.** Drop OIICS Nature of Injury Description from WCB and X_test.</br>
> **6.** Convert WCB and X_test to datetime.</br>
> **7.** Drop rows in WCB that have duplicates in all columns (<em> Do not remove from X_test</em>).</br>
> **8.** Check for duplicates, excluding 1 column at a time.</br>
> **9.** Drop WCB Decision as it only has unique value (<em>Doesn't exist in X_test</em>).</br>
> **10.** Set unknown values to missing.

In [3]:
# (1) Creating WCB from the original --------------------------------------------
WCB = WCB_original.copy()

# (2) Drop duplicate of Claim Id -------------------------------------------------
WCB = WCB[~WCB['Claim Identifier'].duplicated(keep=False)] #ALTERACAO: mais geral

# (3) Set Claim Indentifier as Index --------------------------------------------
WCB.set_index('Claim Identifier', inplace=True)
X_test.set_index('Claim Identifier', inplace=True)

# (4) Drop null values from Claim Injury Type -----------------------------------
WCB = WCB.dropna(subset=['Claim Injury Type']) #this drops 19445 rows

# (5) Drop column OIICS Nature of Injury Description ----------------------------
WCB = WCB.drop(columns=['OIICS Nature of Injury Description'])
X_test =  X_test.drop(columns=['OIICS Nature of Injury Description'])

# (6) Convert to datetime -------------------------------------------------------
date_columns = ['Accident Date', 'Assembly Date','C-2 Date', 'C-3 Date', 'First Hearing Date']
# Convert columns to datetime
for column in date_columns:
    WCB[column] = pd.to_datetime(WCB[column], format='%Y-%m-%d', errors='coerce')
    X_test[column] = pd.to_datetime(X_test[column], format='%Y-%m-%d', errors='coerce')

# (7) Drop duplicate on all columns ----------------------------------------------
WCB = WCB.drop_duplicates(keep='first')

# (8) Iterate throguh columns, and look for duplicates, excluding 1 column at a time
for col in WCB.columns:
    # Define the subset of columns to check in this iteration (excluding 'col')
    cols_to_check_now = [c for c in  WCB.columns if c != col]

    # Identify duplicates based on these columns
    duplicates = WCB[WCB.duplicated(subset=cols_to_check_now, keep=False)]

    #Drop duplicates, keeping the first occurrence in each subset where one column can differ
    WCB = WCB.drop_duplicates(subset=cols_to_check_now, keep='first')

# (9) Drop WCB Decision as it only has 1 value
WCB = WCB.drop(columns=['WCB Decision'])

# (10) Setting unknown values to nan -------------------------------------------------
unknown_values = {'Alternative Dispute Resolution': 'U',   'Carrier Type': 'UNKNOWN', 'County of Injury': 'UNKNOWN',
    'Gender': 'U','Medical Fee Region': 'UK'}

WCB.replace(unknown_values, np.nan, inplace=True)
X_test.replace(unknown_values, np.nan, inplace=True) # ALTERACAO: adicionei 

In [4]:
wcb = WCB.copy()
test =X_test.copy()

## 2. Incoherences

> **1.** For rows with Carrier Name 'SPECIAL FUNDS SEC 25-A', change Carrier Type to '5A. SPECIAL FUND - CONS. COMM. (SECT. 25-A)'.</br>
> **2.** Change WCIO Part Of Body Code -9 to 90, and associate the description of code 90 to Multiple parts of body (WCB and X_test).</br>
> **3.** Replace AWW==0 with NaN</br>
> **4.** Replace Birth Year==0 with NaN</br>
> **5.** Change the number of dependents of people with age under 16 to zero</br>
> **6.** Age at injury zero to NaN, and drop rows under 14.</br>
> **7.** Age at Injury -1 if is one year above than should be.</br>
> **8.** Invalid entries are the ones where Accident date before Assembly date</br>
> **9.** Turn Covid-19 indicator do 'N' if before 2020-03-01

In [5]:

incoherences_pipeline = Pipeline([
    ('update_carrier_type', IncoCarrierType()),
    ('update_wcio_body_code', IncoWCIOBodyCode()),
    # ('replace_aww_zero_nan', IncoZeroAWW()),
    ('replace_birth_year_zero_nan', IncoZeroBirthYEAR()),
    ('replace_age_zero_nan', IncoZeroAgeAtInjury()),
    ('filter_age_14', IncoFilterAgeAtInjury()),
    ('update_dependents', IncoDependents()),
    ('compare_age_with_accident_and_birth', IncoCorrectAge()),
    ('swap_accident_date', IncoSwapAccidentDate()), 
    ('update_covid_indicator', IncoCovidIndicator())
])

# Apply the pipeline to the training and test data
WCB = incoherences_pipeline.fit_transform(WCB)
X_test = incoherences_pipeline.transform(X_test)


In [7]:
data = WCB.copy()

In [12]:
import h2o
from h2o.automl import H2OAutoML

In [18]:

# Initialize H2O
h2o.init()

# Convert your Pandas DataFrame to an H2O Frame
h2o_data = h2o.H2OFrame(data)

# Target and features
target = "Claim Injury Type"
features = [col for col in h2o_data.columns if col != target]

# Convert the target column to a categorical type in H2O
h2o_data[target] = h2o_data[target].asfactor()

# Proceed with the rest of the code
train, test = h2o_data.split_frame(ratios=[0.8], seed=123)

aml = H2OAutoML(max_runtime_secs=3600, seed=123)
aml.train(x=features, y=target, training_frame=train)

# Leaderboard
lb = aml.leaderboard
print(lb)

# Best model
best_model = aml.leader
performance = best_model.model_performance(test)
print(performance)

Checking whether there is an H2O instance running at http://localhost:54321.

.... not found.
Attempting to start a local H2O server...


H2OStartupError: Cannot find Java. Please install the latest JRE from
http://docs.h2o.ai/h2o/latest-stable/h2o-docs/welcome.html#java-requirements

In [16]:

# Define target and features
target = "Claim Injury Type"
features = [col for col in data.columns if col != target]

# Convert target to categorical (if needed)
data[target] = data[target].asfactor()

# Split data
train, test = data.split_frame(ratios=[0.8], seed=123)

# Run AutoML
aml = H2OAutoML(max_runtime_secs=3600, seed=123)
aml.train(x=features, y=target, training_frame=train)

# Leaderboard
lb = aml.leaderboard
print(lb)

# Best model
best_model = aml.leader
performance = best_model.model_performance(test)
print(performance)

AttributeError: 'Series' object has no attribute 'asfactor'