In [1]:
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder

warnings.filterwarnings('ignore')

In [2]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv'))

In [3]:
def process_dates(df, date_columns=None):
    """Parse date columns and derive calendar features from them.

    For every requested column that exists in ``df``, the column is parsed
    with ``pd.to_datetime(..., errors='coerce')`` (unparseable values become
    ``NaT``) and four derived columns are added: ``<col>_Year``,
    ``<col>_Month``, ``<col>_Day`` and ``<col>_DayOfWeek`` (Monday is 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the work is done on a copy so the
        cell that calls this function can be re-run safely.
    date_columns : iterable of str, optional
        Names of the columns to process. Defaults to the five date columns
        used by this notebook. Names missing from ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with parsed date columns and the derived features.
    """
    if date_columns is None:
        date_columns = ['Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date']

    # Copy first: mutating the caller's frame makes notebook cells order-dependent.
    result = df.copy()

    for col in date_columns:
        if col not in result.columns:
            continue

        parsed = pd.to_datetime(result[col], errors='coerce')
        result[col] = parsed
        result[f'{col}_Year'] = parsed.dt.year
        result[f'{col}_Month'] = parsed.dt.month
        result[f'{col}_Day'] = parsed.dt.day
        result[f'{col}_DayOfWeek'] = parsed.dt.dayofweek

    return result

In [4]:
def convert_to_string(df, categorical_columns):
    """Cast categorical columns to strings while keeping missing values missing.

    A plain ``astype(str)`` turns ``NaN`` into the literal string ``'nan'``,
    which hides missing entries from any later imputation step. Here the
    non-missing values are converted to ``str`` and missing entries are left
    as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    categorical_columns : iterable of str
        Columns to convert. Names that are not in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose listed columns hold strings (or missing values).
    """
    result = df.copy()
    for col in categorical_columns:
        if col in result.columns:
            values = result[col]
            # `where` keeps the string where the original was present and
            # restores a missing value everywhere else.
            result[col] = values.astype(str).where(values.notna())
    return result

In [5]:
# Run the date processing on the training and the test frame alike.
train, test = (process_dates(frame) for frame in (train, test))

In [6]:
# Feature name lists consumed by the preprocessing and modelling cells.

# Date features: one entry per (source date column, calendar part), in that order.
date_source_columns = ('Accident Date', 'Assembly Date', 'C-2 Date', 'C-3 Date', 'First Hearing Date')
date_parts = ('Year', 'Month', 'Day', 'DayOfWeek')
date_features = [
    f'{source}_{part}'
    for source in date_source_columns
    for part in date_parts
]

numeric_features = [
    'Age at Injury',
    'Average Weekly Wage',
    'Birth Year',
    'IME-4 Count',
    'Number of Dependents',
]

categorical_features = [
    'Alternative Dispute Resolution',
    'Attorney/Representative',
    'Carrier Type',
    'County of Injury',
    'COVID-19 Indicator',
    'District Name',
    'Gender',
    'Industry Code',
    'Industry Code Description',
    'Medical Fee Region',
    'OIICS Nature of Injury Description',
    'WCIO Cause of Injury Code',
    'WCIO Cause of Injury Description',
    'WCIO Nature of Injury Code',
    'WCIO Nature of Injury Description',
    'WCIO Part Of Body Code',
    'WCIO Part Of Body Description',
]

In [7]:
# Cast the categorical feature columns of both frames to strings.
train, test = (convert_to_string(frame, categorical_features) for frame in (train, test))

In [8]:
# Full model input: date-derived, numeric and categorical columns, in that order.
features = [*date_features, *numeric_features, *categorical_features]

In [9]:
# Encode the target labels as integers; `le` is kept so predictions can be
# mapped back to the original labels afterwards.
target = train['WCB Decision']
le = LabelEncoder()
y = le.fit_transform(target)

In [25]:
# Stratified 60/40 hold-out split for validation.
# Select the model inputs explicitly (the same `train[features]` selection used
# for the full-data fit and for test-time prediction) instead of "everything
# except the target", so no non-feature column of `train` is handed to the model.
X = train[features]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=0,
    shuffle=True,
    stratify=y,  # keep the class proportions equal in both parts
)

In [26]:
# Create preprocessing pipelines

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the token 'missing', then map each
# category to an integer code; categories unseen during fit are encoded as -1.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine preprocessors.
# The date-derived columns need their own entry: with remainder='drop', any
# column not listed in a transformer is discarded, which previously threw away
# every engineered date feature. They are already numeric, so they are passed
# through unchanged; NaN values (from unparseable or absent dates) are left as
# they are, since XGBoost accepts NaN as its missing-value marker by default.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('date', 'passthrough', date_features)
    ],
    remainder='drop'
)

In [27]:
# Hyper-parameters of the gradient-boosted classifier, gathered in one place.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1,
)

# End-to-end estimator: preprocessing followed by the XGBoost classifier.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])

In [28]:
# Train the pipeline on every row of the training frame.
X_full = train[features]
model.fit(X_full, y)

In [29]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit a fresh copy of ``model`` on the training split and score it on the test split.

    The estimator passed in is cloned before fitting, so the caller's object
    is left exactly as it was. Fitting it directly would replace whatever it
    had learned before (for example a fit on the complete training set) with
    a fit on ``X_train`` only.

    Parameters
    ----------
    model : scikit-learn estimator
        Estimator (or pipeline) to evaluate; only an unfitted clone is trained.
    X_train, y_train
        Features and labels used to fit the clone.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    dict
        ``{'accuracy': ..., 'f1_macro': ...}`` computed on the held-out data.
        The classification report and both metrics are also printed.
    """
    # clone() copies the hyper-parameters but none of the fitted state.
    candidate = clone(model)
    candidate.fit(X_train, y_train)
    y_test_pred = candidate.predict(X_test)

    test_metrics = {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'f1_macro': f1_score(y_test, y_test_pred, average='macro')
    }

    print("\nTest Results:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))
    print("\nMetrics:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {value:.4f}")

    return test_metrics

# Hold-out evaluation: train on the 60% split, score on the remaining 40%.
# NOTE(review): the recorded output below shows accuracy and macro-F1 of 1.0000.
# A perfect score on ~237k rows is unusual — check for target leakage or a
# trivially separable target before relying on this number.
test_metrics = evaluate_model(model, X_train, y_train, X_val, y_val)


Test Results:

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    229611
           1       1.00      1.00      1.00      7778

    accuracy                           1.00    237389
   macro avg       1.00      1.00      1.00    237389
weighted avg       1.00      1.00      1.00    237389


Metrics:
accuracy: 1.0000
f1_macro: 1.0000


In [31]:
# Predict on the test frame, map the integer classes back to the original
# labels, and write one label per line.
encoded_test_predictions = model.predict(test[features])
test_predictions = le.inverse_transform(encoded_test_predictions)
np.savetxt('test_WCB_Decision.csv', test_predictions, fmt='%s', delimiter=',')