In [114]:
# Remove low-count categories
df = df.loc[(df['MemberCity'] != 'None') &
            (df['MemberCity'] != 'Unknown') &
            (df['MemberCity'] != 'unk') &
            (df['MemberCity'] != 'None') &
            (df['MemberCity'] != 'Unknown')]

In [115]:
# Remove low-count categories
df = df.loc[(df['MemberProvince'] != 'None') &
            (df['MemberProvince'] != 'Unknown') &
            (df['MemberProvince'] != 'unk') &
            (df['MemberProvince'] != 'None') &
            (df['MemberProvince'] != 'Unknown')]

In [116]:
# Remove low-count categories
df = df.loc[(df['ClaimantAge'] != 'None') &
            (df['ClaimantAge'] != 'Unknown') &
            (df['ClaimantAge'] != 'unk') &
            (df['ClaimantAge'] != 'None') &
            (df['ClaimantAge'] != 'Unknown')]

## <a id='toc2_2_'></a>[Describe Data](#toc0_)

### <a id='toc1_1_2_'></a>[Load Dataset](#toc0_)

In [117]:
def load_dataset(config_data: dict) -> pd.DataFrame:
    # Load set of data
    x_train = util.pickle_load(config_data["train_set_path"][0])
    y_train = util.pickle_load(config_data["train_set_path"][1])

    x_valid = util.pickle_load(config_data["valid_set_path"][0])
    y_valid = util.pickle_load(config_data["valid_set_path"][1])

    x_test = util.pickle_load(config_data["test_set_path"][0])
    y_test = util.pickle_load(config_data["test_set_path"][1])

    # concatenate x and y each set
    train_set = pd.concat([x_train, y_train], axis = 1)
    valid_set = pd.concat([x_valid, y_valid], axis = 1)
    test_set = pd.concat([x_test, y_test], axis = 1)

    # return 3 set of data
    return train_set, valid_set, test_set

In [118]:
train_set, valid_set, test_set = load_dataset(config_data)

In [119]:
train_set, valid_set, test_set = train_set.reset_index(drop=True),  valid_set.reset_index(drop=True),test_set.reset_index(drop=True)

In [120]:
# DATA PREPROCESSING


# Divide the dataframe into independent and dependent variables
X = df.drop(columns=["Label1", "Label2"], axis=1)
y = df[["Label1", "Label2"]].fillna(0)



In [121]:
# Split the dataset into train and test sets
X_train, y_train, X_test, y_test = iterative_train_test_split(X.values, y.values, test_size = 0.2)

In [122]:
# Extract columns for X and y variables
X_cols = df.drop(columns=["Label1", "Label2"]).columns
y_cols = df[["Label1", "Label2"]].columns

In [123]:
# Function to convert X variable array to data frame
def arr_to_dfx(arr):
    return (pd.DataFrame(data=arr[0:,0:], index=[i for i in range(arr.shape[0])], columns=[X_cols[i] for i in range(len(X_cols))]))

In [124]:
# Function to convert y variable array to data frame
def arr_to_dfy(arr):
    return (pd.DataFrame(data=arr[0:,0:], index=[i for i in range(arr.shape[0])], columns=[y_cols[i] for i in range(len(y_cols))]))

In [125]:
# Convert arrays to data frames for each variable
X_train = arr_to_dfx(X_train)
y_train = arr_to_dfy(y_train)

X_test = arr_to_dfx(X_test)
y_test = arr_to_dfy(y_test)

### One-hot encoding

Apply one-hot encoder on categorical features. The parameter handle_unknown="ignore" ensures that the model is robust during production as it ignores labels not seen in train set that are present in test set, thus avoiding any interruption during execution.

In [126]:
# Use One-Hot Encoding to perform fitting for categorical features in the train data
ohe = OneHotEncoder(handle_unknown="ignore") # Ensure consistent results on unseen data
ohe.fit(X_train[['DINLevel1ClassCode','ClaimSubmissionChannel','ClaimantGender','MemberProvince']])

### Data cleaning

Clean the datasets and apply one-hot encoder to train and test sets.

In [127]:

def clean_and_encode_column(column_series):
    """
    Clean and encode a specific column.

    """
    # Clean formatting issues
    cleaned_column = column_series.str.lower()  # Convert to lowercase
    cleaned_column = cleaned_column.str.replace(f'[{string.punctuation}0-9]', '', regex=True)  # Remove special characters and numbers
    cleaned_column = cleaned_column.str.strip()  # Remove leading/trailing spaces

    # Calculate frequency of each value
    freq = cleaned_column.value_counts() / len(cleaned_column)

    # Replace original values with frequencies
    encoded_column = cleaned_column.map(freq)

    return encoded_column

In [128]:
clean_MemberCity_train = clean_and_encode_column(X_train['MemberCity'])
clean_MemberCity_test = clean_and_encode_column(X_test['MemberCity'])

In [129]:
clean_MemberCity_train_enc = clean_MemberCity_train.to_frame(name='Clean_MemberCity')
clean_MemberCity_test_enc = clean_MemberCity_test.to_frame(name = 'Clean_MemberCity')

In [130]:
X_train = pd.concat([X_train, clean_MemberCity_train_enc], axis=1)
X_test = pd.concat([X_test, clean_MemberCity_test_enc], axis=1)

In [131]:
# Create a function for pre-processing a dataframe
def preprocess(dfr):
    
    dfr[['ReceivedDate','PaymentIssueDate', 'ServiceDate']] = dfr[['ReceivedDate','PaymentIssueDate', 'ServiceDate']].apply(pd.to_datetime, format="%m/%d/%Y", errors="coerce")
 # Convert the dates to datetime format

    
    #submitted amout columns

    submitted_amount_str = dfr['SubmittedAmount'].astype(str)
    submitted_amount_str_no_quotes = submitted_amount_str.str.strip('"')  # Remove double quotes
    submitted_amount_numeric = pd.to_numeric(submitted_amount_str_no_quotes.str.replace('[\$,()]', ''), errors='coerce')
    submitted_amount_numeric.fillna(0, inplace=True)
    dfr['SubmittedAmountNumeric'] = submitted_amount_numeric

    # submitted_amount_str = dfr['SubmittedAmount'].astype(str)
    # submitted_amount_numeric = pd.to_numeric(submitted_amount_str.str.replace('[\$,()]', ''), errors='coerce')
    # submitted_amount_numeric.fillna(0, inplace=True)
    # dfr['SubmittedAmountNumeric'] = submitted_amount_numeric

    #new columns
    # Create new features from date format columns
    dfr['ReceivedYear'] = dfr['ReceivedDate'].dt.year
    dfr['ReceivedMonth'] = dfr['ReceivedDate'].dt.month
    dfr['ReceivedDayOfWeek'] = dfr['ReceivedDate'].dt.dayofweek
    dfr['PaymentYear'] = dfr['PaymentIssueDate'].dt.year
    dfr['PaymentMonth'] = dfr['PaymentIssueDate'].dt.month
    dfr['PaymentDayOfWeek'] = dfr['PaymentIssueDate'].dt.dayofweek
    dfr['ServiceYear'] = dfr['ServiceDate'].dt.year 
    dfr['ServiceMonth'] = dfr['ServiceDate'].dt.month
    dfr['ServiceDayOfWeek'] = dfr['ServiceDate'].dt.dayofweek

    # Calculate 'member_claims_count' and 'member_avg_amount'
    dfr['member_claims_count'] = dfr.groupby('MemberCity')['SubmittedAmountNumeric'].transform('count')
    dfr['member_avg_amount'] = dfr.groupby('MemberCity')['SubmittedAmountNumeric'].transform('mean')

    # Calculate 'member_claims_count' and 'member_avg_amount'
    dfr['member_claims_count_by_province'] = dfr.groupby('MemberProvince')['SubmittedAmountNumeric'].transform('count')
    dfr['member_avg_amount_by_province'] = dfr.groupby('MemberProvince')['SubmittedAmountNumeric'].transform('mean')

    # Create new features that represent the ratio of 'SubmittedAmount' to 'member_claims_count'
    dfr['AmountToClaimsCountRatio'] = dfr['SubmittedAmountNumeric'] / dfr['member_claims_count']
    
    subset = dfr[['DINLevel1ClassCode','ClaimSubmissionChannel','ClaimantGender','MemberProvince']] # Create a subset of relevant categorical columns
    feature_arr = ohe.transform(subset).toarray() # Transform the dataframe using one-hot encoding
    feature_labels = ohe.get_feature_names_out() # Extract the encoded feature labels

    features = pd.DataFrame(feature_arr, columns=feature_labels, index=subset.index) # Create a dataframe for feature array & labels post-encoding


    dfr = pd.concat([dfr,features], axis=1) # Concat the (now) feature dataframe with the argument dataframe

    drop_columns = ['DINLevel1ClassCode','ExpenseType','ClaimSubmissionChannel', 'ClaimantAge', 'ClaimantGender', \
                   'MemberProvince', 'SubmittedAmount', ]

    dfr = dfr.drop(columns= drop_columns) # Drop redundant columns
    
    return dfr # Return the pre-processed dataframe

In [132]:
# Encode and pre-process the train set
'MemberProvince_nan'
X_train = preprocess(X_train).drop(columns=['MemberProvince_nan'])
X_train.head()

Unnamed: 0,ReceivedDate,MemberIDscrambled,FacilityIDscrambled,MemberCity,PaymentIssueDate,ServiceDate,UniqueClaimCount,Clean_MemberCity,SubmittedAmountNumeric,ReceivedYear,ReceivedMonth,ReceivedDayOfWeek,PaymentYear,PaymentMonth,PaymentDayOfWeek,ServiceYear,ServiceMonth,ServiceDayOfWeek,member_claims_count,member_avg_amount,member_claims_count_by_province,member_avg_amount_by_province,AmountToClaimsCountRatio,DINLevel1ClassCode_A,DINLevel1ClassCode_B,DINLevel1ClassCode_C,DINLevel1ClassCode_D,ClaimSubmissionChannel_Mobile,ClaimSubmissionChannel_Other Unclassified,ClaimSubmissionChannel_Paper,ClaimSubmissionChannel_Pay Direct Drug,ClaimSubmissionChannel_Web,ClaimantGender_F,ClaimantGender_M,ClaimantGender_U,MemberProvince_AL,MemberProvince_ALTA,MemberProvince_BC,MemberProvince_DC,MemberProvince_FL,MemberProvince_MAN,MemberProvince_NB,MemberProvince_NFLD,MemberProvince_NS,MemberProvince_NU,MemberProvince_NWT,MemberProvince_ONT,MemberProvince_PEI,MemberProvince_QUE,MemberProvince_SASK,MemberProvince_TX,MemberProvince_YUK
0,2023-01-03,37567,5182,Deer Lake,NaT,2023-01-02,1,0.000162,0.0,2023.0,1.0,1.0,,,,2023.0,1.0,0.0,2.0,0.0,1428.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-01-03,37580,2758,Airdrie,2023-12-02,2023-12-02,1,0.002216,0.0,2023.0,1.0,1.0,2023.0,12.0,5.0,2023.0,12.0,5.0,56.0,0.0,9242.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-01-03,22027,3746,ORLEANS,2023-01-03,2023-01-03,1,0.008481,0.0,2023.0,1.0,1.0,2023.0,1.0,1.0,2023.0,1.0,1.0,709.0,0.0,38508.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2023-01-03,37581,331,OTTAWA,2023-01-03,2023-01-03,1,0.036603,0.0,2023.0,1.0,1.0,2023.0,1.0,1.0,2023.0,1.0,1.0,2942.0,0.0,38508.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2023-01-03,37582,4215,LEDUC,2023-01-03,2023-01-03,1,0.000951,0.0,2023.0,1.0,1.0,2023.0,1.0,1.0,2023.0,1.0,1.0,50.0,0.0,9242.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
# Encode and pre-process the test set
X_test = preprocess(X_test).drop(columns=['MemberProvince_nan'])

In [134]:
columns_to_drop = ['ReceivedDate', 'MemberIDscrambled', 'FacilityIDscrambled',
       'MemberCity', 'PaymentIssueDate', 'ServiceDate', 'UniqueClaimCount',
       'Clean_MemberCity', 'SubmittedAmountNumeric']


In [135]:
X_train.drop(columns=columns_to_drop, axis=1, inplace=True)
X_test.drop(columns=columns_to_drop, axis = 1, inplace=True)

This procedure efficiently handled NaNs without having to remove all of them individually. Remove null values from the remaining columns.

In [136]:
# Check for null value counts again
X_train.isna().sum()

ReceivedYear                                 27917
ReceivedMonth                                27917
ReceivedDayOfWeek                            27917
PaymentYear                                  29682
PaymentMonth                                 29682
PaymentDayOfWeek                             29682
ServiceYear                                  30097
ServiceMonth                                 30097
ServiceDayOfWeek                             30097
member_claims_count                            107
member_avg_amount                              107
member_claims_count_by_province                 51
member_avg_amount_by_province                   51
AmountToClaimsCountRatio                       107
DINLevel1ClassCode_A                             0
DINLevel1ClassCode_B                             0
DINLevel1ClassCode_C                             0
DINLevel1ClassCode_D                             0
ClaimSubmissionChannel_Mobile                    0
ClaimSubmissionChannel_Other Un

# TRANSFORM BOOLEAN VALUES

In [137]:
y_train[['Label1', 'Label2']] = y_train[['Label1', 'Label2']].replace({True: 1, False: 0})
y_test[['Label1', 'Label2']] = y_test[['Label1', 'Label2']].replace({True: 1, False: 0})

In [138]:
# Drop NAs after join X and y variables for train and test sets separately
train_joined = X_train.join(y_train).dropna()
test_joined = X_test.join(y_test).dropna()

In [139]:
# Split into X and y variables for train and test sets again
X_train = train_joined.iloc[:, :-2]
y_train = train_joined.iloc[:, -2:]

X_test = test_joined.iloc[:, :-2]
y_test = test_joined.iloc[:, -2:]

## Model development

### DecisionTreeClassifier along with OneVsRestClassifier for multi-label classification model

Use `DecisionTreeClassifier()` for modelling.

### Hyper-parameter tuning
Use `RandomizedSearchCV()` with a grid of parameters to find the optimal hyper-parameters for the model. The parameter error_score=0 ensures that fitting during cross-validation is uninterrupted.

In [140]:
# Instantiate Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state=42)

# Define a grid of parameters
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [*range(2, 10, 1)],
    'max_leaf_nodes': [*range(1, 100, 2)],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Use Randomized Search with 5 fold cross-validation to find optimal hyper-parameters
CV_model_dt = RandomizedSearchCV(estimator=dtc, param_distributions=param_grid, cv=5, error_score=0, verbose=1, random_state=42)
CV_model_dt.fit(X_train, y_train)

# Print the optimal hyperparameters after randomized search cross-validation
print(CV_model_dt.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'splitter': 'best', 'max_leaf_nodes': 61, 'max_features': 'log2', 'max_depth': 4, 'criterion': 'gini'}


In [141]:
# Display the fits/results from cross validation
pd.DataFrame.from_dict(CV_model_dt.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_splitter,param_max_leaf_nodes,param_max_features,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.028001,0.002969,0.006808,0.000735,best,61,log2,4,gini,"{'splitter': 'best', 'max_leaf_nodes': 61, 'ma...",1.0,1.0,1.0,1.0,0.168523,0.833705,0.332591,1
1,0.025394,0.007349,0.005996,0.0011,best,73,sqrt,6,entropy,"{'splitter': 'best', 'max_leaf_nodes': 73, 'ma...",1.0,1.0,1.0,1.0,0.168523,0.833705,0.332591,1
2,0.007819,0.001297,0.0,0.0,best,93,auto,4,entropy,"{'splitter': 'best', 'max_leaf_nodes': 93, 'ma...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
3,0.020602,0.003047,0.005409,0.000501,best,67,sqrt,3,gini,"{'splitter': 'best', 'max_leaf_nodes': 67, 'ma...",1.0,1.0,1.0,1.0,0.168523,0.833705,0.332591,1
4,0.022209,0.003137,0.005388,0.000501,best,27,log2,8,entropy,"{'splitter': 'best', 'max_leaf_nodes': 27, 'ma...",1.0,1.0,1.0,1.0,0.168523,0.833705,0.332591,1
5,0.021008,0.002279,0.005797,0.000401,best,45,sqrt,5,entropy,"{'splitter': 'best', 'max_leaf_nodes': 45, 'ma...",1.0,1.0,1.0,1.0,0.168523,0.833705,0.332591,1
6,0.021426,0.002812,0.005381,0.000493,random,71,sqrt,4,entropy,"{'splitter': 'random', 'max_leaf_nodes': 71, '...",1.0,1.0,1.0,0.969329,0.168523,0.82757,0.329738,8
7,0.018398,0.002423,0.005194,0.000405,random,19,log2,3,entropy,"{'splitter': 'random', 'max_leaf_nodes': 19, '...",0.994249,0.992332,0.993884,0.992241,0.168523,0.828246,0.329862,7
8,0.021001,0.002531,0.005989,0.000668,best,31,sqrt,2,gini,"{'splitter': 'best', 'max_leaf_nodes': 31, 'ma...",0.991511,0.986125,0.989868,0.988407,0.168523,0.824887,0.328187,9
9,0.019707,0.001783,0.005391,0.000477,random,85,sqrt,7,gini,"{'splitter': 'random', 'max_leaf_nodes': 85, '...",1.0,1.0,1.0,0.999452,0.168523,0.833595,0.332536,6


In [142]:
# Print the best estimator that gave the highest score/minimum loss
CV_model_dt.best_estimator_

In [143]:
# Define and fit the model with the optimal parameters
clf = CV_model_dt.best_estimator_
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

### Feature importance

Display top 10 features for the classifier.

In [144]:
plt.rcParams["figure.figsize"] = (22,12)
feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')

<Axes: >

Display feature importance for all the features. More than half the features seem to not have contributed anything.

In [145]:
feat_importances = pd.Series(clf.feature_importances_, index=X_train.columns)
feat_importances.nlargest(30).plot(kind='barh')

<Axes: >

## Metric evaluation

**Zero-one loss**

Zero-one loss returns the fraction of misclassifications. The best performance is 0. In multilabel classification, the `zero_one_loss()` function corresponds to the subset zero-one loss: for each sample, the entire set of labels must be correctly predicted, otherwise the loss for that sample is equal to one.

In [146]:
# Compute the zero-one classification loss
zero_one_loss(y_test, y_pred)

0.0008832188420019316

**Hamming loss**

The Hamming loss is the fraction of labels that are incorrectly predicted. In multilabel classification, the Hamming loss is different from the subset zero-one loss. The zero-one loss considers the entire set of labels for a given sample incorrect if it does not entirely match the true set of labels. Hamming loss is more forgiving in that it penalizes only the individual labels. It is always between 0 and 1, lower being better.

In [147]:
# Compute the average Hamming loss
hamming_loss(y_test, y_pred)

0.00044160942100098134

**Exact Match Ratio and Hamming Loss for each segment using OneVsRestClassifier as a wrapper class**

With the use of model pipeline, decompose this multi-label task into multiple independent binary classification problems (one per category). Using "one vs rest" strategy, build multiple independent classifiers and, for each unseen instance, display the desired metrics.

In [148]:
# Using pipeline to apply decision tree classifier with one vs rest classifier as a wrapper
dt_pipeline = Pipeline([('clf', OneVsRestClassifier(clf, n_jobs=-1))])

# Loop over each segment
for category in y_train.columns:
    print('{}:'.format(category))
    
    # Fit the model to the data
    dt_pipeline.fit(X_train, y_train[category])
    
    # Calculate and display metrics
    dt_pred = dt_pipeline.predict(X_test)

    print('Exact Match Ratio: {}'.format(accuracy_score(y_test[category], dt_pred)))
    print('Hamming Loss: {}\n'.format(hamming_loss(y_test[category], dt_pred)))

Label1:
Exact Match Ratio: 1.0
Hamming Loss: 0.0

Label2:
Exact Match Ratio: 0.9991167811579981
Hamming Loss: 0.0008832188420019627



### Overfitting can be countered by Pruning the tree: This means cutting off some branches of the tree that are too complex or too specific. Pruning can be done either before or after the tree is fully grown, by using different criteria such as the minimum number of samples in a node, the maximum depth of the tree, or the information gain of a split. Pruning can reduce the variance of the model and improve its performance on the test data
### Setting a minimum sample split: This means setting a minimum number of samples that are required to split a node. This can prevent the tree from growing too deep and creating nodes that are based on very few samples. Setting a minimum sample split can also reduce the computational cost of the model and avoid overfitting
### Setting a minimum leaf samples threshold: This means setting a minimum number of samples that are required to be in a leaf node. This can prevent the tree from creating leaves that are too pure or too specific

### Confusion matrix

Display a multi-label confusion matrix as the model seems to suffer from overfitting when the metrics are computed individually for each segment.

### Use Multi-Output Classifier wrapper with Decision Tree Classifier to get predictions

In [149]:
# Use Multi-Output Classifier wrapper with Decision Tree Classifier to get predictions
clfr = MultiOutputClassifier(dtc)
clfr.fit(X_train, y_train)
pred = clfr.predict(X_test)

# Compute and display the multi-label confusion matrix
multilabel_confusion_matrix(y_test, pred)

array([[[12812,     0],
        [    0,  7568]],

       [[19066,     0],
        [    0,  1314]]], dtype=int64)

In [150]:
# Use One vs Rest Classifier wrapper with Decision Tree Classifier to get predicted probabilities for each segment
clf_prob = OneVsRestClassifier(clf)
clf_prob.fit(X_train, y_train)
prob = clf_prob.predict_proba(X_test)
prob

array([[0.00000000e+00, 1.99075405e-04],
       [0.00000000e+00, 1.99075405e-04],
       [0.00000000e+00, 1.99075405e-04],
       ...,
       [0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00]])