**Table of contents**<a id='toc0_'></a>    
- [<b>Import Libraries</b>](#toc1_)    
- [<b>1. Load Dataset</b>](#toc2_)    
  - [Load Configuration File](#toc2_1_)    
  - [Data Collection](#toc2_2_)    
- [<b>2. Data Validation </b>](#toc3_)    
  - [Types of Data](#toc3_1_)    
  - [Data Range](#toc3_2_)    
  - [<b>Data Dimension </b>](#toc3_3_)    
- [<b>Handling Columns Type</b>](#toc4_)    
  - [Check Dataset Type](#toc4_1_)    
- [<b>Handling data category</b>](#toc5_)    
- [<b>Data Defense</b>](#toc6_)    
- [<b>Splitting Data</b>](#toc7_)    
  - [<b>Split Train and Test Data (8:2)</b>](#toc7_1_)    
  - [<b>Split Valid and Test Data (1:1)</b>](#toc7_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[<b>Import Libraries</b>](#toc0_)

In [1]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import src.util as util
import joblib
import os
import yaml
import copy

import warnings
warnings.filterwarnings('ignore')

In [2]:
class clr:
    """ANSI escape sequences for styling text printed to the console or cell output."""
    # Reset sequence: clears any styling applied before it.
    end = '\033[0m'
    # Individual style codes.
    bold = '\033[1m'
    underline = '\033[4m'
    color = '\033[93m'
    # Convenience prefix: the colour code immediately followed by the bold code.
    start = color + bold

# <a id='toc2_'></a>[<b>1. Load Dataset</b>](#toc0_)

## <a id='toc2_1_'></a>[Load Configuration File](#toc0_)

In [3]:
# Load the project configuration through the project's util helper. Later cells
# read the dataset paths and the predictor/label column lists from this object.
config_data = util.load_config()

## <a id='toc2_2_'></a>[Data Collection](#toc0_)

In [4]:
def read_raw_data(config: dict) -> pd.DataFrame:
    """Load every CSV file in the raw-data directory into one DataFrame.

    Parameters
    ----------
    config : dict
        Configuration mapping; only ``config["raw_dataset_dir"]`` is read.
        The directory may be given with or without a trailing path separator.

    Returns
    -------
    pd.DataFrame
        Rows of all ``*.csv`` files found directly inside the directory,
        stacked vertically (each file keeps its own row index). An empty
        DataFrame is returned when the directory holds no CSV files.

    Raises
    ------
    KeyError
        If ``config`` has no ``"raw_dataset_dir"`` entry.
    FileNotFoundError
        If the directory does not exist (raised by ``os.listdir``).
    """
    raw_dataset_dir = config["raw_dataset_dir"]

    # Only regular *.csv files are data. Skipping everything else keeps
    # artefacts such as .ipynb_checkpoints or .gitkeep from crashing read_csv.
    csv_names = [
        name
        for name in os.listdir(raw_dataset_dir)
        if name.lower().endswith(".csv")
        and os.path.isfile(os.path.join(raw_dataset_dir, name))
    ]

    # Read each file once and concatenate a single time: calling pd.concat
    # inside the loop re-copies all previously loaded rows for every file.
    # os.path.join works whether or not the directory ends with a separator.
    frames = [
        pd.read_csv(os.path.join(raw_dataset_dir, name))
        for name in tqdm(csv_names)
    ]

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not frames:
        return pd.DataFrame()

    # Later files go first, matching the row order the previous
    # prepend-in-a-loop implementation produced.
    return pd.concat(frames[::-1])

In [5]:
# Read every file in the configured raw-data directory into a single DataFrame.
raw_dataset = read_raw_data(config_data)

100%|██████████| 1/1 [00:00<00:00,  3.29it/s]


In [6]:
# Lift pandas' column display limit so no column is elided, then preview the first rows.
pd.set_option('display.max_columns', None)

raw_dataset.head().style.background_gradient(cmap='YlOrBr')

Unnamed: 0,Label1,Label2,DINLevel1ClassCode,ExpenseType,ReceivedDate,MemberIDscrambled,ClaimSubmissionChannel,ClaimantAge,ClaimantGender,FacilityIDscrambled,MemberCity,MemberProvince,PaymentIssueDate,ServiceDate,SubmittedAmount,UniqueClaimCount
0,False,False,D,Drug,01/03/2023,37567,Web,60,F,5182,Deer Lake,NFLD,21/03/2023,01/02/2023,$88.21,1
1,False,False,D,Drug,01/03/2023,37580,Pay Direct Drug,22,F,2758,Airdrie,ALTA,12/02/2023,12/02/2023,$236.91,1
2,False,False,D,Drug,01/03/2023,22027,Pay Direct Drug,39,F,3746,ORLEANS,ONT,01/03/2023,01/03/2023,$12.90,1
3,False,False,D,Drug,01/03/2023,37581,Pay Direct Drug,17,F,331,OTTAWA,ONT,01/03/2023,01/03/2023,$118.13,1
4,False,False,C,Drug,01/03/2023,37582,Pay Direct Drug,82,F,4215,LEDUC,ALTA,01/03/2023,01/03/2023,$126.60,1


In [7]:
# Save the combined raw dataset, before any type conversion, to the configured
# "raw_dataset_path" (presumably as a pickle, via the project's util.pickle_dump helper).
util.pickle_dump(raw_dataset, config_data["raw_dataset_path"])

# <a id='toc3_'></a>[<b>2. Data Validation </b>](#toc0_)

## <a id='toc3_1_'></a>[Types of Data](#toc0_)

In [8]:
# Inspect the dtype pandas inferred for each column on load.
raw_dataset.dtypes

Label1                      bool
Label2                      bool
DINLevel1ClassCode        object
ExpenseType               object
ReceivedDate              object
MemberIDscrambled          int64
ClaimSubmissionChannel    object
ClaimantAge                int64
ClaimantGender            object
FacilityIDscrambled        int64
MemberCity                object
MemberProvince            object
PaymentIssueDate          object
ServiceDate               object
SubmittedAmount           object
UniqueClaimCount           int64
dtype: object

## <a id='toc3_2_'></a>[Data Range](#toc0_)

In [9]:
# Summary statistics for the numeric columns.
# NOTE: ClaimantAge has a minimum of 0. A zero here may stand in for a missing value
# rather than None/NaN — TODO confirm with the data owner before modelling.

raw_dataset.describe().style.set_sticky(axis="index").background_gradient()

Unnamed: 0,MemberIDscrambled,ClaimantAge,FacilityIDscrambled,UniqueClaimCount
count,107752.0,107752.0,107752.0,107752.0
mean,32994.165148,59.592982,3248.037568,1.0
std,20973.99096,15.950108,2456.997025,0.0
min,1.0,0.0,1.0,1.0
25%,14112.75,51.0,1136.0,1.0
50%,33504.5,61.0,2792.0,1.0
75%,50970.0,71.0,4890.0,1.0
max,70166.0,109.0,9766.0,1.0


## <a id='toc3_3_'></a>[<b>Data Dimension </b>](#toc0_)

In [10]:
# (rows, columns) of the loaded raw dataset.
raw_dataset.shape

(107752, 16)

# <a id='toc4_'></a>[<b>Handling Columns</b>](#toc0_)

In [11]:
# Parse the three day-first date columns (e.g. "21/03/2023") into datetimes.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
for date_column in ('ReceivedDate', 'PaymentIssueDate', 'ServiceDate'):
    raw_dataset[date_column] = pd.to_datetime(
        raw_dataset[date_column], errors='coerce', dayfirst=True
    )

## <a id='toc4_1_'></a>[Check Dataset Type](#toc0_)

In [12]:
# Check each column's dtype and non-null count after the date conversion.
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107752 entries, 0 to 107751
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   Label1                  107752 non-null  bool          
 1   Label2                  107752 non-null  bool          
 2   DINLevel1ClassCode      107752 non-null  object        
 3   ExpenseType             107752 non-null  object        
 4   ReceivedDate            107752 non-null  datetime64[ns]
 5   MemberIDscrambled       107752 non-null  int64         
 6   ClaimSubmissionChannel  107752 non-null  object        
 7   ClaimantAge             107752 non-null  int64         
 8   ClaimantGender          107752 non-null  object        
 9   FacilityIDscrambled     107752 non-null  int64         
 10  MemberCity              107621 non-null  object        
 11  MemberProvince          107692 non-null  object        
 12  PaymentIssueDate        107752

In [13]:
# Sanity check: the type conversion should not have changed the row or column count.
raw_dataset.shape

(107752, 16)

In [14]:
# Class balance of Label1, as proportions of all rows.
raw_dataset.Label1.value_counts(normalize=True)

Label1
False    0.628601
True     0.371399
Name: proportion, dtype: float64

In [15]:
# Class balance of Label2, as proportions of all rows.
raw_dataset.Label2.value_counts(normalize=True)

Label2
False    0.936187
True     0.063813
Name: proportion, dtype: float64

In [16]:
# Save the dataset with parsed date columns to the configured "cleaned_raw_dataset_path".
util.pickle_dump(raw_dataset, config_data["cleaned_raw_dataset_path"])

# <a id='toc7_'></a>[<b>Splitting Data</b>](#toc0_)

In [17]:
# Separate the input features (X) from the target labels (y), using the column lists
# in the config. .copy() gives independent frames, so later edits do not touch raw_dataset.
X = raw_dataset[config_data["predictor"]].copy()
y = raw_dataset[config_data["label"]].copy()

In [18]:
# Proportion of rows for each combination of the label columns.
y.value_counts(normalize=True)

Label1  Label2
False   False     0.564788
True    False     0.371399
False   True      0.063813
Name: proportion, dtype: float64

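    The label combinations are imbalanced: about 56% of rows have neither label, 37% have only Label1, and 6% have only Label2. No row has both labels.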

## <a id='toc7_1_'></a>[<b>Split Train and Test Data (8:2)</b>](#toc0_)

In [19]:
# First split: hold out 20% of the rows (test_size = 0.2), leaving 80% for training.
# Stratified on the label columns and seeded (random_state = 42) for reproducibility.
# NOTE(review): earlier notes described this split as 7:3 — confirm 0.2 is the intended hold-out fraction.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state= 42, stratify= y)

## <a id='toc7_2_'></a>[<b>Split Valid and Test Data (1:1)</b>](#toc0_)

In [20]:
# Second split: divide the held-out rows 50/50 into validation and test sets,
# again stratified on the labels and seeded.
# x_test / y_test are rebound to the final test half here, so re-running this cell
# on its own would split the already-halved test set again.
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state= 42, stratify= y_test)

In [21]:
# Save each split as a (features, labels) pair. For every split the config holds
# two destination paths: index 0 for the features, index 1 for the labels.
splits = (
    ("train_set_path", x_train, y_train),
    ("valid_set_path", x_valid, y_valid),
    ("test_set_path", x_test, y_test),
)
for path_key, features, labels in splits:
    util.pickle_dump(features, config_data[path_key][0])
    util.pickle_dump(labels, config_data[path_key][1])