In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for root, _, names in os.walk('/kaggle/input'):
    paths = [os.path.join(root, name) for name in names]
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **PROBLEM STATEMENT**

We are given the following:

1. A train dataset (.csv) containing the index column (0 to n_train_examples-1), features ('f1' to 'f118') and the ground truth *claim* (0 or 1) respectively.
2. A test dataset (.csv) containing the index column (0 to n_test_examples-1), features ('f1' to 'f118') respectively.

We are required to implement a binary-classification algorithm which predicts for each example of the test dataset, whether a customer made a claim upon an insurance policy. A '1' value means a claim was made, and '0' means a claim was not made.

# **IMPORT LIBRARIES**

In [None]:
import numpy as np
import pandas as pd
import datatable as dt
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

# **0. DATASET**

**0.0 LOADING DATASET**

In [None]:
%%time

train_filename = "../input/tabular-playground-series-sep-2021/train.csv"
test_filename = "../input/tabular-playground-series-sep-2021/test.csv"

train_orig = dt.fread(train_filename).to_pandas()
test_orig = dt.fread(test_filename).to_pandas()

train_orig = train_orig.set_index('id')
test_orig = test_orig.set_index('id')

**0.1 DATASET OVERVIEW**

In [None]:
# (rows, columns) of the training frame; 'id' was moved to the index, so it is not counted as a column.
train_orig.shape

There are a total of *957919* training examples, having *118* features ranging from 'f1' to 'f118', and *1* target column, i.e. *claim* which corresponds to - whether the claim was made (1) or not (0).

In [None]:
# Print the per-frame summary for the training set, then for the test set.
train_orig.info()
print()  # blank line separating the two summaries
test_orig.info()

All the features in the dataset are of type *float64*, and the ground truth column, i.e. *claim* is of type *int64*.

In [None]:
# Raise the column display limit to 120 so the summary below is not truncated.
pd.set_option('display.max_columns', 120)
# Summary statistics for every column of the training frame.
train_orig.describe()

As expected, the dataset is far from standardized: some features take very large values while others take very small ones. Also, most features seem to have missing values, so we will have to take care of both issues at a later point.

In [None]:
# Peek at the first 10 training rows.
train_orig.head(10)

**0.2 MEMORY REDUCTION**

For a large dataset such as this one, one might often face situations where the system runs out of RAM. Thus, it would be wise to cut down on the memory usage.

In [None]:
# Total footprint of the training frame (index included), converted from bytes to MB.
train_memory_orig = train_orig.memory_usage().sum() / 1024**2
print(f'Memory usage of original training set(in MB): {train_memory_orig}')

def reduce_memory(df):
    """Downcast the float columns of ``df`` in place to a narrower float dtype.

    For every column whose dtype name starts with ``'float'``, the column's
    minimum and maximum (NaNs are skipped by pandas) are compared against the
    representable range of float16 and then float32; the column is cast to the
    first dtype whose range strictly contains both bounds. Columns that fit
    neither range (or that contain only NaN) are left unchanged, as are all
    non-float columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.

    Notes
    -----
    Only the value *range* is checked. float16 keeps roughly three significant
    decimal digits, so downcasting trades precision for memory.
    """
    f16 = np.finfo(np.float16)
    f32 = np.finfo(np.float32)
    for col in df.columns:
        if str(df[col].dtypes)[:5] != 'float':
            continue
        low = df[col].min()
        high = df[col].max()
        if low > f16.min and high < f16.max:
            df[col] = df[col].astype('float16')
        # The upper bound must be float32's own maximum: comparing against the
        # float64 maximum would cast out-of-range values to float32 and turn
        # them into inf.
        elif low > f32.min and high < f32.max:
            df[col] = df[col].astype('float32')
    return df

# Downcast the training frame in place, then measure its footprint again (bytes -> MB).
reduce_memory(train_orig)
train_memory_reduced = train_orig.memory_usage().sum() / 1024**2
print(f'Memory usage of reduced training set(in MB): {train_memory_reduced}')

In [None]:
# Footprint of the test frame before downcasting (bytes -> MB).
test_memory_orig = test_orig.memory_usage().sum() / 1024**2
print(f'Memory usage of original test set(in MB): {test_memory_orig}')

# Downcast in place and measure again.
reduce_memory(test_orig)
test_memory_reduced = test_orig.memory_usage().sum() / 1024**2
print(f'Memory usage of reduced test set(in MB): {test_memory_reduced}')

# **1. EXPLORATORY DATA ANALYSIS**

**1.0 DUPLICATE REMOVAL**

First off, there is always a possibility that our dataset contains duplicate entries. These are typically introduced during the data acquisition step.

In [None]:
# Count rows that are exact repeats of an earlier row (features and target together).
n_duplicates = train_orig.duplicated().sum()
print(f"Number of duplicated entries: {n_duplicates}")

This means that our dataset has only unique entries. Having ensured this, now we can proceed to the actual EDA for our dataset.

**1.1 TARGET COLUMN**

Now, let's look at how the target column *claim* is distributed throughout the dataset.

In [None]:
# Number of training rows per value of the target column.
claim_dist = train_orig['claim'].value_counts()
display(claim_dist)

Seems pretty well balanced. Let's confirm this notion through a pie chart (because "visual data is always more convincing").

In [None]:
# Pie chart of the target class counts, annotated with each class's percentage share.
plt.figure(figsize = (10,6))
claim_dist.plot.pie(autopct = '%.1f', colors = ['powderblue', 'slateblue'])
plt.title("Claim value distribution pie chart", pad = 20, fontdict = {'size' : 15, 'color' : 'darkblue', 'weight' : 'bold'})
plt.show()

>"Perfectly balanced, as all things should be."

**1.2 FEATURE ENGINEERING**

Generally, when we talk about feature engineering, we mean combining the existing features (or engineering new features from them for that matter). However, for this dataset, we have no knowledge about what the features are, neither their impact on the target feature. So, my opinion is that engineering new features won't help us much for this dataset.

Furthermore, there is no need to classify features into different data-types (this helps later while processing the dataset) as all features are of type *float* only.

**1.3 DISTRIBUTION ANALYSIS**

Let's see how the features are distributed w.r.t. the target variable.

**NOTE:** Since we have a very large dataset, we will plot these distributions taking a small sample from the dataset. For better estimations, we will take a random sample, preferably of fraction 1/100 of the original dataset. This will help in faster generation of plots.

In [None]:
# Draw a 1% random sample for plotting; the fixed seed makes the figures
# (and the correlation matrix computed from the same sample) reproducible.
train_frac = train_orig.sample(frac = 0.01, random_state = 42).reset_index(drop = True)
target = train_frac.claim

# Plot the features only: the target is used as the hue, so a panel of
# 'claim' against itself would carry no information.
feature_cols = [col for col in train_frac.columns if col != 'claim']

c = 4
r = int(np.ceil(len(feature_cols) / c))
fig, axes = plt.subplots(nrows = r, ncols = c, figsize = (25,80))
axes = np.ravel(axes)

# One stacked KDE per feature, drawn on the axes created above.
for ax, col in zip(axes, feature_cols):
    sns.kdeplot(train_frac[col], hue = target, fill = True, multiple = 'stack', ax = ax)
    ax.set_xlabel(col, fontsize = 15)

# Hide the grid cells left over when the feature count is not a multiple of c.
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

fig.tight_layout(pad = 2.0)
fig.subplots_adjust(top = 0.97)  # leave room for the figure-level title
fig.suptitle("Kernel Density Estimation Plots w.r.t. the target 'claim' for {} training examples".format(train_frac.shape[0]), fontsize = 20)
plt.show()
       
    

That's a lot of plots to look at. However, at a quick glance at all the plots, there doesn't seem to be a pattern in any of the distributions w.r.t. the target variable. We will now analyse these weak relations further using a correlation matrix.

**1.4 CORRELATION ANALYSIS**

We noticed earlier that the relation between features and the target variable is most likely weak. To check that further, we'll make use of a correlation matrix. Also, this matrix will help us to check which features are strongly related to one another.

In [None]:
# Pairwise correlations between all columns of the 1% sample (target included).
corrMat = train_frac.corr()

# Boolean mask covering the upper triangle and the diagonal, so that each
# pair of columns is drawn only once.
mask = np.triu(np.ones_like(corrMat, dtype = bool))
cmap = sns.diverging_palette(230, 20, as_cmap = True)

fig, ax = plt.subplots(figsize = (20,20))
sns.heatmap(corrMat, mask = mask, cmap = cmap, square = True, annot = False, linewidths = 1)

There are a few darker cells, which represent relatively strong correlation between the corresponding features/variables. However, even these *relatively* strong correlations have very small correlation coefficient values from a general P.O.V. To elaborate, the colorbar on the right shows that the upper bound on positive correlations is approx. 0.04 and the lower bound on negative correlations is approx. -0.06. These two bounds are too small to declare a strong correlation between the features.

**P.S.:** Here, I define a *strong correlation* as one having a correlation coefficient greater than 0.6 (strong positive correlation) or less than -0.6 (strong negative correlation). Of course, these thresholds are subjective and were chosen by the author.

**Conclusion:** We can now safely say that none of the features have a strong correlation among one another, or with the target variable. This marks the end of a fruitless correlation analysis. 

# **2. DATA CLEANING**

**2.0 DATASET SPLIT**

Before proceeding any further, it is recommended to split the dataset into a training set and a hold-out cross-validation set. This is to ensure that the model we build won't be adversely affected by data leakage.

> Any feature whose value would not actually be available in practice at the time you’d want to use the model to make a prediction, is a feature that can introduce leakage to your model

Now, when we talk about splitting the data into train and test sets, we normally have two options:

* Splitting the dataset into a training set and a test set using *train_test_split()*.
* Using k-fold cross-validation sets.

The performance of the two techniques typically depends on the size of the dataset. 
- When we have a small or limited training dataset, then using *K fold CV* is recommended. This is because we are always looking for a method that could maximize the data that we train our model on. Also, for a small training set, *train_test_split* could lead to inconsistent predictions for the test set.
- On the other hand, for a large dataset like the one we're given here, using *K fold* would greatly compromise on the computation speed of our model. Also, since we have a large number of training examples, using only *train_test_split* should be enough information for our model to properly learn the parameters.

So, as a conclusion to the above two points, we will use *train_test_split* for this dataset.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features without touching train_orig itself.
Y = train_orig['claim'].copy()
X = train_orig.drop(columns = 'claim')

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
X_train_orig, X_valid_orig, Y_train_orig, Y_valid_orig = train_test_split(
    X, Y, test_size = 0.2, random_state = 42
)

# Independent copy of the test features for the preprocessing steps that follow.
X_test_orig = test_orig.copy()

**2.1 MISSING VALUES**

As we saw earlier, most of the features have missing values. We will take care of that now.

Luckily, for the given dataset, we have only numerical features and hence, imputation will be much simpler. For numerical data, the two most suitable imputation techniques that could be used here are *mean imputation* and *median imputation*. I will try both these techniques and compare their performance on the validation set. In the final notebook, you will only see the technique which performed better. 

In [None]:
# Per-column count of missing entries in the training split, largest first,
# restricted to columns that actually have something missing.
missing_val_cols = (
    X_train_orig.isnull().sum()
    .sort_values(ascending = False)
    .loc[lambda counts: counts > 0]
)

# Same counts expressed as a fraction of the training rows.
ratio_of_missing = missing_val_cols / len(X_train_orig)

missing = pd.concat([missing_val_cols, ratio_of_missing], axis = 1, keys = ['Count', '%'])
missing

Surprisingly, every feature has missing entries. However, the number of missing entries as compared to the entire dataset is quite small.

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation. The `verbose` argument is intentionally not passed: it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where passing it raises a
# TypeError. Leaving it out keeps the same (silent) behaviour on every version.
my_imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')

# Learn the column means from the training split only, then apply those same
# means to all three splits so no validation/test statistics leak into training.
my_imputer.fit(X_train_orig)
X_train_imputed = my_imputer.transform(X_train_orig)
X_valid_imputed = my_imputer.transform(X_valid_orig)
X_test_imputed = my_imputer.transform(X_test_orig)

# **3. FEATURE SCALING**



Many machine learning algorithms perform better when numerical input variables are scaled to a standard range.

Standardizing is a popular scaling technique that subtracts the mean from the values and divides by the standard deviation, so that each input variable ends up with zero mean and unit variance. Note that this only shifts and rescales the variable; it does not change the shape of its distribution (a non-Gaussian feature stays non-Gaussian). Standardization can become skewed or biased if the input variable contains outlier values.

To overcome this, the median and interquartile range can be used when standardizing numerical input variables, generally referred to as robust scaling.

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# Stage 1: robust scaling. The scaler is fitted on the training split only and
# the fitted parameters are reused for the validation and test splits.
robust_scaler = RobustScaler()
X_train_robust = robust_scaler.fit_transform(X_train_imputed)
X_valid_robust = robust_scaler.transform(X_valid_imputed)
X_test_robust = robust_scaler.transform(X_test_imputed)

# Stage 2: standardization of the robust-scaled values, again fitted on the
# training split only.
# NOTE(review): both stages are per-column affine maps, so stage 2 alone would
# presumably give the same final values - confirm before simplifying.
standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit_transform(X_train_robust)
X_valid_scaled = standard_scaler.transform(X_valid_robust)
X_test_scaled = standard_scaler.transform(X_test_robust)

On passing through the Scalers, our Data Frame has now been converted to a numpy array. So, for convention, we will convert the array back to a Data Frame.

In [None]:
def _as_frame(values, like):
    """Wrap the array ``values`` in a DataFrame that reuses the index and columns of ``like``."""
    return pd.DataFrame(values, index = like.index, columns = like.columns)

# Re-attach the original row ids and feature names to the scaled arrays.
X_train_final = _as_frame(X_train_scaled, X_train_orig)
X_valid_final = _as_frame(X_valid_scaled, X_valid_orig)
X_test_final = _as_frame(X_test_scaled, X_test_orig)

In [None]:
# Summary statistics of the final (imputed and scaled) training features,
# as a sanity check on the preprocessing above.
X_train_final.describe()

# **4. MODEL FITTING AND EVALUATION**

When we talk about a binary class classification problem, the simplest model that comes to mind is *Logistic Regression*. So, first off, we will use logistic regression as our base model for comparison. Naturally, it won't do very well on such a complex dataset, but it's always better to start from the bottom and build to the top.

**4.0 LOGISTIC REGRESSION**

As a personal observation, I want to see how a simple logistic regression model would perform on a random sample of the dataset as compared to the entire dataset. 

In [None]:
#get a fraction of training set
# Features and target are joined first so that one sample() call keeps each
# row's label aligned with its features. The fixed seed makes the 1% sample,
# and every score computed from it, reproducible across runs.
train_final = pd.concat([X_train_final, Y_train_orig], axis = 1)
X_train_final_frac = train_final.sample(frac = 0.01, random_state = 42).reset_index(drop = True)
# pop() removes 'claim' from the sampled frame and returns it as the target.
Y_train_final_frac = X_train_final_frac.pop('claim')

#get a fraction of validation set
valid_final = pd.concat([X_valid_final,Y_valid_orig], axis = 1)
X_valid_final_frac = valid_final.sample(frac = 0.01, random_state = 42).reset_index(drop = True)
Y_valid_final_frac = X_valid_final_frac.pop('claim')

In [None]:
from sklearn.linear_model import LogisticRegression
import time
from sklearn.metrics import roc_auc_score

def get_score(actual, preds):
    """Return the ROC-AUC of ``preds`` against the ground-truth labels ``actual``.

    Thin wrapper around ``sklearn.metrics.roc_auc_score`` so that every model
    in this notebook is scored with the same metric.
    """
    return roc_auc_score(actual, preds)

# The 'sag' solver shuffles the data, so a fixed seed is needed for repeatable scores.
lr = LogisticRegression(solver = 'sag', random_state = 42)

#Logistic Regression for entire training set
# Only fit() is timed, so the figure printed below as "train time" excludes
# prediction, scoring and printing (the same convention as the boosted-model cells).
start_full = time.time()
lr.fit(X_train_final,Y_train_orig)
end_full = time.time()
preds_full = lr.predict_proba(X_valid_final)[:,1]
score_full = get_score(Y_valid_orig,preds_full)
print('roc-auc for full fit: {}'.format(score_full))

#Logistic Regression for 1/100th of training set
# The same estimator is refitted on the 1% sample; both fits are scored on the
# full validation set so the two scores are directly comparable.
start_frac = time.time()
lr.fit(X_train_final_frac,Y_train_final_frac)
end_frac = time.time()
preds_frac = lr.predict_proba(X_valid_final)[:,1]
score_frac = get_score(Y_valid_orig,preds_frac)
print('roc-auc score for sample fit: {}'.format(score_frac))

print('full train time: {}\nsample train time: {}'.format(end_full-start_full,end_frac-start_frac))

It seems that the score for a simple Logistic Regression fit on a sample of the training set is pretty close to the score for the model fit on the entire training set, while the training time was cut down considerably (approx. 100x faster). This might be a consequence of the dataset being well balanced w.r.t. the target variable: even a small random sample was able to feed much of the useful information about the dataset to our model. 

Of course, this may not be entirely true for more complex models and I don't recommend cutting down on the training examples in such fashion. But, for this dataset only, I believe that if we take a sample of the training set to train our models, then we can train different models for comparison and frequently tweak the hyperparameters without having to wait for long minutes everytime. However, this is just a gamble and I don't recommend adopting this technique for better results.

**4.0.0 NAIVE BAYES**

Another famous classifier is the Naive-Bayes classifier. It has the additional advantage that it is very fast for large datasets such as the one we're working on. 

In [None]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

#Naive Bayes for entire training set
# Only fit() is timed, so the figure printed below as "train time" excludes
# prediction and scoring (the same convention as the boosted-model cells).
nb_full_start = time.time()
nb.fit(X_train_final,Y_train_orig)
nb_full_end = time.time()
nb_full_preds = nb.predict_proba(X_valid_final)[:,1]
nb_full_score = get_score(Y_valid_orig,nb_full_preds)
print('roc-auc for full fit: {}'.format(nb_full_score))

#Naive Bayes for 1/100th of training set
# The same estimator is refitted on the 1% sample; both fits are scored on the
# full validation set so the two scores are directly comparable.
nb_frac_start = time.time()
nb.fit(X_train_final_frac,Y_train_final_frac)
nb_frac_end = time.time()
nb_frac_preds = nb.predict_proba(X_valid_final)[:,1]
nb_frac_score = get_score(Y_valid_orig,nb_frac_preds)
print('roc-auc for sample fit: {}'.format(nb_frac_score))

print('full train time: {}\nsample train time: {}'.format(nb_full_end-nb_full_start,nb_frac_end-nb_frac_start))

We got a tiny 1% improvement in our score. However, this is far from the performance we would expect from our final model. Now let's get serious and train more reasonable models.

**4.1 BASELINE MODELS**

One good practice is to always train a baseline model first on the training set and observe its performance. It gives us a starting step to compare our later models.

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold

**4.1.1 XGB CLASSIFIER**

In [None]:
# Baseline XGBoost classifier with the library's default hyperparameters.
xgb = XGBClassifier()

# Time the call to fit() only.
xgb_fit_start = time.time()
xgb.fit(X_train_final,Y_train_orig)
xgb_fit_end = time.time()
xgb_fit_time = xgb_fit_end - xgb_fit_start

# Validation split: second predict_proba column, scored with ROC-AUC.
xgb_valid_preds = xgb.predict_proba(X_valid_final)[:,1]
xgb_valid_score = get_score(Y_valid_orig,xgb_valid_preds)

# Training split: same procedure, used only to gauge overfitting.
xgb_train_preds = xgb.predict_proba(X_train_final)[:,1]
xgb_train_score = get_score(Y_train_orig,xgb_train_preds)

# Gap between train and validation score; a larger gap means more overfitting.
xgb_of = xgb_train_score - xgb_valid_score

**4.1.2 CATBOOST CLASSIFIER**

In [None]:
# Baseline CatBoost classifier; only the learning rate is set explicitly.
cat = CatBoostClassifier(learning_rate = 0.1)

# Time the call to fit() only; verbose = False silences the per-iteration log.
cat_fit_start = time.time()
cat.fit(X_train_final,Y_train_orig, verbose = False)
cat_fit_end = time.time()

#prediction
# X_valid_final / X_train_final are already DataFrames, and slicing column 1 of
# predict_proba already yields a 1-D array, so no re-wrapping or reshaping is needed.
cat_valid_preds = cat.predict_proba(X_valid_final)[:,1]
cat_train_preds = cat.predict_proba(X_train_final)[:,1]

#get scores
cat_valid_score = get_score(Y_valid_orig,cat_valid_preds)
cat_train_score = get_score(Y_train_orig,cat_train_preds)

cat_fit_time = cat_fit_end - cat_fit_start
# Gap between train and validation score; a larger gap means more overfitting.
cat_of = cat_train_score - cat_valid_score

**4.1.3 LIGHT GBM**

In [None]:
# Baseline LightGBM classifier with the library's default hyperparameters.
lgbm = LGBMClassifier()

# Time the call to fit() only.
lgbm_fit_start = time.time()
lgbm.fit(X_train_final,Y_train_orig)
lgbm_fit_end = time.time()
lgbm_fit_time = lgbm_fit_end - lgbm_fit_start

# Validation split: second predict_proba column, scored with ROC-AUC.
lgbm_valid_preds = lgbm.predict_proba(X_valid_final)[:,1]
lgbm_valid_score = get_score(Y_valid_orig,lgbm_valid_preds)

# Training split: same procedure, used only to gauge overfitting.
lgbm_train_preds = lgbm.predict_proba(X_train_final)[:,1]
lgbm_train_score = get_score(Y_train_orig,lgbm_train_preds)

# Gap between train and validation score; a larger gap means more overfitting.
lgbm_of = lgbm_train_score - lgbm_valid_score

**4.2 COMPARE PERFORMANCES**

In [None]:
models = [(xgb, 'XGBoost'), (cat, 'CatBoost'), (lgbm, 'LightGBM')]

# One record per model, gathering the metrics computed in the cells above.
xgb_eval = {'Model' : 'XGBoost', 'Train Time' : xgb_fit_time,
           'Train Score' : xgb_train_score, 'Val. Score' : xgb_valid_score,
           'Overfitting' : xgb_of}
cat_eval = {'Model' : 'CatBoost', 'Train Time' : cat_fit_time,
           'Train Score' : cat_train_score, 'Val. Score' : cat_valid_score,
           'Overfitting' : cat_of}
lgbm_eval = {'Model' : 'LightGBM', 'Train Time' : lgbm_fit_time,
           'Train Score' : lgbm_train_score, 'Val. Score' : lgbm_valid_score,
           'Overfitting' : lgbm_of}

# Build the table directly from the records. DataFrame.append was removed in
# pandas 2.0, and constructing the frame in one go also gives the numeric
# columns a proper float dtype.
evaluations = pd.DataFrame([xgb_eval, cat_eval, lgbm_eval],
                           columns = ['Model', 'Train Time', 'Train Score',
                                      'Val. Score', 'Overfitting'])
evaluations = evaluations.set_index('Model')

evaluations

**CONCLUSION**

- None of the models show signs of overfitting.
- Out of the three, *CatBoostClassifier* gave the best results on the Validation Set.
- *LightGBM* is the fastest among the three and took approximately 10x less time to fit the training set as compared to the next fastest model *CatBoostClassifier*.

All things considered, these are pretty decent scores by the standards of baseline models. If we have to pick one, I think *CatBoostClassifier* should be chosen for prediction on the test set. However, if computation speed is an important priority, then *LightGBM* can be picked at the cost of a slightly lower roc-auc score.

# **5. SUBMISSION**

Let's predict the *claim* variable for test set and submit our results!

In [None]:
# (rows, columns) of the preprocessed test features, as a sanity check before predicting.
X_test_final.shape

In [None]:
#prediction
# X_test_final is already a DataFrame, and slicing column 1 of predict_proba
# already yields a 1-D array, so no re-wrapping or reshaping is needed.
cat_test_preds = cat.predict_proba(X_test_final)[:,1]

In [None]:
# Pair each test row id with its predicted value and write the two columns to
# submission.csv; index = False keeps the frame's own positional index out of the file.
output = pd.DataFrame({'id' : X_test_final.index, 'claim' : cat_test_preds})
output.to_csv('submission.csv', index = False)

# **Thank You For Reading!**

This was all for this notebook. In the next notebook, I worked on these baseline models and tried optimizing their performance using **Optuna**. You can find it here --> [TPS Sept: EDA & LightGBM + Optuna](https://www.kaggle.com/jaikr18/tps-sept-detailed-eda-lightgbm-optuna)