# Libraries and Data import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import random

from matplotlib.lines import Line2D

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import IsolationForest

import optuna

# Widen pandas' display limits so more rows/columns and full cell contents are shown
_display_options = {
    'display.max_rows': 100,
    'display.max_columns': 500,
    'display.max_colwidth': None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

# Render floats with five decimals
pd.set_option('display.float_format', lambda value: '%.5f' % value)

# import warnings
# warnings.simplefilter(action='ignore', category=UserWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the path of every file found anywhere under ../input
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('../input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the data
# Each CSV is loaded with its `id` column as the DataFrame index.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv', index_col='id')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv', index_col='id')
sample = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv', index_col='id')

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Preview the first rows of the test frame
test.head()


# EDA
EDA based on the notebook https://www.kaggle.com/maximkazantsev/tps-08-21-xgboost. Thanks to @maximkazantsev.

In [None]:
# Palette shared by the bar charts below (15 entries; "sandybrown" appears twice)

colors = [
    "lightcoral",
    "sandybrown",
    "darkorange",
    "mediumseagreen",
    "lightseagreen",
    "cornflowerblue",
    "mediumpurple",
    "palevioletred",
    "lightskyblue",
    "sandybrown",
    "yellowgreen",
    "indianred",
    "lightsteelblue",
    "mediumorchid",
    "deepskyblue",
]

In [None]:
# Pie chart comparing the number of rows in the train and test sets
dataset_sizes = [len(train), len(test)]

fig, ax = plt.subplots(figsize=(5, 5))
pie = ax.pie(
    dataset_sizes,
    labels=["Train dataset", "Test dataset"],
    colors=["salmon", "teal"],
    autopct='%1.1f%%',
    textprops={"fontsize": 15},
)
ax.axis("equal")  # equal aspect ratio keeps the pie circular
ax.set_title("Dataset length comparison", fontsize=18)
fig.set_facecolor('white')
plt.show();

Let's take a look at the data

In [None]:
# Data type of every column in the training frame
train.dtypes

In [None]:
# Summary statistics of the training frame, transposed so each column becomes a row
train.describe().T

In [None]:
# Summary statistics of the test frame, transposed so each column becomes a row
test.describe().T

# Checking for Categorical Features

In [None]:
# The ten columns with the fewest distinct values (low counts would hint at categorical features)
train.nunique().sort_values().head(10)

No feature appears to be categorical.

In [None]:
# Number of training rows whose target `claim` equals 1
(train.claim ==1).sum()

Let's check the distribution of the target:

In [None]:
# Pie chart with the share of each target class (claim == 0 vs claim == 1) in train
class_counts = [(train.claim == label).sum() for label in (0, 1)]

fig, ax = plt.subplots(figsize=(5, 5))
pie = ax.pie(
    class_counts,
    labels=["0", "1"],
    colors=["orange", "blue"],
    autopct='%1.1f%%',
    textprops={"fontsize": 15},
)
ax.axis("equal")  # equal aspect ratio keeps the pie circular
ax.set_title("Target distribuition\n", fontsize=18)
fig.set_facecolor('white')
plt.show();

In [None]:
# Sanity check: the test columns must match the train feature columns (same names, same order).
# A bare `==` comparison that is not the last expression of the cell is silently discarded,
# so the check is enforced with an assert instead.
assert test.columns.equals(train.drop(columns="claim").columns), \
    "train and test feature columns differ"

# Names of the (numeric) feature columns used by the plots below
num_attribs = test.columns

In [None]:
# Overlay the train and test histogram of every feature on a grid of subplots.
# Train and test are stacked only to obtain a common min/max range per feature.
df = pd.concat([train.drop("claim", axis=1),
                test], axis=0)
columns = num_attribs

cols = 3
# Ceiling division: no spare empty row when len(columns) is a multiple of cols
rows = max(1, -(-len(columns) // cols))

# squeeze=False keeps `axs` two-dimensional even when the grid has a single row
fig, axs = plt.subplots(ncols=cols, nrows=rows, figsize=(16, rows*3),
                        sharex=False, squeeze=False)

plt.subplots_adjust(hspace=0.3)

# axs.flat walks the grid row by row, i.e. in the same order as nested row/column loops
for i, ax in enumerate(axs.flat):
    if i >= len(columns):
        # Unused cells at the end of the grid
        ax.set_visible(False)
        continue

    column = columns[i]
    # Same bin range for both histograms, computed once per feature
    value_range = (df[column].min(), df[column].max())

    ax.hist(train[column].values,
            range=value_range,
            bins=40,
            color="deepskyblue",
            edgecolor="black",
            alpha=0.7,
            label="Train Dataset")
    ax.hist(test[column].values,
            range=value_range,
            bins=40,
            color="palevioletred",
            edgecolor="black",
            alpha=0.7,
            label="Test Dataset")
    ax.set_title(column, fontsize=14, pad=5)
    ax.tick_params(axis="y", labelsize=13)
    ax.tick_params(axis="x", labelsize=13)
    ax.grid(axis="y")
    ax.legend(fontsize=13)

#plt.suptitle("Feature values distribution in both datasets", y=0.99)
plt.show();

The train and test datasets are quite well balanced.

# Correlations

In [None]:
# Pairwise correlations of all train columns, rounded to five decimals
df = train.corr().round(5)

# Mask the diagonal and the upper-right triangle: they duplicate the lower-left part
mask = np.triu(np.ones(df.shape))

# Draw the heatmap
plt.figure(figsize=(16, 16))
ax = sns.heatmap(
    df,
    mask=mask,
    cmap="RdBu",
    annot=False,
    annot_kws={"weight": "bold", "fontsize": 13},
)
ax.set_title("Feature correlation heatmap", fontsize=17)

# Tick labels: vertical on the x axis, horizontal on the y axis
plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
         rotation_mode="anchor", weight="normal")
plt.setp(ax.get_yticklabels(), rotation=0, ha="right",
         rotation_mode="anchor", weight="normal")
plt.show();

There is very low correlation between the features.

## Checking how many na values in train and test

In [None]:
# Total number of missing cells in the whole train frame and in the whole test frame
print("(train, test) na --> ",(train.isna().sum().sum(), test.isna().sum().sum()))

There are quite a lot of NA values to handle. This table requires data cleaning.
## Data cleaning

Visualizing how many NA values there are in each column

In [None]:
# Missing values per feature column (the target is excluded for train)
train_na = train.drop(columns="claim").isna().sum()
test_na = test.isna().sum()

# Smallest / largest per-feature NA share, as a percentage of the rows of each frame
train_min_pct = train_na.min()/len(train)*100
train_max_pct = train_na.max()/len(train)*100
test_min_pct = test_na.min()/len(test)*100
test_max_pct = test_na.max()/len(test)*100

print("% Train NA for each feature, min, max")
print(f"There are a minimum of {train_min_pct} % of NA Values in each features in TRAIN df ")
print(f"There are a maximum of {train_max_pct} % of NA Values in each features in TRAIN df ")
print("Test NA for each feature, min, max")
print(f"There are a minimum of {test_min_pct} % of NA Values in each features in TEST df ")
print(f"There are a maximum of {test_max_pct} % of NA Values in each features in TEST df ")

In [None]:
# Checking if there are concentrated NA values in each id for Train and Test df

na_counts_train = train.isna().sum(axis=1).sort_values(ascending = False).value_counts()

fig, ax = plt.subplots(figsize=(16, 8))

bars = ax.bar(na_counts_train.index,
              na_counts_train.values,
              color=colors,
              edgecolor="black")
ax.set_title("N° missing NA for each id in TRAIN db", fontsize=20, pad=15)
ax.set_ylabel("Missing Values", fontsize=14, labelpad=15)
ax.set_xlabel("N°of missing values for each row", fontsize=14, labelpad=10)
ax.bar_label(bars, [f"{x:2.2f}%" for x in na_counts_train.values/(len(train)/100)],
                 padding=5, fontsize=10, rotation=90)
ax.margins(0.025, 0.12)
ax.grid(axis="y")

plt.show();

Some ids have as many as 14 missing features.

In [None]:
# Number of rows (ids) for each per-row count of missing values in the test set.
# The previous sort of the per-row counts did not change the tallies, so it is dropped;
# sort_index() orders the bars (and their colours) by the number of missing values.
na_counts_test = test.isna().sum(axis=1).value_counts().sort_index()


fig, ax = plt.subplots(figsize=(16, 8))

bars = ax.bar(na_counts_test.index,
              na_counts_test.values,
              color=colors,
              edgecolor="black")
ax.set_title("N° missing NA for each id in TEST db", fontsize=20, pad=15)
# Axis labels corrected: x is the number of missing values in a row, y the number of rows
ax.set_ylabel("N° of rows", fontsize=14, labelpad=15)
ax.set_xlabel("N°of missing values for each row", fontsize=14, labelpad=10)
# Annotate every bar with its share of the test rows, in percent
ax.bar_label(bars, [f"{x:2.2f}%" for x in na_counts_test.values/(len(test)/100)],
                 padding=5, fontsize=10, rotation=90)
ax.margins(0.025, 0.12)
ax.grid(axis="y")

plt.show();

Only about 37% of the ids have a complete set of features with no NA, in both the train and test sets.

In [None]:
# Count (and percentage) of rows that contain at least one missing value.
# True = the row has one or more NA, False = the row is complete.
train_has_na = train.isna().any(axis=1)
test_has_na = test.isna().any(axis=1)

# The condition is "at least one NA", so the messages say exactly that
print("Rows with at least 1 NA in TRAIN df:")
print(train_has_na.value_counts())
print(train_has_na.value_counts()/len(train)*100)
print("\nRows with at least 1 NA in TEST df:")
print(test_has_na.value_counts())
print(test_has_na.value_counts()/len(test)*100)

In [None]:
# Separate the target from the features; keep an independent copy of the test frame
y = train["claim"]
X = train.drop("claim", axis=1)
X_test = test.copy()

# Scaling data

In [None]:
# Min-max scale the features: the scaler is fitted on the train features and then
# applied to the test frame. No index is passed, so both resulting frames get a
# default 0..n-1 index instead of the original ids.
train_features = train.drop("claim", axis=1)

x_Mm_scaler = MinMaxScaler()
X = pd.DataFrame(
    x_Mm_scaler.fit_transform(train_features),
    columns=train_features.columns,
)
X_test = pd.DataFrame(
    x_Mm_scaler.transform(test),
    columns=test.columns,
)


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Hyper-parameters passed to every XGBClassifier trained below
xgb_params = dict(
    n_estimators=10000,
    learning_rate=0.1,
    subsample=0.6,
    colsample_bytree=0.5,
    max_depth=6,
    booster='gbtree',
    tree_method='gpu_hist',
    random_state=42,
    n_jobs=4,
)

In [None]:
# Preview the first rows of the feature frame X
X.head()

In [None]:
%%time
splits = 6
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0],))
preds = 0
model_fi = 0
total_mean_rmse = 0

for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X,y)):
    
    X_train, X_valid = X.loc[train_indicies], X.loc[valid_indicies]
    y_train, y_valid = y.loc[train_indicies], y.loc[valid_indicies]
    print(fold, f"X_train = {X_train.shape} - y_train: {y_train.shape}")
    print(fold, f"X_valid = {X_valid.shape} - y_valid: {y_valid.shape}")
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              verbose=False)
    print("fitted")
    preds += model.predict(X_test) / splits
    print(preds.shape)
    print("preds ok")
    model_fi += model.feature_importances_
    print("model_fi ok")
    oof_preds[valid_indicies] = model.predict(X_valid)
    print(oof_preds)
    oof_preds[oof_preds < 0] = 0
#     fold_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(np.array(y_valid).reshape(-1,1)), y_scaler.inverse_transform(np.array(oof_preds[valid_idx]).reshape(-1,1))))
    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_indicies]))
    print(f"Fold {fold} RMSE: {fold_rmse}")
#         print(f"Trees: {model.tree_count_}")
    total_mean_rmse += fold_rmse / splits
print(f"\nOverall RMSE: {total_mean_rmse}") 

In [None]:
# xgb public Score: 
predictions = pd.DataFrame()
predictions["id"] = test.index
predictions["claim"] = preds

predictions.to_csv('submission_xgb_no_NA.csv', index=False, header=predictions.columns)
predictions.head()

In [None]:
# Pie chart of the predicted classes, splitting the predicted `claim` values at 0.5
negative_count = (predictions.claim < 0.5).sum()
positive_count = (predictions.claim >= 0.5).sum()

fig, ax = plt.subplots(figsize=(5, 5))
pie = ax.pie(
    [negative_count, positive_count],
    labels=["0", "1"],
    colors=["orange", "blue"],
    autopct='%1.1f%%',
    textprops={"fontsize": 15},
)
ax.axis("equal")  # equal aspect ratio keeps the pie circular
ax.set_title("Target distribuition\n", fontsize=18)
fig.set_facecolor('white')
plt.show();

*****************
WORK IN PROGRESS
*****************
## Filling NA values
Some ML algorithms do not support NA values in the dataframe.
I'd like to compare the effectiveness of several filling methods using an untuned XGBoost classifier (XGBC):

* Filling all NA with zeros
* Filling all NA with the mean value of each feature
* Filling all NA with the median value of each feature
* Applying an ML algorithm to search for an appropriate value [is it worth it???]

XGBC can handle a DataFrame containing NA values, so let's first try to fit it without any NA handling.

# XGBC without NA handling

In [None]:
%%time
splits = 6
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
oof_preds = np.zeros((X.shape[0],))
preds = 0
model_fi = 0
total_mean_rmse = 0

for fold, (train_indicies, valid_indicies) in enumerate(skf.split(X,y)):
    
    X_train, X_valid = X.loc[train_indicies], X.loc[valid_indicies]
    y_train, y_valid = y.loc[train_indicies], y.loc[valid_indicies]
    print(fold, f"X_train = {X_train.shape} - y_train: {y_train.shape}")
    print(fold, f"X_valid = {X_valid.shape} - y_valid: {y_valid.shape}")
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              verbose=False)
    print("fitted")
    preds += model.predict(X_test) / splits
    print(preds.shape)
    print("preds ok")
    model_fi += model.feature_importances_
    print("model_fi ok")
    oof_preds[valid_indicies] = model.predict(X_valid)
    print(oof_preds)
    oof_preds[oof_preds < 0] = 0
#     fold_rmse = np.sqrt(mean_squared_error(y_scaler.inverse_transform(np.array(y_valid).reshape(-1,1)), y_scaler.inverse_transform(np.array(oof_preds[valid_idx]).reshape(-1,1))))
    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_indicies]))
    print(f"Fold {fold} RMSE: {fold_rmse}")
#         print(f"Trees: {model.tree_count_}")
    total_mean_rmse += fold_rmse / splits
print(f"\nOverall RMSE: {total_mean_rmse}") 

# XGBC with Zeros  NA values

We will use sklearn's `SimpleImputer`.
We will fit the `SimpleImputer` on the train set only, then apply it to both the train and test sets to avoid data leakage.
https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Imputer configured to replace missing values with the constant 0
imputer_zeros = SimpleImputer(strategy="constant", fill_value = 0)