# Importing libraries📚

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn import metrics, model_selection
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the train table, the test table and the sample submission from the
# `../input/tabular-playground-series-sep-2021` directory.
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
sample_sub = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

In [None]:
# Report the (rows, columns) shape of each of the three loaded tables.
print(f'Train shape: {train.shape}, \nTest shape: {test.shape}, \nSubmission shape: {sample_sub.shape}')

In [None]:
# Display summary statistics for the training table.
train.describe()

In [None]:
# Print the training table's column overview (dtypes, non-null counts, memory).
train.info()

In [None]:
# Count the missing values in each training column.
train.isna().sum()

> We have a large number of missing values.

# Reducing memory usage

In [None]:
def reduce_mem_usage(df, *, allow_float16=True):
    """Downcast each column of ``df`` to the smallest dtype that holds its values.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Rules per column:

    * object / pandas string columns are converted to ``category``;
    * signed and unsigned integer columns are narrowed to the smallest
      integer type of the same signedness whose range (bounds included)
      covers the column's min and max;
    * float columns are narrowed to float16 (if allowed) or float32 when the
      min and max fit; a column is never widened, and columns whose min/max
      are NaN or infinite are left untouched;
    * every other dtype (bool, datetime, categorical, nullable extension
      types, ...) is left unchanged, because a min/max based numeric
      downcast is either meaningless or lossy for them.

    Args:
        df: DataFrame to shrink. Mutated in place.
        allow_float16: When True (default) float columns may become float16,
            which keeps only about three significant decimal digits. Pass
            False to stop at float32.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, ordered from narrowest to widest.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = ((np.float16,) if allow_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int / uint / float columns are safe to downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                # Never widen a column that is already this narrow.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.finfo(candidate)
                # NaN min/max (all-missing column) fails both comparisons,
                # so such a column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Downcast both tables; reduce_mem_usage returns the frame it was given, so
# each name is rebound to the same (now smaller) DataFrame.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Let's plot `claim`

In [None]:
# Horizontal bar chart of the `claim` value counts, with every bar labelled
# by its percentage share of the total.
plt.rcParams["figure.figsize"] = (12, 5)
claim_counts = train['claim'].value_counts().sort_values()
ax = claim_counts.plot(kind="barh")

# Bar widths are the counts; their sum is the denominator for the percentages.
totals = [bar.get_width() for bar in ax.patches]
total = sum(totals)

for bar in ax.patches:
    share = round((bar.get_width() / total) * 100, 2)
    ax.text(
        bar.get_width() + .3,
        bar.get_y() + .20,
        str(share) + '%',
        fontsize=10,
        color='black',
    )

ax.grid(axis="x")
plt.suptitle('Claim', fontsize=20)
plt.show()

# Missing value imputation

In [None]:
# Reference only: a SimpleImputer usage example kept as a bare string literal.
# It is not executed and does not touch the data.
'''
>>> from sklearn.impute import SimpleImputer
>>> imp = SimpleImputer(missing_values=np.nan, strategy='mean')
>>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])
SimpleImputer()
>>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
>>> print(imp.transform(X))
[[4.          2.        ]
 [6.          3.666...]
 [7.          6.        ]]
'''

In [None]:
# Keep the target as an integer Series; the imputation step that follows
# drops the `claim` column from the training table.
y = train.claim.astype(int)

In [None]:
# Mean imputation: the per-column means are learned from the training
# features only (`id` and `claim` dropped) and then reused to fill the test
# features (`id` dropped), so no test statistics leak into the fit.
# `train` and `test` are rebound to the imputer's output; the later code
# indexes them as 2-D arrays (`train[idx, :]`).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train = imputer.fit_transform(train.drop(['id', 'claim'], axis=1))
test = imputer.transform(test.drop('id', axis=1))

# KFold 

In [None]:
# Stratified 5-fold splitter, shuffled with a fixed seed for reproducibility.
kfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Stratified K-fold training loop.
# For every fold: fit an XGBoost classifier, score the held-out part with
# ROC AUC, draw its ROC curve, and store the model's hard class predictions
# for the test set (combined by majority vote in the submission step).
test_preds = []
oof_auc = []
tprs, aucs = [], []
# Common FPR grid on which every fold's ROC curve is resampled so the curves
# can be averaged point-wise.
mean_fpr = np.linspace(0,1,100)

for fold, (train_idx, val_idx) in enumerate(kfold.split(train, y)):
    # `train` is a NumPy array (imputer output); `y` is a pandas Series, so
    # use positional .iloc indexing rather than label-based `y[...]`.
    X_train, y_train = train[train_idx, :], y.iloc[train_idx]
    X_val, y_val = train[val_idx, :], y.iloc[val_idx]

    # A different seed per fold; the GPU settings presumably need a
    # GPU-enabled XGBoost build - confirm for the target environment.
    model = XGBClassifier(n_estimators=500, random_state=fold,predictor='gpu_predictor',
                          tree_method='gpu_hist',eval_metric = 'auc')
    model.fit(X_train, y_train)

    # Positive-class probabilities drive the ROC curve / AUC for this fold.
    preds = model.predict_proba(X_val)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_val, preds)
    # np.interp replaces scipy.interp, a deprecated alias of the same NumPy
    # function that is no longer available in current SciPy releases.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = metrics.auc(fpr, tpr)
    oof_auc.append(roc_auc)
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (fold, roc_auc))

    # Hard labels (not probabilities) are stored for the later majority vote.
    test_preds.append(model.predict(test))

# Chance diagonal and the point-wise mean ROC curve across folds.
plt.plot([0,1], [0,1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = metrics.auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', label=r'Mean ROC (AUC=%0.2f )'%(mean_auc), lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('K-Fold Validation')
plt.legend(loc="lower right")
plt.show()

# Submission

In [None]:
# Majority vote: most frequent predicted label per test row across the
# per-fold prediction arrays (presumably taken along axis 0, the fold axis -
# confirm against the installed SciPy's default).
mode = scipy.stats.mode(test_preds)

In [None]:
# mode[0] holds the modal values; they are reshaped to one value per row and
# written into the submission's `claim` column.
# NOTE(review): these are hard 0/1 labels - if the competition is scored on
# ROC AUC, averaged probabilities would presumably score better; confirm.
sample_sub.claim = np.array(mode[0]).reshape(-1, 1)
sample_sub.head()

In [None]:
# Write the submission file without the row index. `index` is documented as
# a bool, so pass False explicitly instead of relying on None being falsy.
sample_sub.to_csv('submission.csv', index=False)