# Hi Kagglers 🙋🏻‍♂️ and welcome to this competition!

## Let's import some libraries


In [None]:
# Walk the Kaggle input directory and print the full path of every file in it.
import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, fname) for fname in files):
        print(full_path)

In [None]:
import random
from xgboost import XGBClassifier
from scipy import stats
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,KFold

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import numpy as np 
import pandas as pd 
from xgboost import plot_importance,plot_tree,to_graphviz

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the three competition CSVs in one pass: the training set, the test set
# and the sample submission (used later as the template for the output file).
train, test, sub = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    ),
)

# Let's do some Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows of the training set to see its columns and value ranges.
train.head()

In [None]:
# Preview the first rows of the test set for comparison with the training set.
test.head()

In [None]:
# Print pandas' structural summary of the training frame (DataFrame.info()).
train.info()

In [None]:
# Print pandas' structural summary of the test frame (DataFrame.info()).
test.info()

In [None]:
# Count the missing values in each column of the training set.
train.isna().sum()

In [None]:
# Count the missing values in each column of the test set.
test.isna().sum()

In [None]:
# Show pandas' descriptive statistics for the training frame (DataFrame.describe()).
train.describe()

In [None]:
# Names of the feature columns used throughout the notebook: 'f1' through 'f118'.
cols = [f'f{idx}' for idx in range(1, 119)]

In [None]:
# Numerical features distribution: part 1 (the first 60 feature columns).
# For each feature the train (blue) and test (olive) histograms are overlaid on
# the same axes so the two distributions can be compared directly.
features_part1 = cols[:60]
# Build the 15x4 grid once and draw on its axes explicitly. A bare plt.figure()
# before plt.subplots() only opens an additional, empty figure.
fig, axes = plt.subplots(15, 4, figsize=(20, 22))
axes = axes.ravel()
for ax, feature in zip(axes, features_part1):
    sns.histplot(train[feature], color="blue", kde=True, bins=100, label='train_'+feature, ax=ax)
    sns.histplot(test[feature], color="olive", kde=True, bins=100, label='test_'+feature, ax=ax)
    ax.set_xlabel(feature, fontsize=9)
    ax.legend()
# Hide grid cells that did not receive a feature (none when 60 features fill the grid).
for unused_ax in axes[len(features_part1):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
# Numerical features distribution: part 2 (the feature columns after the first 60).
# For each feature the train (blue) and test (olive) histograms are overlaid on
# the same axes so the two distributions can be compared directly.
features_part2 = cols[60:]
# Build the 15x4 grid once and draw on its axes explicitly. A bare plt.figure()
# before plt.subplots() only opens an additional, empty figure.
fig, axes = plt.subplots(15, 4, figsize=(20, 22))
axes = axes.ravel()
for ax, feature in zip(axes, features_part2):
    sns.histplot(train[feature], color="blue", kde=True, bins=100, label='train_'+feature, ax=ax)
    sns.histplot(test[feature], color="olive", kde=True, bins=100, label='test_'+feature, ax=ax)
    ax.set_xlabel(feature, fontsize=9)
    ax.legend()
# This slice holds fewer features than the grid has cells, so hide the
# leftover axes instead of showing empty frames.
for unused_ax in axes[len(features_part2):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
# Features correlation
# Pairwise correlation between every feature column and the `claim` target,
# displayed as a colour-graded table with values shown to two decimals.
corr = train[cols+['claim']].corr()
# Styler.set_precision() was deprecated in pandas 1.3 and removed in pandas 2.0;
# Styler.format(precision=...) is the supported way to set the display precision.
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

### Target distribution

In [None]:
# Number of training rows for each value of the `claim` target.
train['claim'].value_counts()

In [None]:
# Target distribution: bar chart of the number of training rows per `claim` value.
sns.catplot(
    data=train,
    x="claim",
    kind="count",
    palette="ch:.25",
)

# Let's try XGBoost

In [None]:
# Hyperparameters passed to XGBClassifier(**params) for every cross-validation fold.
# NOTE(review): the fractional regularisation values look like the output of a
# hyperparameter search — the source of these numbers is not shown in this notebook.
params = {'objective':'binary:logistic',  # binary classification with probability output
'tree_method':'gpu_hist',  # GPU histogram tree builder; presumably requires a GPU runtime — confirm
'random_state': 48,
'n_estimators': 30000,  # upper bound on boosting rounds; the fit call uses early stopping
'lambda': 0.18427562536318878,  # L2 regularisation weight (per the XGBoost parameter docs)
'alpha': 0.027053317312588015,  # L1 regularisation weight (per the XGBoost parameter docs)
'colsample_bytree': 0.1,  # fraction of columns sampled for each tree
'subsample': 0.8,  # fraction of rows sampled for each tree
'learning_rate': 0.01,
'max_depth': 7,
'min_child_weight': 14,
'use_label_encoder':False}  # presumably silences the label-encoder deprecation warning — verify for the installed xgboost version

In [None]:
# 10-fold stratified cross-validation.
# Each fold trains a fresh XGBClassifier on the fitting part, early-stops on the
# held-out part's AUC, records that fold's AUC, and adds 1/n_splits of its
# test-set probabilities to `preds`, so `preds` ends up as the fold average.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=48)
features, target = train[cols], train['claim']
test_features = test[cols]
preds = np.zeros(test.shape[0])  # accumulated average of test-set probabilities
auc = []  # validation AUC of each fold, in fold order
for n, (fit_idx, val_idx) in enumerate(kf.split(features, target), start=1):
    X_tr, y_tr = features.iloc[fit_idx], target.iloc[fit_idx]
    X_val, y_val = features.iloc[val_idx], target.iloc[val_idx]
    # `model` is rebound on every iteration, so after the loop it refers to the
    # classifier trained on the last fold.
    model = XGBClassifier(**params)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        early_stopping_rounds=100,
        verbose=False,
    )
    preds += model.predict_proba(test_features)[:, 1] / kf.n_splits
    fold_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    auc.append(fold_auc)
    print(f"fold: {n}, auc: {fold_auc}")

In [None]:
# Average of the per-fold validation AUC values collected in `auc`.
np.mean(auc)  

In [None]:
# Plot the 40 most important features by 'weight' importance.
# `model` here is whatever classifier the cross-validation loop bound last,
# i.e. the one trained on the final fold.
fig, ax = plt.subplots(figsize=(10, 10))
plot_importance(
    model,
    ax=ax,
    importance_type='weight',
    max_num_features=40,
    height=0.5,
)
plt.show()

#### The [XGBoost Python API](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.plotting) provides a function for plotting decision trees within a trained XGBoost model.
Let's plot the 100th boosted tree (`num_trees=99`, because tree indices are zero-based).

In [None]:
# Draw one boosted tree of `model` (index 99) left-to-right on a large canvas
# so that the node labels stay readable.
fig, ax = plt.subplots(figsize=(44, 44))
plot_tree(
    model,
    ax=ax,
    num_trees=99,
    rankdir='LR',
)
plt.show()

# Making a Submission

In [None]:
# Put the fold-averaged test probabilities into the sample-submission frame's
# `claim` column and write the result to submission.csv without the index.
sub = sub.assign(claim=preds)
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as the cell output for a final visual check.
sub

# I hope that you find this kernel useful 🏄