# Hi kagglers 🙋🏻‍♂️ and Welcome to this new competition!

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import random
from sklearn.metrics import mean_squared_error,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from lightgbm import LGBMClassifier

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import preprocessing
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the training set, the test set and the sample submission in one pass.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv',
    )
)

# Exploratory Data Analysis

In [None]:
# Report the number of rows in each data split.
for label, frame in (('Train size: ', train), ('Test size: ', test)):
    print(label, len(frame))

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Column dtypes and non-null counts of the training set.
train.info()

In [None]:
# Column dtypes and non-null counts of the test set.
test.info()

In [None]:
# Count the missing values in each column of the training set
train.isnull().sum()

In [None]:
# Count the missing values in each column of the test set
test.isnull().sum()

In [None]:
# Frequency of each distinct SibSp value in the training set.
train.SibSp.value_counts()

In [None]:
# Frequency of each distinct Parch value in the training set.
train.Parch.value_counts()

In [None]:
# Summary statistics of the Cabin column (excluded from the features later on).
train.Cabin.describe()

In [None]:
# Summary statistics of the Ticket column (excluded from the features later on).
train.Ticket.describe()

In [None]:
# Numerical features distribution: train vs. test histograms, one panel per feature.
# The axes are created once by plt.subplots and passed explicitly to seaborn, so no
# extra empty figure is opened (the previous plt.figure() call produced one).
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
for ax, feature in zip(axes, ['Age', 'Fare']):
    sns.histplot(train[feature], color="blue", kde=True, bins=100, label='train', ax=ax)
    sns.histplot(test[feature], color="olive", kde=True, bins=100, label='test', ax=ax)
    ax.set_xlabel(feature, fontsize=9)
    ax.legend()
plt.show()

## Impute missing values

In [None]:
# Fill missing values. Each frame is imputed with a statistic computed on that
# same frame (previously train['Fare'] was filled with the *test* median).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection, which is chained assignment and
# does not update the frame under pandas Copy-on-Write.
for frame in (train, test):
    # Categorical columns: most frequent value.
    for col in ('Embarked', 'Sex'):
        frame[col] = frame[col].fillna(frame[col].mode()[0])

    # Age: mean of the observed values.
    frame['Age'] = frame['Age'].fillna(frame['Age'].mean())

    # Fare: median of the observed values.
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].median())

In [None]:
# Categorical features distribution: train vs. test histograms, one panel per feature.
# Axes come from a single plt.subplots call (no stray empty figure), and the
# panels of the 3x2 grid that have no feature to show are hidden.
cat_features = ['Sex', 'Embarked', 'SibSp', 'Pclass', 'Parch']
fig, axes = plt.subplots(3, 2, figsize=(20, 16))
panels = axes.flatten()
for ax, feature in zip(panels, cat_features):
    sns.histplot(train[feature], color="blue", label='train', ax=ax)
    sns.histplot(test[feature], color="olive", label='test', ax=ax)
    ax.set_xlabel(feature, fontsize=9)
    ax.legend()
for unused_ax in panels[len(cat_features):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
# Target distribution: number of training rows per Survived value
sns.catplot(x="Survived", kind="count", palette="ch:.25", data=train)

* As we can see, the target classes are imbalanced, which is why I'll use StratifiedKFold to split the data.

* I'm not going to use the Cabin, Ticket, Name and PassengerId attributes as features in our training data.

In [None]:
# Feature columns: every training column except the ones listed below
# (identifier, dropped attributes and the Survived target).
non_features = ('PassengerId', 'Cabin', 'Ticket', 'Survived', 'Name')
columns = [name for name in train.columns if name not in non_features]

In [None]:
# Features correlation.
# Only numeric columns are correlated: `columns` still contains the string
# columns Sex and Embarked, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0. Styler.format(precision=...) replaces Styler.set_precision,
# which was removed in pandas 2.0.
corr = train[columns + ['Survived']].select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

# One-Hot Encoding of Categorical Features

In [None]:
# Encode train and test together so both end up with the same dummy columns,
# then split the stacked frame back at the original train/test boundary.
train_objs_num = len(train)
dataset = pd.concat([train[columns], test[columns]], axis=0)
dataset_preprocessed = pd.get_dummies(
    dataset,
    columns=['Sex', 'Embarked', 'Parch', 'SibSp'],
)
train_preprocessed = dataset_preprocessed.iloc[:train_objs_num]
test_preprocessed = dataset_preprocessed.iloc[train_objs_num:]

In [None]:
# Display the one-hot encoded training features.
train_preprocessed

In [None]:
# Display the one-hot encoded test features.
test_preprocessed

# Modeling

In [None]:
# Hyper-parameters forwarded to LGBMClassifier(**params) in the training cell.
params = dict(
    # Task definition
    objective="binary",
    metric="binary_logloss",
    # Boosting schedule: n_estimators is only an upper bound, because the
    # training cell stops early on the validation fold.
    n_estimators=20000,
    learning_rate=0.02,
    # Tree shape
    max_depth=100,
    num_leaves=36,
    min_child_samples=271,
    # Row / column sampling
    colsample_bytree=0.9,
    subsample=0.4,
    bagging_freq=4,
    # Regularisation and smoothing
    reg_alpha=0.025698237956455088,
    reg_lambda=0.2384750191428652,
    cat_smooth=55,
    # Reproducibility
    random_state=48,
)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold
# Early stopping is configured through a callback: LightGBM >= 4.0 no longer
# accepts `early_stopping_rounds` / `verbose` as arguments of fit().
from lightgbm import early_stopping

preds = np.zeros(test.shape[0])  # kept from the original cell; not used by the fold voting
kf = StratifiedKFold(n_splits=5, random_state=48, shuffle=True)
accuracy = []  # validation accuracy of each fold

# Stratified 5-fold cross-validation. For every fold:
#   * fit a fresh model on the training part, stopping once the validation
#     metric has not improved for 200 rounds;
#   * store the class predictions for the test set in sub['<fold number>'];
#   * record the accuracy on the held-out validation part.
# After the loop, `model`, `X_val` and `y_val` refer to the last fold.
for n, (trn_idx, val_idx) in enumerate(kf.split(train[columns], train['Survived'])):
    X_tr, X_val = train_preprocessed.iloc[trn_idx], train_preprocessed.iloc[val_idx]
    y_tr, y_val = train['Survived'].iloc[trn_idx], train['Survived'].iloc[val_idx]

    model = LGBMClassifier(**params)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[early_stopping(stopping_rounds=200, verbose=False)],
    )

    sub[str(n)] = model.predict(test_preprocessed)
    accuracy.append(accuracy_score(y_val, model.predict(X_val)))
    print(n + 1, accuracy[n])

In [None]:
# Average validation accuracy over the cross-validation folds.
np.mean(accuracy)

In [None]:
# Top 10 most important features of the LightGBM model fitted on the last CV fold.
# plot_importance is imported from lightgbm directly; going through
# optuna.integration added an unnecessary dependency on Optuna.
import lightgbm as lgb
lgb.plot_importance(model, max_num_features=10, figsize=(10, 10))
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Classification report for the model and validation split left over from the
# last cross-validation fold.
last_fold_pred = model.predict(X_val)
print(classification_report(y_val, last_fold_pred))

# Making a Submission

In [None]:
# Submission frame with one extra prediction column per CV fold ('0' .. '4').
sub

In [None]:
# Majority vote over the five per-fold prediction columns: the row-wise mode is
# the class predicted by most folds (five binary votes cannot tie).
fold_columns = ['0', '1', '2', '3', '4']
majority_vote = sub[fold_columns].mode(axis=1)[0]

# Build the final frame with assign() rather than writing into a column slice
# of `sub`, which triggered SettingWithCopyWarning.
sub = sub[['PassengerId']].assign(Survived=majority_vote.astype(int))
sub.to_csv('submission.csv', index=False)

In [None]:
# Final submission frame: PassengerId and the voted Survived label.
sub

# I hope that you find this kernel useful 🏄