# 🚀 Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder,OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# 🚀 Data Loading

## 👉 Load train and test dataset

In [None]:
# Read train.csv from the dataset directory and preview its first rows.
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
train.head()

In [None]:
# Read test.csv from the same directory and preview its first rows.
test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
test.head()

## 👉 Create another copy of train dataset

In [None]:
# Independent copy of the raw training frame; later cells read 'Survived'
# and 'PassengerId' from it after `train` itself has been modified.
train_copy = train.copy()

## 👉 Drop 'Survived' feature from train dataset

In [None]:
# Remove the 'Survived' column from `train` in place (it stays available in train_copy).
train.drop('Survived', axis=1, inplace = True)

## 👉 Concatenating train and test dataset

In [None]:
# Stack train on top of test and renumber the rows 0..n-1, so that the
# preprocessing below is applied to both sets at once.
data = pd.concat([train,test], ignore_index=True)
data.head()

# 🚀 Exploratory Data Analysis (EDA)

## 👉 Check data information

In [None]:
# Print column dtypes and non-null counts of the combined frame.
data.info()

## 👉 Check data description

In [None]:
# Summary statistics of the combined frame (describe() with default settings).
data.describe()

## 👉 Check null values in dataset

In [None]:
# Number of missing values in each column of the combined frame.
data.isnull().sum()

## 👉 Visualizing the features

In [None]:
# Count plot of the 'Survived' values in the training data.
plt.figure(figsize=(7,5))
sns.countplot(x='Survived', data=train_copy)

In [None]:
# Count plot of the 'Sex' values.
plt.figure(figsize=(7,5))
sns.countplot(x='Sex', data=train_copy)

In [None]:
# Count plot of the 'Embarked' values.
plt.figure(figsize=(7,5))
sns.countplot(x='Embarked', data=train_copy)

In [None]:
# Count plot of the 'Pclass' values.
plt.figure(figsize=(7,5))
sns.countplot(x='Pclass', data=train_copy)

In [None]:
# Pairwise scatter plots of Age, Fare, SibSp and Parch coloured by 'Survived',
# with histograms on the diagonal; corner=True draws only the lower triangle.
sns.pairplot(train_copy, vars=['Age','Fare','SibSp','Parch'], hue='Survived', 
             kind='scatter',diag_kind='hist', corner= True, dropna=True )

In [None]:
# Violin plot of 'Survived' for each 'Embarked' value.
sns.catplot(x="Embarked", y="Survived",  kind="violin", data=train_copy)

# 🚀 Data Preprocessing

## 👉 Changing datatype of 'Pclass' from int to object

In [None]:
# Store 'Pclass' with object dtype so it is handled as a category label
# rather than a number in the steps that follow.
data['Pclass'] = data['Pclass'].astype('object')

## 👉 Imputing missing values for all features

In [None]:
age_mean = data['Age'].mean()
data['Age'].fillna(age_mean, inplace = True)
data = data.round({'Age' : 2})

In [None]:
fare_mean = data['Fare'].mean()
data['Fare'].fillna(fare_mean, inplace = True)

In [None]:
data['Embarked'].fillna('X', inplace = True)

In [None]:
data['Ticket'].fillna('X', inplace = True)

In [None]:
data['Cabin'].fillna('X', inplace = True)

# 🚀 Feature Engineering

## 👉 Create new feature 'First' and 'Last' name from 'Name'

In [None]:
# Split 'Name' at its first comma into two new columns. n=1 caps the result
# at two parts, so a name containing a second comma can no longer produce a
# third column and break the two-column assignment.
# NOTE(review): 'First' receives the text BEFORE the comma and 'Last' the
# text after it. If names are stored as "Surname, Given" the labels are
# swapped relative to their meaning -- confirm against the data.
data[['First','Last']] = data.Name.str.split(',', n=1, expand=True)

## 👉 Create new feature 'Class' from 'Cabin'

In [None]:
# 'Class' is the first character of the cabin string; rows whose cabin was
# missing were filled with 'X' earlier and therefore get 'X'.
data['Class'] = data['Cabin'].str[:1]

## 👉 Create new feature 'Family' from 'SibSp' and 'Parch'

In [None]:
# Family size: siblings/spouses plus parents/children plus one
# (presumably the passenger themselves).
data['Family'] = data['SibSp'] + data['Parch'] + 1

## 👉 Drop unwanted columns 

In [None]:
# Drop 'Name' (already split), 'First', 'Cabin' (reduced to 'Class') and
# 'PassengerId' from the feature frame, in place.
data.drop(['Name','First','Cabin','PassengerId'], axis=1, inplace = True)

# 🚀 Feature Selection

In [None]:
# Correlation matrix of the numeric (and boolean) columns only. `data` still
# contains text columns at this point, and calling corr() on them raises in
# pandas >= 2.0; selecting the numeric columns first works on every version.
cor = data.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(cor, annot = True)

#### Note: We will use all the features in model building since there is no strong correlation between the independent features (threshold = 0.9).

# 🚀 Data Transformation

In [None]:
# Preview the frame before the categorical columns are encoded.
data.head()

## 👉 Label Encoding for 'Ticket', 'Last' and 'Class' features

In [None]:
# Replace the text values of each listed column with integer codes. One
# encoder object is re-fitted for every column in turn, so afterwards it
# holds the fit of the last column only.
label = LabelEncoder()
for column in ('Ticket', 'Last', 'Class'):
    data[column] = label.fit_transform(data[column])

## 👉 One-Hot Encoding for 'Sex' and 'Embarked' features

In [None]:
# One indicator column per distinct value of 'Sex' and 'Embarked'
# (the 'X' placeholder used for missing 'Embarked' values gets its own column).
data_onehot = pd.get_dummies(data[['Sex','Embarked']])
data_onehot.head()

## 👉 Ordinal Encoding for 'Pclass' feature

In [None]:
# Encode 'Pclass' with the OrdinalEncoder created here. Previously the
# encoder was instantiated but never used, and a LabelEncoder (intended for
# target values) was applied instead. OrdinalEncoder expects a 2-D input and
# returns floats, hence the double brackets, ravel() and the cast back to
# int; the resulting codes follow the sorted class values as before.
ordinal = OrdinalEncoder()
data['Pclass'] = ordinal.fit_transform(data[['Pclass']]).ravel().astype(int)

## 👉 Concatenating One-Hot encoded features with original data

In [None]:
# Append the indicator columns to the right of the existing columns.
data = pd.concat([data,data_onehot], axis=1)
data.head()

## 👉 Dropping unwanted features

In [None]:
# The original text columns are no longer needed once their indicator
# columns have been added.
data.drop(['Sex','Embarked'], axis=1, inplace=True)

In [None]:
# Preview the fully encoded frame.
data.head()

## 👉 Separating the train and test data

In [None]:
# `data` was built as train rows followed by test rows, so the first
# len(train_copy) rows are the training set and the remainder is the test
# set. Taking the boundary from train_copy avoids a hard-coded row count.
n_train_rows = len(train_copy)
train = data.iloc[:n_train_rows]
test = data.iloc[n_train_rows:]

## 👉 Concatenating 'Survived' column from train_copy to train dataset

In [None]:
# Re-attach the target as the last column; rows are matched on the index
# shared by `train` and `train_copy`.
train = pd.concat([train,train_copy['Survived']], axis=1)
train.head()

## 👉 Separating Independent and dependent features from train data

In [None]:
# Select features and target by column name rather than by fixed position,
# so the split stays correct if the number of engineered columns changes.
# y is a 1-D Series, the shape scikit-learn estimators expect for a single
# target (a one-column DataFrame triggers a conversion warning on fit).
X = train.drop(columns='Survived')
y = train['Survived']

## 👉 Splitting the train data into training and validation

In [None]:
# Hold out 20% of the training rows for validation; random_state fixes the shuffle.
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🚀 Model Building

## 👉 Logistic Regression

### 🏹 Building pipeline for Logistic Regression model

In [None]:
pipe_log_reg = Pipeline([
    ('scaler',StandardScaler()),
    ('log_reg', LogisticRegression())
])

In [None]:
pipe_log_reg.fit(X_train,y_train)

In [None]:
logistic_regression_score = pipe_log_reg.score(X_test,y_test)
logistic_regression_score

### 🏹 hyperparameter tuning for logistic regression pipeline

In [None]:
# Search space for the 'log_reg' pipeline step. The only solver searched is
# liblinear, which supports the 'l1' and 'l2' penalties; 'elasticnet' and
# 'none' cannot be fitted with it and would only add failed candidates to
# the grid search, so they are not listed.
parameters = {
    'log_reg__penalty': ['l1', 'l2'],
    'log_reg__C': [0.001, 0.01, 0.1, 1.0, 10, 100],
    'log_reg__solver': ['liblinear'],
}

In [None]:
# Exhaustive search over `parameters` using GridSearchCV's default
# cross-validation and scoring settings, fitted on the training split.
log_reg_search = GridSearchCV(pipe_log_reg, param_grid=parameters)
log_reg_search.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
log_reg_search.best_params_

In [None]:
# The mean cross-validated score of that combination.
log_reg_search.best_score_

In [None]:
# Pipeline configured with the best parameters found by the search.
pipe_log_reg_tuned = log_reg_search.best_estimator_
print(pipe_log_reg_tuned)

### 🏹 Modeling using the new parameter for Logistic Regression

In [None]:
# Fit the tuned pipeline on the training split.
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the data passed to the search, so this repeats that fit.
pipe_log_reg_tuned.fit(X_train,y_train)

In [None]:
# Score of the tuned pipeline on the validation split.
logistic_regression_tuned_score = pipe_log_reg_tuned.score(X_test,y_test)
logistic_regression_tuned_score

## 👉 Random Forest Classifier

### 🏹 Building pipeline for Random Forest Classifier model

In [None]:
pipe_rf = Pipeline([
    ('scaler',StandardScaler()),
    ('rf', RandomForestClassifier())
])

In [None]:
pipe_rf.fit(X_train,y_train)

In [None]:
random_forest_score = pipe_rf.score(X_test,y_test)
random_forest_score

### 🏹 hyperparameter tuning for Random Forest Classifier pipeline

In [None]:
# Search space for the random-forest pipeline. Every key carries the 'rf__'
# prefix, i.e. the name of the pipeline step the setting belongs to.
parameters = dict(
    rf__n_estimators=[100, 150],
    rf__max_depth=[5, 10, 15],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
)

In [None]:
# Exhaustive search over `parameters` using GridSearchCV's default
# cross-validation and scoring settings, fitted on the training split.
rf_search = GridSearchCV(pipe_rf, param_grid=parameters)
rf_search.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
rf_search.best_params_

In [None]:
# The mean cross-validated score of that combination.
rf_search.best_score_

In [None]:
# Pipeline configured with the best parameters found by the search.
pipe_rf_tuned = rf_search.best_estimator_
print(pipe_rf_tuned)

### 🏹 Modeling using the new parameter for Random Forest Classifier

In [None]:
# Fit the tuned pipeline on the training split.
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the data passed to the search, so this repeats that fit.
pipe_rf_tuned.fit(X_train,y_train)

In [None]:
# Score of the tuned pipeline on the validation split.
rf_tuned_score = pipe_rf_tuned.score(X_test,y_test)
rf_tuned_score

## 👉 LightGBM

### 🏹 Building pipeline for LightGBM model

In [None]:
pipe_lgbm = Pipeline([
    ('scaler',StandardScaler()),
    ('lgbm', LGBMClassifier())
])

In [None]:
pipe_lgbm.fit(X_train,y_train)

In [None]:
lgbm_score = pipe_lgbm.score(X_test,y_test)
lgbm_score

### 🏹 hyperparameter tuning for LightGBM pipeline

In [None]:
# Search space for the LightGBM pipeline. Every key carries the 'lgbm__'
# prefix, i.e. the name of the pipeline step the setting belongs to.
parameters = dict(
    lgbm__objective=['binary'],
    lgbm__boosting_type=['gbdt'],
    lgbm__learning_rate=[0.0001, 0.001, 0.01, 0.1],
    lgbm__num_leaves=[40, 50, 60, 70],
    lgbm__min_child_samples=[10, 20, 30],
    # Further candidates, currently left out of the search:
    # lgbm__min_child_weight=[0.01, 0.001, 0.0001, 0.00001],
    # lgbm__reg_alpha=[0.1, 0.2, 0.4, 0.6, 0.8],
    # lgbm__reg_lambda=[0.1, 0.2, 0.4, 0.6, 0.8],
)

In [None]:
# Exhaustive search over `parameters` using GridSearchCV's default
# cross-validation and scoring settings, fitted on the training split.
lgbm_search = GridSearchCV(pipe_lgbm, param_grid=parameters)
lgbm_search.fit(X_train,y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
lgbm_search.best_params_

In [None]:
# The mean cross-validated score of that combination.
lgbm_search.best_score_

In [None]:
# Pipeline configured with the best parameters found by the search.
pipe_lgbm_tuned = lgbm_search.best_estimator_
print(pipe_lgbm_tuned)

### 🏹 Modeling using the new parameter for LightGBM

In [None]:
pipe_lgbm_tuned.fit(X_train,y_train)

In [None]:
lgbm_tuned_score = pipe_rf_tuned.score(X_test,y_test)
lgbm_tuned_score

# 🚀  Model Metrics

## 👉 Accuracy score and Confusion matrix for Logistic regression

In [None]:
lr_y_pred = pipe_log_reg.predict(X_test)

In [None]:
lr_accuracy = accuracy_score(y_test, lr_y_pred)
lr_accuracy

In [None]:
lr_cm = confusion_matrix(y_test, lr_y_pred)
lr_cm

In [None]:
lr_tuned_y_pred = pipe_log_reg_tuned.predict(X_test)

In [None]:
lr_tuned_accuracy = accuracy_score(y_test, lr_tuned_y_pred)
lr_tuned_accuracy

In [None]:
lr_cm = confusion_matrix(y_test, lr_tuned_y_pred)
lr_cm

## 👉 Accuracy score and Confusion matrix for Random Forest Classifier

In [None]:
# Validation-split predictions of the out-of-box random-forest pipeline.
rf_y_pred = pipe_rf.predict(X_test)

In [None]:
# Fraction of validation rows predicted correctly.
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_accuracy

In [None]:
# Confusion matrix of true labels against predictions.
rf_cm = confusion_matrix(y_test, rf_y_pred)
rf_cm

In [None]:
# Validation-split predictions of the tuned random-forest pipeline.
rf_tuned_y_pred = pipe_rf_tuned.predict(X_test)

In [None]:
rf_tuned_accuracy = accuracy_score(y_test, rf_tuned_y_pred)
rf_tuned_accuracy

In [None]:
rf_tuned_cm = confusion_matrix(y_test, rf_tuned_y_pred)
rf_tuned_cm

## 👉  Accuracy score and Confusion matrix for LightGBM

In [None]:
# Validation-split predictions of the out-of-box LightGBM pipeline.
lgbm_y_pred = pipe_lgbm.predict(X_test)

In [None]:
# Fraction of validation rows predicted correctly.
lgbm_accuracy = accuracy_score(y_test, lgbm_y_pred)
lgbm_accuracy

In [None]:
# Confusion matrix of true labels against predictions.
lgbm_cm = confusion_matrix(y_test, lgbm_y_pred)
lgbm_cm

In [None]:
# Validation-split predictions of the tuned LightGBM pipeline.
lgbm_tuned_y_pred = pipe_lgbm_tuned.predict(X_test)

In [None]:
lgbm_tuned_accuracy = accuracy_score(y_test, lgbm_tuned_y_pred)
lgbm_tuned_accuracy

In [None]:
lgbm_tuned_cm = confusion_matrix(y_test, lgbm_tuned_y_pred)
lgbm_tuned_cm

# 🚀 Model Selection

## 👉 Compare the accuracy for all models

In [None]:
# Validation accuracy of every model, out-of-box versus tuned, with one row
# per model; the row index is named 'Models'.
model_names = pd.Index(['Logistic Regression', 'Random Forest', 'LightGBM'], name='Models')
accuracy_df = pd.DataFrame(
    {
        'Out-of-Box': [lr_accuracy, rf_accuracy, lgbm_accuracy],
        'Tuned': [lr_tuned_accuracy, rf_tuned_accuracy, lgbm_tuned_accuracy],
    },
    index=model_names,
)
accuracy_df

#### We will select the LightGBM model since it gave us the best accuracy

# 🚀 Predicting test data using LightGBM, Random Forest Classifier and Logistic Regression models

In [None]:
# Predict the test rows with the out-of-box LightGBM pipeline
# (pipe_lgbm, not pipe_lgbm_tuned).
test_lgbm_y_pred = pipe_lgbm.predict(test)
test_lgbm_y_pred

In [None]:
# Same with the out-of-box random-forest pipeline.
test_rf_y_pred = pipe_rf.predict(test)
test_rf_y_pred

In [None]:
# Same with the out-of-box logistic-regression pipeline.
test_lr_y_pred = pipe_log_reg.predict(test)
test_lr_y_pred

# 🚀 Submissions of all models

In [None]:
submission_lgbm = pd.DataFrame({
    'PassengerId' : train_copy['PassengerId'],
    'Survived': test_lgbm_y_pred})
submission_lgbm.head()

In [None]:
submission_lgbm.to_csv('submission_lgbm.csv', index=False)

In [None]:
submission_rf = pd.DataFrame({
    'PassengerId' : train_copy['PassengerId'],
    'Survived': test_rf_y_pred})
submission_rf.head()

In [None]:
submission_rf.to_csv('submission_rf.csv', index=False)

In [None]:
submission_lr = pd.DataFrame({
    'PassengerId' : train_copy['PassengerId'],
    'Survived': test_lr_y_pred})
submission_lr.head()

In [None]:
submission_lr.to_csv('submission_lr.csv', index=False)

# 🚀 Submitted model accuracies

In [None]:
# Accuracies recorded for the submitted models (values entered by hand),
# one row per model with the row index named 'Models'.
submitted_models = pd.Index(
    ['LightGBM', 'Logistic Regression', 'Random Forest'], name='Models')
test_accuracy = pd.DataFrame(
    {'Accuracy': [0.79369, 0.78211, 0.76883]},
    index=submitted_models,
)
test_accuracy