# Importing libraries📚

In [None]:
import torch
import torchvision
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor
from torchvision.utils import make_grid
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split
import pandas as pd
import seaborn as sns
from colorama import Fore, Back, Style
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Getting data 💽

In [None]:
# Read the three Kaggle Titanic CSVs: training set, test set and the sample submission.
train, test, submission = map(
    pd.read_csv,
    (
        "../input/titanic/train.csv",
        "../input/titanic/test.csv",
        "../input/titanic/gender_submission.csv",
    ),
)

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.isnull().sum()

# EDA 📊

In [None]:
# Short aliases for the colorama foreground colour codes used when printing statistics.
red, grn, blu, ylw, wht = Fore.RED, Fore.GREEN, Fore.BLUE, Fore.YELLOW, Fore.WHITE

In [None]:
def plot_distribution(feature,color):
    """Plot the distribution of one ``train`` column and print its summary statistics.

    Opens a new 125-dpi figure, draws ``train[feature]`` with ``sns.distplot`` in the
    requested colour, then prints the column's max, min, mean, std and median, each
    line prefixed with one of the colour aliases (red, blu, grn, ylw, wht).

    Args:
        feature: Name of a column in the global ``train`` DataFrame.
        color: Colour forwarded to ``sns.distplot``.
    """
    plt.figure(dpi=125)
    sns.distplot(train[feature], color=color)

    column = train[feature]
    summary = (
        (red, column.max()),
        (blu, column.min()),
        (grn, column.mean()),
        (ylw, column.std()),
        (wht, column.median()),
    )
    # Every statistic fills three placeholders in the template: colour code, feature name, value.
    fields = [part for prefix, value in summary for part in (prefix, feature, value)]
    print("{}Max value of {} is {}\n{}Min value of {} is {}\n{}Mean value of {} is {}\n{}Std value of {} is {}\n{}Median value of {} is {}".format(*fields))

In [None]:
plot_distribution('Age','green')

As we know from the disaster, women and children were the first to be evacuated. The mean age is 29.6 and the median age is 28; both are above 18, which suggests that the people with missing ages are adults. The standard deviation, however, is 14.5, which is below 18 and would suggest they are children.

In [None]:
# Count of train rows per Sex value, split by Survived.
sns.set(style='darkgrid')
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='Sex',
    hue='Survived',
    edgecolor=sns.color_palette('dark', 2),
);

As we can confirm females had a greater survival rate compared to males.

In [None]:
# Count of train rows per Sex value, split by Pclass.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='Sex',
    hue='Pclass',
    edgecolor=sns.color_palette('dark', 3),
);

There were many people from both genders in 3rd class(as expected)... the second most filled class was 1st class ... this maybe suggests huge price difference between classes.. not sure though 🤷‍♂️🤷‍♂️

In [None]:
# Count of train rows per Survived value, split by Pclass.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='Survived',
    hue='Pclass',
    edgecolor=sns.color_palette('dark', 1),
);

Most of the people who survived were from 1st class, so people in 1st class had a greater chance of survival.

In [None]:
# Count of train rows per Pclass value, split by Survived.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='Pclass',
    hue='Survived',
    edgecolor=sns.color_palette('dark', 5),
);

Only people from 1st class had a greater survival rate compared to classes 2 and 3... maybe there were fewer people in 1st class

In [None]:
train.groupby('Pclass').Survived.value_counts()

Well, there weren't fewer people in 1st class, so we can conclude that people in first class did indeed have a higher chance and rate of survival.

In [None]:
# Count of train rows per Embarked value, split by Sex.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='Embarked',
    hue='Sex',
    edgecolor=sns.color_palette('dark', 6),
);

In [None]:
# Count of train rows per Embarked value, split by Survived.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='Embarked',
    hue='Survived',
);

More people survived from Cherbourg compared to other two

### Name

In [None]:
# Replace each name with the run of letters that immediately precedes a '.', i.e. the
# title (the "Mr" in "Mr."). A raw string is used so that `\.` reaches the regex engine
# without Python flagging it as an invalid escape sequence.
# NOTE: this overwrites the Name column, so re-running the cell on already-extracted
# titles (which contain no '.') yields NaN.
train['Name'] = train['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Count of train rows per extracted title, split by Survived (small tick labels: many titles).
plt.figure(dpi=200)
plt.xticks(size=5)
sns.countplot(
    data=train,
    x='Name',
    hue='Survived',
);

In [None]:
# The six most frequent titles (value_counts sorts by descending frequency).
title_counts = train['Name'].value_counts()
top6 = title_counts.head(6).index.tolist()
top6

In [None]:
# Keep titles that appear in top6; everything else is bucketed as 'Other'.
train['Name'] = train['Name'].where(train['Name'].isin(top6), 'Other')

In [None]:
train.groupby('Name').Survived.value_counts()

### Family

In [None]:
# Family size = siblings/spouses + parents/children + 1 (the extra 1 keeps the minimum at 1).
train['family'] = train['SibSp'].add(train['Parch']).add(1)

In [None]:
# Count of train rows per family value, split by Survived.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='family',
    hue='Survived',
);

In [None]:
train.groupby('family').Survived.value_counts()

This seems kind of random: only people with a family size of 2, 3 or 4 survived at a higher rate than the rest 🤔🤔.

In [None]:
# Collapse family size into a binary flag: 1 when family > 1, otherwise 0.
# Computed directly from SibSp/Parch (family > 1  <=>  SibSp + Parch > 0) so that the
# assignment is vectorised, avoids chained-indexing writes (train['family'][i] = ...),
# and gives the same result if the cell is run again.
train['family'] = (train['SibSp'] + train['Parch'] > 0).astype(int)

In [None]:
# Count of train rows per binary family flag, split by Survived.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='family',
    hue='Survived',
);

people with family had greater rate of survival

### Cabin

In [None]:
train.groupby('Cabin').Survived.value_counts()

In [None]:
# Mark missing cabins with the placeholder 'S'. Assign the result back instead of calling
# fillna(inplace=True) on a column selection, which may act on a temporary copy.
train['Cabin'] = train['Cabin'].fillna('S')

In [None]:
# Keep only the first character of each cabin string (presumably the deck letter;
# 'S' for the rows filled in above). The vectorised .str accessor replaces the
# row-by-row chained-indexing assignment.
train['Cabin'] = train['Cabin'].str[0]

In [None]:
train.groupby('Cabin').Survived.value_counts()

In [None]:
# Count of train rows per cabin letter, split by Survived.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='Cabin',
    hue='Survived',
);

People with a recorded cabin were more likely to survive.

### Fare

In [None]:
# plot_distribution() opens its own figure, so no plt.figure() call is needed here;
# the extra call only left an empty figure in the cell output.
plot_distribution('Fare','orange')

The minimum fare is 0.0 which means there was/were someone/some people with a free ride in titanic(probably in 1st class) 😅😅

In [None]:
# Binary fare flag: 1 when Fare is above 32.0, otherwise 0 (missing fares compare False -> 0).
# NOTE(review): 32.0 looks like it was chosen near the mean fare - confirm.
# Vectorised comparison replaces the row-by-row chained-indexing assignment.
train['fare_val'] = (train['Fare'] > 32.0).astype(int)

In [None]:
train.groupby('fare_val').Survived.value_counts()

In [None]:
# Count of train rows per fare flag, split by Survived.
plt.figure(dpi=125)
sns.countplot(
    data=train,
    x='fare_val',
    hue='Survived',
);

People with greater fare had a higher rate of survival

#### Making same modifications to test dataset

In [None]:
# Apply to `test` the same derived columns that were built for `train`, using
# vectorised column operations instead of row-by-row chained-indexing writes.

#family
# 1 when the passenger has at least one SibSp/Parch relative, otherwise 0
# (equivalent to "SibSp + Parch + 1 > 1").
test['family'] = (test['SibSp'] + test['Parch'] > 0).astype(int)

#Name
# Extract the title first, as was done for train. Without this step the full names
# never match top6 and every test row ends up as 'Other'.
test['Name'] = test['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
test['Name'] = test['Name'].where(test['Name'].isin(top6), 'Other')

#Cabin
# Missing cabins get the placeholder 'S'; then only the first character is kept.
test['Cabin'] = test['Cabin'].fillna('S').str[0]

#Fare
# 1 when Fare is above 32.0, otherwise 0 (missing fares compare False -> 0).
test['fare_val'] = (test['Fare'] > 32.0).astype(int)

# Data Preprocessing 🗄️

In [None]:
# Columns fed to the model.
#   family   - derived from SibSp & Parch
#   fare_val - derived from Fare
# Not used: PassengerId, Name, SibSp, Parch, Ticket, Fare, Cabin.
features = ['Pclass', 'Sex', 'Age', 'family', 'fare_val', 'Embarked']

# Column the model is trained to predict.
target = 'Survived'

In [None]:
train[features].isnull().sum()

In [None]:
test[features].isnull().sum()

In [None]:
'''Age_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(value = Age_mean)

Age_mean_t = test['Age'].mean()
test['Age'] = test['Age'].fillna(value = Age_mean_t)
f"'train',{Age_mean}, 'test',{Age_mean_t}"''';

In [None]:
# Fill missing ages with the standard deviation of the Age column, computed separately
# for train and test.
# NOTE(review): imputing with the std (rather than mean/median) and using test's own
# statistic are unusual choices - confirm they are intended.
Age_std, Age_std_t = train['Age'].std(), test['Age'].std()

train['Age'] = train['Age'].fillna(value=Age_std)
test['Age'] = test['Age'].fillna(value=Age_std_t)

f"'train',{Age_std}, 'test',{Age_std_t}"

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training data only and reuse it for test,
# so both frames are guaranteed to share the same encoding. transform() raises if test
# contains a label that was never seen in train, instead of silently re-numbering.
lbl = LabelEncoder()

train['Sex'] = lbl.fit_transform(train['Sex'])
test['Sex'] = lbl.transform(test['Sex'])

In [None]:
#lbl2 = LabelEncoder()
#train['Name'] = lbl2.fit_transform(train[['Name']].values.ravel())
#test['Name'] = lbl2.fit_transform(test[['Name']].values.ravel())

In [None]:
# Impute missing ports with the most frequent value observed in the training data.
# The same training-derived value is used for test so that both frames are
# preprocessed with identical statistics.
embarked_mode = train['Embarked'].mode()[0]
train['Embarked'] = train['Embarked'].fillna(value=embarked_mode)
test['Embarked'] = test['Embarked'].fillna(value=embarked_mode)


In [None]:
train_ds = train[features]
test_ds = test[features]

In [None]:
# One-hot encode the categorical columns (first level dropped). Encoding starts from
# train[features] / test[features] rather than from the previous train_ds / test_ds so
# the cell can be re-run without failing on already-encoded frames.
train_ds = pd.get_dummies(train[features], columns=['Embarked', 'Pclass'], drop_first=True)
test_ds = pd.get_dummies(test[features], columns=['Embarked', 'Pclass'], drop_first=True)

# Force test to expose exactly the training columns, in the same order: a category
# missing from test becomes an all-zero column, one unseen in train is dropped.
test_ds = test_ds.reindex(columns=train_ds.columns, fill_value=0)

In [None]:
print(train_ds.head())
train_ds.shape

In [None]:
print(test_ds.head())
test_ds.shape

In [None]:
#train_ds.drop(columns = ['Cabin_T'],inplace = True)

# Creating a XGBoost Model with Randomized Search ❤️‍🔥

In [None]:
y_train = train[target]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for validation.
# - The labels are taken from train[target] rather than from y_train, because y_train is
#   overwritten by this very statement and a second run would otherwise fail on
#   mismatched lengths.
# - random_state pins the shuffle so the split (and every score below) is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_ds, train[target], test_size=0.30, random_state=42
)

In [None]:
X_train.columns

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
'''rfc = RandomForestClassifier()

params = {'n_estimators': [200,500,800,1000,1200],
          'max_depth': [3,5,7],
          'criterion':['entropy', 'gini'],
          'min_samples_leaf' : [1, 2, 3, 4, 5],
          'max_features':['auto'],
          'min_samples_split': [3, 5, 10],
          'max_leaf_nodes':[2,3,5,7],
          }

rfc_cv = RandomizedSearchCV(rfc, params, cv = 250, n_jobs=-1, verbose=2).fit(X_train, y_train)''';

In [None]:
# Randomized hyper-parameter search over an XGBoost classifier with 10-fold CV.
# The names `rfc` / `rfc_cv` are kept because following cells read `rfc_cv`.
# random_state is set on both the model and the search so results are reproducible.
rfc = XGBClassifier(random_state=42)

# Only parameters XGBoost actually uses. 'min_samples_leaf' and 'max_leaf_nodes' were
# removed: they are RandomForest options that XGBoost ignores, so sampling them only
# wasted search iterations on duplicate configurations.
params = {'n_estimators': [200,500,800,1000,1200],
          'max_depth': [3,5,7],
          'objective' : ['binary:logistic'],
          'min_child_weight': [1, 5, 10],
          'gamma': [0.5, 1, 1.5, 2, 5],
          }

rfc_cv = RandomizedSearchCV(
    rfc, params, cv=10, n_jobs=-1, verbose=2, random_state=42
).fit(X_train, y_train)

In [None]:
# Keep the refitted best estimator and report the search outcome.
best_model = rfc_cv.best_estimator_

# A bare expression in the middle of a cell is never displayed, so print it explicitly.
print(rfc_cv.best_params_)
print(best_model)
print(rfc_cv.best_score_)

In [None]:
# Score the best model on the held-out validation rows.
rfc_pred = best_model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, rfc_pred)
valid_confusion = confusion_matrix(y_valid, rfc_pred)

print("Accuracy: ", valid_accuracy)
print("\nConfusion Matrix\n")
print(valid_confusion)

## Saving the Model 💾

In [None]:
import pickle

# Serialise the best model to disk. The context manager closes the file handle
# (flushing the bytes) even if pickling raises.
filename = 'Titanic_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

## Loading the Model 🔃

In [None]:
# Reload the model written above and check it scores the same on the validation rows.
# The context manager closes the file handle once loading finishes.
# NOTE: pickle.load executes arbitrary code - only load files this notebook produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_valid, y_valid)
print(result)

In [None]:
print(rfc_pred)

In [None]:
passId = test[['PassengerId']].values

In [None]:
test_ds.head()

In [None]:
final_pred = best_model.predict(test_ds)

# My Submission 🙋‍♂️

In [None]:
sub = {'PassengerId':passId.ravel(), 'Survived':final_pred}

In [None]:
submission_csv = pd.DataFrame(sub)

In [None]:
submission_csv.head()

In [None]:
submission_csv.to_csv('final_sub_titanic_xgb_cv_10.csv',index = False)

In [None]:
x = pd.read_csv("./final_sub_titanic_xgb_cv_10.csv")

In [None]:
x.head()

## Check out my other [**Notebook**](https://www.kaggle.com/mdhamani/titanic-getting-better-eda-pytorch-gpu-top-14) with a PyTorch Neural Network Classifier

# To-Do📋
## Tuning parameters 🤷‍♂️🤷‍♂️
## Make the PyTorch Model more accurate