In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import KNNImputer

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    file_paths = [os.path.join(folder, name) for name in names]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Acquiring data

In [None]:
# Read the training and test splits of the Titanic dataset into DataFrames.
# `combine` keeps both frames together in a list.
train_Data, test_Data = map(
    pd.read_csv,
    ('../input/titanic/train.csv', '../input/titanic/test.csv'),
)
combine = [train_Data, test_Data]

# Preview the first rows of the training split.
train_Data.head()

# Data Description #

In [None]:
# Summary statistics of the training frame's numeric columns.
train_Data.describe()

# Data Exploration/Analysis

In [None]:
# Number of missing values in each column of the training frame.
train_Data.isnull().sum()

In [None]:
# Column names, non-null counts and dtypes of the training frame.
train_Data.info()

In [None]:
# Column names of the training frame, as a NumPy array.
train_Data.columns.values


In [None]:
# Missing-value table for the training frame: absolute count per column and
# the share of rows affected (in %, one decimal), both sorted worst first.
null_mask = train_Data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent_1 = null_mask.sum() / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, '%': percent_2}, axis=1)
missing_data.head(15)

From the table above: Cabin has 687 missing values, Age has 177, and Embarked has 2.

# Data Preprocessing

We can drop columns such as Name, Ticket and Cabin, which do not contribute towards our goal of prediction. PassengerId is dropped as well, but from the training set only; the test set keeps it.

In [None]:
# Remove the columns that are not used as features. PassengerId is removed
# from the training frame only; the test frame keeps it.
unused_columns = ['Name', 'Ticket', 'Cabin']
train_Data = train_Data.drop(columns=unused_columns + ['PassengerId'])
test_Data = test_Data.drop(columns=unused_columns)
combine = [train_Data, test_Data]

Convert the Sex column to an integer value (male → 0, female → 1).

In [None]:
# Encode Sex as an integer in both frames: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
for frame in (train_Data, test_Data):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)
combine = [train_Data, test_Data]

train_Data.head(8)

In [None]:
# Most frequent embarkation port in the training frame.
train_Data['Embarked'].mode()

Handling Embarked data by filling the missing values and mapping the ports to integers

In [None]:
# Fill missing embarkation ports with 'S', then encode the port letters as
# integers (C -> 0, Q -> 1, S -> 2) in both frames.
port_codes = {'C': 0, 'Q': 1, 'S': 2}
for frame in (train_Data, test_Data):
    frame['Embarked'] = frame['Embarked'].fillna('S').map(port_codes).astype(int)
combine = [train_Data, test_Data]

train_Data.head()

In [None]:
# Recompute the missing-value table (count and % of rows per column, worst first).
null_counts = train_Data.isnull().sum()
row_counts = train_Data.isnull().count()
total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / row_counts * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)
missing_data.head(20)

Handling Age, SibSp and Parch data

In [None]:
# Family size aboard = siblings/spouses + parents/children + the passenger.
for frame in (train_Data, test_Data):
    frame['family'] = frame['SibSp'] + frame['Parch'] + 1

train_Data.head()

In [None]:
# Impute missing Age values with k-nearest-neighbours, using Fare and SibSp as
# the helper features.
#
# The imputer is fitted on the training frame only and then applied to the
# test frame with `transform`. Calling `fit_transform` on the test frame would
# re-fit the imputer, so the two splits would be filled from different
# reference rows.
AGE_IMPUTE_COLUMNS = ['Fare', 'SibSp', 'Age']
knn_imputer = KNNImputer()

imp_Train_Data = knn_imputer.fit_transform(train_Data[AGE_IMPUTE_COLUMNS])
imp_Test_Data = knn_imputer.transform(test_Data[AGE_IMPUTE_COLUMNS])

# The imputer returns a plain array whose columns follow AGE_IMPUTE_COLUMNS;
# only the Age column is written back to the frames.
age_position = AGE_IMPUTE_COLUMNS.index('Age')
train_Data['Age'] = imp_Train_Data[:, age_position]
test_Data['Age'] = imp_Test_Data[:, age_position]

Verifying the imputation by recomputing the missing-value table:

In [None]:
# Missing-value table after Age imputation: count and % of rows per column,
# each sorted in descending order.
is_missing = train_Data.isnull()
total = is_missing.sum().sort_values(ascending=False)
percent_1 = is_missing.sum().div(is_missing.count()).mul(100)
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat({'Total': total, '%': percent_2}, axis=1)
missing_data.head(15)

Handling Fare data

In [None]:
# Fill and scale Fare using statistics taken from the training frame only, so
# that a given fare maps to the same scaled value in both splits.
# Both statistics are read before any scaling is applied.
fare_fill_value = train_Data['Fare'].mean()
fare_scale = train_Data['Fare'].abs().max()

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which pandas 2.x
# deprecates and which leaves the frame unchanged under copy-on-write.
test_Data['Fare'] = test_Data['Fare'].fillna(fare_fill_value)

# Max-abs scaling: divide by the largest absolute training fare.
train_Data['Fare'] = train_Data['Fare'] / fare_scale
test_Data['Fare'] = test_Data['Fare'] / fare_scale
train_Data.describe()

In [None]:
# Missing-value table after Fare handling (count and % of rows per column).
null_counts = train_Data.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / train_Data.isnull().count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)
missing_data.head(15)

In [None]:
# Fill and scale Age using statistics taken from the training frame only, so
# that a given age maps to the same scaled value in both splits.
# Both statistics are read before any scaling is applied.
age_fill_value = train_Data['Age'].mean()
age_scale = train_Data['Age'].abs().max()

# Safety net for any Age values still missing in the test frame. The result
# is assigned back instead of using fillna(inplace=True) on a column
# selection, which pandas 2.x deprecates and which leaves the frame unchanged
# under copy-on-write.
test_Data['Age'] = test_Data['Age'].fillna(age_fill_value)

# Max-abs scaling: divide by the largest absolute training age.
train_Data['Age'] = train_Data['Age'] / age_scale
test_Data['Age'] = test_Data['Age'] / age_scale
train_Data.describe()

From the above table, we can see that about 38% of the passengers in the training set survived.

In [None]:
# Mean survival rate for each family size, largest family first.
(
    train_Data.loc[:, ['family', 'Survived']]
    .groupby(['family'], as_index=False)
    .mean()
    .sort_values(by='family', ascending=False)
)

In [None]:
# Bucket family size into three ordinal groups in both frames:
#   0 -> size <= 1, 1 -> 1 < size <= 4, 2 -> size > 4.
combine = [train_Data, test_Data]
for dataset in combine:
    size = dataset['family']
    # Each threshold that is exceeded adds one to the group code.
    group_code = (size > 1).astype(int) + (size > 4).astype(int)
    dataset['family'] = group_code.astype(int)

combine_Data = [train_Data, test_Data]

train_Data.head()

In [None]:
# Support Vector Machines

# Feature/target split used by this and the following model cells. These
# names were used but never defined, so a fresh "Restart & Run All" stopped
# here with a NameError.
# NOTE(review): the feature set is assumed to be every preprocessed column
# except the target (train) and the PassengerId identifier (test) -- confirm
# this matches the intended features.
X_train_Data = train_Data.drop(columns='Survived')
Y_train_Data = train_Data['Survived']
X_test_Data = test_Data.drop(columns='PassengerId')

svc = SVC()
svc.fit(X_train_Data, Y_train_Data)
Y_prediction = svc.predict(X_test_Data)
# Accuracy on the training data itself (in %), not a held-out estimate.
SVM = round(svc.score(X_train_Data, Y_train_Data) * 100, 2)
SVM


In [None]:
# Cross-validated search over the number of neighbours (1 to 49) for kNN.
Hyperparameters = {"n_neighbors": range(1, 50)}
grid_Search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=Hyperparameters,
).fit(X_train_Data, Y_train_Data)

# Best parameter combination found by the search.
grid_Search.best_params_

In [None]:
# k-Nearest Neighbours

# Take the neighbour count from the grid search instead of hard-coding the
# value from an earlier run, so this cell stays in step with the search
# whenever the data or preprocessing changes.
best_n_neighbors = grid_Search.best_params_["n_neighbors"]
kNN = KNeighborsClassifier(n_neighbors=best_n_neighbors)
kNN.fit(X_train_Data, Y_train_Data)
Y_prediction = kNN.predict(X_test_Data)
# Accuracy on the training data itself (in %), rounded like the other model cells.
KNN = round(kNN.score(X_train_Data, Y_train_Data) * 100, 2)
KNN

In [None]:
# Decision Tree

# A fixed random_state makes the fitted tree, and therefore the score and
# predictions, reproducible across runs.
decision_Tree = DecisionTreeClassifier(random_state=42)
decision_Tree.fit(X_train_Data, Y_train_Data)
Y_prediction = decision_Tree.predict(X_test_Data)
# Accuracy on the training data itself (in %), not a held-out estimate.
DecisionTree = round(decision_Tree.score(X_train_Data, Y_train_Data) * 100, 2)
DecisionTree

In [None]:
# Random Forest

# A fixed random_state makes the forest reproducible across runs; the
# remaining hyperparameters are unchanged.
RF = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=10,
    max_depth=5,
    random_state=42,
)
RF.fit(X_train_Data, Y_train_Data)
Y_pred = RF.predict(X_test_Data)
# Accuracy on the training data itself (in %). It is scored once; the earlier
# duplicate call whose result was discarded has been removed.
RANDOMFOREST = round(RF.score(X_train_Data, Y_train_Data) * 100, 2)
RANDOMFOREST

In [None]:
# Gradient-boosted trees (XGBoost); the score is accuracy on the training data, in %.
gbm = xgb.XGBClassifier(
    learning_rate=.15,
    max_depth=7,
    subsample=.7,
)
gbm.fit(X_train_Data, Y_train_Data)
train_accuracy = gbm.score(X_train_Data, Y_train_Data)
gmbScore = round(train_accuracy * 100, 2)
gmbScore

In [None]:
# Perceptron, limited to at most 6 passes over the training data.
perceptron = Perceptron(max_iter=6)
perceptron.fit(X_train_Data, Y_train_Data)
Y_prediction = perceptron.predict(X_test_Data)

# Accuracy on the training data itself, in %.
train_accuracy = perceptron.score(X_train_Data, Y_train_Data)
acc_perceptron = round(train_accuracy * 100, 2)
acc_perceptron