In [668]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import KNNImputer

import os
# Collect the full path of every file found under the Kaggle input directory,
# then print them one per line.
input_file_paths = [
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Acquiring data

In [669]:
# Read the Titanic training and test CSV files into DataFrames.
train_Data, test_Data = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)
# Keep both frames in one list so that later steps can iterate over them.
combine = [train_Data, test_Data]

train_Data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Description #

In [670]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train_Data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Data Exploration/Analysis

In [671]:
# Number of missing values in each column of the training frame.
train_Data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [672]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [673]:
# Column names of the training frame as a NumPy array.
train_Data.columns.values


array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [674]:
# Missing-value report for the raw training frame: absolute count per column
# and the share of rows (in %, one decimal), both sorted from most to least.
null_flags = train_Data.isnull()
null_counts = null_flags.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_flags.count() * 100
percent_2 = round(percent_1, 1).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)
missing_data.head(15)

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
PassengerId,0,0.0
Survived,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
SibSp,0,0.0
Parch,0,0.0


From the table above: Cabin has 687 missing values, Age has 177, and Embarked has 2.

# Data Preprocessing

We can drop features such as Name, Ticket, Cabin and PassengerId, which do not contribute to our prediction goal. (PassengerId is dropped from the training set only; the test set keeps it.)

In [675]:
# Columns removed from both splits; PassengerId is additionally removed from
# the training frame only, so the test frame still carries it.
shared_drop = ['Name', 'Ticket', 'Cabin']

train_Data = train_Data.drop(shared_drop + ['PassengerId'], axis=1)
test_Data = test_Data.drop(shared_drop, axis=1)
combine = [train_Data, test_Data]

Convert the Sex feature to an integer value (male → 0, female → 1).

In [676]:
# Encode Sex as an integer in both splits using one shared mapping.
sex_codes = {'male':0, 'female':1}
for frame in (train_Data, test_Data):
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)
combine = [train_Data, test_Data]

train_Data.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S
5,0,3,0,,0,0,8.4583,Q
6,0,1,0,54.0,0,0,51.8625,S
7,0,3,0,2.0,3,1,21.075,S


In [677]:
# Most frequent embarkation port in the training frame.
train_Data['Embarked'].mode()

0    S
dtype: object

Handling Embarked data by filling missing values and mapping ports to integers

In [678]:
# Fill missing ports with 'S' (the mode reported by the previous cell), then
# encode the port letter as an integer. Both splits get the same treatment.
port_codes = {'C':0, 'Q':1, 'S':2}
for frame in (train_Data, test_Data):
    filled_port = frame['Embarked'].fillna('S')
    frame['Embarked'] = filled_port.map(port_codes).astype(int)
combine = [train_Data, test_Data]

train_Data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,2
1,1,1,1,38.0,1,0,71.2833,0
2,1,3,1,26.0,0,0,7.925,2
3,1,1,1,35.0,1,0,53.1,2
4,0,3,0,35.0,0,0,8.05,2


In [679]:
# Re-check missing values after the Embarked column has been filled.
null_flags = train_Data.isnull()
null_counts = null_flags.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_flags.count() * 100
percent_2 = round(percent_1, 1).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)
missing_data.head(20)

Unnamed: 0,Total,%
Age,177,19.9
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
SibSp,0,0.0
Parch,0,0.0
Fare,0,0.0
Embarked,0,0.0


Handling Age, SibSp (siblings/spouses) and Parch (parents/children) data

In [680]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
for frame in (train_Data, test_Data):
    frame['family'] = frame['SibSp'] + frame['Parch'] + 1

train_Data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,family
0,0,3,0,22.0,1,0,7.25,2,2
1,1,1,1,38.0,1,0,71.2833,0,2
2,1,3,1,26.0,0,0,7.925,2,1
3,1,1,1,35.0,1,0,53.1,2,2
4,0,3,0,35.0,0,0,8.05,2,1


In [681]:
# Impute missing Age values from the nearest passengers in (Fare, SibSp, Age).
# The imputer is fitted on the training split only and then merely applied to
# the test split, so both splits are imputed from the same neighbours instead
# of the test split being imputed by a second, independently fitted model.
IMPUTE_COLUMNS = ['Fare', 'SibSp', 'Age']
AGE_POSITION = IMPUTE_COLUMNS.index('Age')

knn_imputer = KNNImputer()

imp_Train_Data = knn_imputer.fit_transform(train_Data[IMPUTE_COLUMNS])
imp_Test_Data = knn_imputer.transform(test_Data[IMPUTE_COLUMNS])

# Only the imputed Age column is written back; Fare and SibSp stay untouched.
train_Data['Age'] = imp_Train_Data[:, AGE_POSITION]
test_Data['Age'] = imp_Test_Data[:, AGE_POSITION]

Verifying..

In [682]:
# Verify that no missing values remain after the Age imputation.
null_flags = train_Data.isnull()
null_counts = null_flags.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_flags.count() * 100
percent_2 = round(percent_1, 1).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)
missing_data.head(15)

Unnamed: 0,Total,%
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,0,0.0
SibSp,0,0.0
Parch,0,0.0
Fare,0,0.0
Embarked,0,0.0
family,0,0.0


Handling Fare data

In [683]:
# Fill the missing test fare and scale Fare into [0, 1].
# Both statistics come from the training split, so the test split is
# transformed exactly like the data the models are fitted on.
fare_fill = train_Data['Fare'].mean()
fare_scale = train_Data['Fare'].abs().max()

# Assign the result back instead of calling fillna(..., inplace=True) on a
# selected column: that is chained assignment, which pandas deprecates and
# which has no effect on the frame once Copy-on-Write is enabled.
test_Data['Fare'] = test_Data['Fare'].fillna(fare_fill)

train_Data['Fare'] = train_Data['Fare'] / fare_scale
test_Data['Fare'] = test_Data['Fare'] / fare_scale
train_Data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,family
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,30.122678,0.523008,0.381594,0.062858,1.536476,1.904602
std,0.486592,0.836071,0.47799,13.410387,1.102743,0.806057,0.096995,0.791503,1.613459
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,1.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,0.01544,1.0,1.0
50%,0.0,3.0,0.0,29.0,0.0,0.0,0.028213,2.0,1.0
75%,1.0,3.0,1.0,38.0,1.0,0.0,0.060508,2.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,1.0,2.0,11.0


In [684]:
# Missing-value report after the Fare column has been processed.
null_flags = train_Data.isnull()
null_counts = null_flags.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_flags.count() * 100
percent_2 = round(percent_1, 1).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)
missing_data.head(15)

Unnamed: 0,Total,%
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,0,0.0
SibSp,0,0.0
Parch,0,0.0
Fare,0,0.0
Embarked,0,0.0
family,0,0.0


In [685]:
# Scale Age into [0, 1] using the training maximum for BOTH splits.
# Dividing each split by its own maximum would map the same age to different
# feature values in train and test whenever the two maxima differ.
age_fill = train_Data['Age'].mean()
age_scale = train_Data['Age'].abs().max()

# Safety net only: Age is already imputed by the KNN step earlier in the
# notebook, so this normally changes nothing. The result is assigned back
# rather than using fillna(..., inplace=True) on a selected column, which is
# chained assignment and has no effect under pandas Copy-on-Write.
test_Data['Age'] = test_Data['Age'].fillna(age_fill)

train_Data['Age'] = train_Data['Age'] / age_scale
test_Data['Age'] = test_Data['Age'] / age_scale
train_Data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,family
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,0.376533,0.523008,0.381594,0.062858,1.536476,1.904602
std,0.486592,0.836071,0.47799,0.16763,1.102743,0.806057,0.096995,0.791503,1.613459
min,0.0,1.0,0.0,0.00525,0.0,0.0,0.0,0.0,1.0
25%,0.0,2.0,0.0,0.275,0.0,0.0,0.01544,1.0,1.0
50%,0.0,3.0,0.0,0.3625,0.0,0.0,0.028213,2.0,1.0
75%,1.0,3.0,1.0,0.475,1.0,0.0,0.060508,2.0,2.0
max,1.0,3.0,1.0,1.0,8.0,6.0,1.0,2.0,11.0


From the table above (the mean of the Survived column), we can see that about 38% of the passengers in the training set survived.

In [686]:
# Mean survival rate for each family size, largest families first.
family_survival = (
    train_Data[['family', 'Survived']]
    .groupby(['family'], as_index=False)
    .mean()
)
family_survival.sort_values(by='family', ascending=False)

Unnamed: 0,family,Survived
8,11,0.0
7,8,0.0
6,7,0.333333
5,6,0.136364
4,5,0.2
3,4,0.724138
2,3,0.578431
1,2,0.552795
0,1,0.303538


In [687]:
# Bucket family size into three ordinal codes:
#   0 = travelling alone (size 1), 1 = family of 2-4, 2 = family of 5 or more.
# The size is recomputed from SibSp + Parch + 1 rather than read back from the
# 'family' column, which makes the cell idempotent: re-binning 'family' in
# place would turn code 1 into 0 if the cell were executed a second time.
FAMILY_BIN_EDGES = [-np.inf, 1, 4, np.inf]   # intervals (-inf, 1], (1, 4], (4, inf)
FAMILY_BIN_CODES = [0, 1, 2]

combine = [train_Data, test_Data]
for dataset in combine:
    family_size = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['family'] = pd.cut(
        family_size, bins=FAMILY_BIN_EDGES, labels=FAMILY_BIN_CODES
    ).astype(int)

combine_Data = [train_Data, test_Data]

train_Data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,family
0,0,3,0,0.275,1,0,0.014151,2,1
1,1,1,1,0.475,1,0,0.139136,0,1
2,1,3,1,0.325,0,0,0.015469,2,0
3,1,1,1,0.4375,1,0,0.103644,2,1
4,0,3,0,0.4375,0,0,0.015713,2,0


In [688]:
# Feature matrices and target used by this and the following model cells.
# NOTE(review): no cell in the notebook defined X_train_Data, Y_train_Data or
# X_test_Data, so "Restart & Run All" stopped here with a NameError. The split
# below assumes every remaining training column except Survived is a feature;
# confirm this matches the feature set the reported scores were produced with.
X_train_Data = train_Data.drop('Survived', axis=1)
Y_train_Data = train_Data['Survived']
# Selecting the training feature columns leaves PassengerId out of the test
# matrix and guarantees the same column order as the training matrix.
X_test_Data = test_Data[X_train_Data.columns]

# Support Vector Machines

svc = SVC()
svc.fit(X_train_Data, Y_train_Data)
Y_prediction = svc.predict(X_test_Data)
# Accuracy (in %) on the training data itself, not on held-out data.
SVM = round(svc.score(X_train_Data, Y_train_Data) * 100, 2)
SVM


80.92

In [689]:
# Search the neighbour count k = 1..49 for k-NN with GridSearchCV's default
# cross-validation and scoring.
Hyperparameters = dict(n_neighbors=range(1, 50))
grid_Search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=Hyperparameters,
)
grid_Search.fit(X_train_Data, Y_train_Data)

grid_Search.best_params_

{'n_neighbors': 14}

In [690]:
# k-Nearest Neighbours with k = 14 (the value reported by the grid search cell).
kNN = KNeighborsClassifier(n_neighbors=14)
kNN.fit(X_train_Data, Y_train_Data)
Y_prediction = kNN.predict(X_test_Data)

# Accuracy on the training data: kept as a rounded percentage in KNN and
# displayed as a raw fraction.
knn_train_accuracy = kNN.score(X_train_Data, Y_train_Data)
KNN = round(knn_train_accuracy * 100, 2)
knn_train_accuracy

0.8170594837261503

In [691]:
# Decision Tree
# random_state is fixed so that the fitted tree, and therefore the score and
# predictions below, are reproducible from one run of the notebook to the next.

decision_Tree = DecisionTreeClassifier(random_state=42)
decision_Tree.fit(X_train_Data, Y_train_Data)
Y_prediction = decision_Tree.predict(X_test_Data)
# Accuracy (in %) on the training data itself, not on held-out data.
DecisionTree= round(decision_Tree.score(X_train_Data, Y_train_Data) * 100, 2)
DecisionTree

98.2

In [692]:
# Random Forest
# random_state is fixed so that the randomised forest construction, and
# therefore the score and predictions below, are reproducible between runs.

RF = RandomForestClassifier(
    n_estimators=1000,
    min_samples_split=10,
    max_depth=5,
    random_state=42,
)
RF.fit(X_train_Data, Y_train_Data)
Y_pred = RF.predict(X_test_Data)
# Accuracy (in %) on the training data itself. It is evaluated once: the
# earlier extra RF.score(...) call discarded its result and only repeated the
# prediction over all 1000 trees.
RANDOMFOREST = round(RF.score(X_train_Data, Y_train_Data) * 100, 2)
RANDOMFOREST

84.74

In [693]:
# Gradient-boosted trees (XGBoost)
xgb_settings = dict(learning_rate=.15, max_depth=7, subsample=.7)
gbm = xgb.XGBClassifier(**xgb_settings)
gbm.fit(X_train_Data, Y_train_Data)

# Accuracy (in %) on the training data itself.
gbm_train_accuracy = gbm.score(X_train_Data, Y_train_Data)
gmbScore = round(gbm_train_accuracy * 100, 2)
gmbScore

94.84

In [706]:
# Perceptron, limited to 6 passes over the training data.
perceptron = Perceptron(max_iter=6)
perceptron.fit(X_train_Data, Y_train_Data)
Y_prediction = perceptron.predict(X_test_Data)

# Accuracy (in %) on the training data itself.
perceptron_train_accuracy = perceptron.score(X_train_Data, Y_train_Data)
acc_perceptron = round(perceptron_train_accuracy * 100, 2)
acc_perceptron



78.23