In [1]:
# Importing required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MaxNLocator

from scipy import interp
import math
from scipy.stats import norm
from scipy import stats

import warnings
warnings.filterwarnings('ignore') # Disabling warnings for clear outputs

pd.options.display.max_columns = 50 # Pandas option to increase max number of columns to display

plt.style.use('ggplot') # Setting default plot style

In [25]:
# Importing the datasets

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

idx = len(train)

In [26]:
display(train.sample(3))
display(test.sample(3))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0,B51 B53 B55,S
820,821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
156,157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16.0,0,0,35851,7.7333,,Q


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
117,1009,3,"Sandstrom, Miss. Beatrice Irene",female,1.0,1,1,PP 9549,16.7,G6,S
158,1050,1,"Borebank, Mr. John James",male,42.0,0,0,110489,26.55,D22,S


In [27]:
# Merging the dataset

train.drop('PassengerId', axis = 1, inplace = True)
test.drop('PassengerId', axis = 1, inplace = True)
merged_data = pd.concat([train,test], sort = False).reset_index(drop = True)

In [28]:
# viewing the shape of merged data
display(merged_data.shape)

(1309, 11)

In [33]:
# Viewing the feature and target columns 
display(merged_data.columns)

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [35]:
# checking the dtypes
display(merged_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 112.6+ KB


None

## Exploring the Data

#### Categorical Features

* **Survived**: It is the target variable, which represents whether the people survived or not.

In [137]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,C


In [147]:
data.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005007,-0.035144,0.042939,0.033207,-0.057527,-0.001652,0.012658,0.012985
Survived,-0.005007,1.0,-0.338481,-0.543351,-0.069809,-0.035322,0.081629,0.257307,-0.176509
Pclass,-0.035144,-0.338481,1.0,0.1319,-0.331339,0.083081,0.018443,-0.5495,0.173511
Sex,0.042939,-0.543351,0.1319,1.0,0.084153,-0.114631,-0.245489,-0.182333,0.118492
Age,0.033207,-0.069809,-0.331339,0.084153,1.0,-0.232625,-0.179191,0.091566,-0.03961
SibSp,-0.057527,-0.035322,0.083081,-0.114631,-0.232625,1.0,0.414838,0.159651,0.07148
Parch,-0.001652,0.081629,0.018443,-0.245489,-0.179191,0.414838,1.0,0.216225,0.043351
Fare,0.012658,0.257307,-0.5495,-0.182333,0.091566,0.159651,0.216225,1.0,-0.230365
Embarked,0.012985,-0.176509,0.173511,0.118492,-0.03961,0.07148,0.043351,-0.230365,1.0


In [141]:
data['Embarked'] = data['Embarked'].fillna('0')
data['Embarked'].isnull().any()
data['Embarked'].value_counts()

S    644
C    168
Q     77
0      2
Name: Embarked, dtype: int64

In [143]:
type(data['Embarked'])
data['Embarked']= lbl_en.fit_transform(data['Embarked'])

In [144]:
va = data['Age'].mean()
va

29.69911764705882

In [145]:
data['Age'] = data['Age'].fillna(va)

In [146]:
data['Fare'].isnull().any()
X['Age'].isnull().any()

False

In [539]:
X=data[['Sex','Pclass','Fare','Age']]
y=data['Survived']


In [540]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [541]:
cv = ShuffleSplit(n_splits = 10, test_size = 0.1, random_state=42)

In [542]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter = 1000)
LR.fit(X,y)
scores = cross_val_score(LR,X,y, cv = cv)

In [556]:
from sklearn.svm import SVC
svc = SVC(kernel="sigmoid")
svc.fit(X,y)
scores_1 = cross_val_score(svc,X,y, cv = cv)

In [562]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=3, weights = "distance")
knc.fit(X,y)
scores_2 = cross_val_score(knc,X,y,cv=cv)

In [550]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion="entropy",splitter = "random")
dtc.fit(X,y)
scores_3 = cross_val_score(dtc,X,y, cv = cv)

In [546]:
print("LR Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

LR Accuracy: 0.81 (+/- 0.05)


In [547]:
print("SVM Accuracy: %0.2f (+/- %0.2f)"%(scores_1.mean(),scores_1.std()*2))

SVM Accuracy: 0.58 (+/- 0.12)


In [548]:
print("KNN Accuracy: %0.2f (+/- %0.2f)"%(scores_3.mean(),scores_3.std()*2))

KNN Accuracy: 0.80 (+/- 0.07)


In [549]:
print("DTC Accuracy: %0.2f (+/- %0.2f)"%(scores_3.mean(),scores_3.std()*2))

DTC Accuracy: 0.80 (+/- 0.07)


In [495]:
test = pd.read_csv("test.csv")

In [496]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [497]:
test['Sex'] = lbl_en.fit_transform(test['Sex'])

In [499]:
test['Embarked'] = lbl_en.fit_transform(test['Embarked'])

In [513]:
m = test['Fare'].mean()
test['Fare']=test['Fare'].fillna(m)
a = test['Age'].mean()
test['Age']=test['Age'].fillna(a)

In [528]:
test_X=  test[['Sex','Pclass','Fare','Age']]

In [529]:
test_X.isnull().any()

Sex       False
Pclass    False
Fare      False
Age       False
dtype: bool

In [563]:
test_pred = knc.predict(test_X)

In [564]:
test_pred

array([0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,

In [565]:
final_output = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':test_pred})

In [566]:
final_output = final_output.set_index('PassengerId')

In [567]:
final_output.to_csv("Submission.csv")

In [569]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-1.0.0-py3-none-any.whl (188 kB)
Collecting shap==0.32.1
  Downloading shap-0.32.1-cp37-cp37m-win_amd64.whl (292 kB)
Collecting scikit-learn==0.22
  Downloading scikit_learn-0.22-cp37-cp37m-win_amd64.whl (6.2 MB)
Collecting kmodes==0.10.1
  Downloading kmodes-0.10.1-py2.py3-none-any.whl (17 kB)
Collecting lightgbm==2.3.1
  Downloading lightgbm-2.3.1-py2.py3-none-win_amd64.whl (544 kB)
Collecting catboost==0.20.2
  Downloading catboost-0.20.2-cp37-none-win_amd64.whl (63.1 MB)
Collecting wordcloud
  Downloading wordcloud-1.7.0-cp37-cp37m-win_amd64.whl (157 kB)
Collecting pyod
  Downloading pyod-0.8.0.tar.gz (93 kB)
Collecting pyLDAvis
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
Collecting pandas-profiling==2.3.0
  Downloading pandas-profiling-2.3.0.tar.gz (127 kB)
Collecting mlxtend
  Downloading mlxtend-0.17.2-py2.py3-none-any.whl (1.3 MB)
Collecting cufflinks==0.17.0
  Downloading cufflinks-0.17.0.tar.gz (81 kB)
Collecting awscli
  Downloading a

ERROR: Could not install packages due to an EnvironmentError: [WinError 5] Access is denied: 'e:\\anaconda-navigator\\lib\\site-packages\\~klearn\\decomposition\\_cdnmf_fast.cp37-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



Collecting catalogue<1.1.0,>=0.0.7
  Downloading catalogue-1.0.0-py2.py3-none-any.whl (7.7 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.2-cp37-cp37m-win_amd64.whl (20 kB)
Collecting smart-open>=1.8.1
  Downloading smart_open-2.0.0.tar.gz (103 kB)
Collecting Cython==0.29.14
  Downloading Cython-0.29.14-cp37-cp37m-win_amd64.whl (1.7 MB)
Collecting tbb
  Downloading tbb-2019.0-py3-none-win_amd64.whl (194 kB)
Collecting zope.interface
  Downloading zope.interface-5.1.0-cp37-cp37m-win_amd64.whl (194 kB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Collecting regex>=2017.02.08
  Downloading regex-2020.6.8-cp37-cp37m-win_amd64.whl (268 kB)
Collecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting boto3
  Downloading boto3-1.14.10-py2.py3-none-any.whl (128 kB)
Building wheels for collected packages: pyod, pyLDAvis, pandas-profiling, cufflinks, umap-learn, combo, suod, funcy, htmlmin, confuse, wasab