In [243]:
import streamlit as st
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [244]:
train = pd.read_csv("train.csv")

In [245]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [246]:
test = pd.read_csv("test.csv")

In [247]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [248]:
train.shape

(891, 12)

In [249]:
test.shape

(418, 11)

In [250]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [251]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Data Descriptions

#### Survived
Yes = 1
No = 0

#### pclass
1st Class =1
2nd Class =2
3rd Class =3

#### SibSp
Means Sibling or Spouse aboard the Titanic

#### Parch
Parent or Children aboard the Titanic

#### Cabin
Cabin Number

#### Embarked
Point of Embarking The Titanic
C = Cherbourg
Q = Queens Town
S = SouthHampton

In [252]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [253]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [254]:
train['PassengerId'].head()

0    1
1    2
2    3
3    4
4    5
Name: PassengerId, dtype: int64

In [255]:
train['PassengerId'].tail()

886    887
887    888
888    889
889    890
890    891
Name: PassengerId, dtype: int64

In [256]:
train.drop(["PassengerId"],axis=1,inplace=True)

In [257]:
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [258]:
train["Pclass"].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [259]:
test["Pclass"].value_counts()

Pclass
3    218
1    107
2     93
Name: count, dtype: int64

In [260]:
sns.countplot(x=train["Pclass"],hue=train["Survived"],data=train)

<Axes: xlabel='Pclass', ylabel='count'>

In [261]:
sns.countplot(x=train["Sex"],hue=train["Survived"],data=train)

<Axes: xlabel='Pclass', ylabel='count'>

In [262]:
sns.countplot(x=train["Embarked"],hue=train["Survived"],data=train)

<Axes: xlabel='Pclass', ylabel='count'>

In [263]:
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [264]:
train.drop(["Name"],axis=1,inplace=True)

In [265]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [266]:
train["Sex"]

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [267]:
train["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [268]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [269]:
test.drop(["Name"],axis=1,inplace=True)

In [270]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,330911,7.8292,,Q
1,893,3,female,47.0,1,0,363272,7.0,,S
2,894,2,male,62.0,0,0,240276,9.6875,,Q
3,895,3,male,27.0,0,0,315154,8.6625,,S
4,896,3,female,22.0,1,1,3101298,12.2875,,S


In [271]:
test.drop(["PassengerId"],axis=1,inplace=True)

In [272]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,34.5,0,0,330911,7.8292,,Q
1,3,female,47.0,1,0,363272,7.0,,S
2,2,male,62.0,0,0,240276,9.6875,,Q
3,3,male,27.0,0,0,315154,8.6625,,S
4,3,female,22.0,1,1,3101298,12.2875,,S


In [273]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [274]:
train['Sex'].isnull().sum()

0

In [275]:
test["Sex"].isnull().sum()

0

In [276]:
train["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [277]:
pd.crosstab(train["Survived"],train["Sex"])

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,81,468
1,233,109


In [278]:
pd.crosstab(train["Survived"],train["Embarked"])

Embarked,C,Q,S
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,75,47,427
1,93,30,217


In [279]:
# Encoding, Male = 0 and Female =1
def sex_encoder(data):
    mapper={"male":0,"female":1}
    data['Sex'] = data['Sex'].replace(mapper)
    return data

In [280]:
# Lets Encode using the Encode we have set up Already!
sex_encoder(train)

  data['Sex'] = data['Sex'].replace(mapper)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,0,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,1,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,1,35.0,1,0,113803,53.1000,C123,S
4,0,3,0,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,211536,13.0000,,S
887,1,1,1,19.0,0,0,112053,30.0000,B42,S
888,0,3,1,,1,2,W./C. 6607,23.4500,,S
889,1,1,0,26.0,0,0,111369,30.0000,C148,C


In [281]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,0,22.0,1,0,A/5 21171,7.25,,S
1,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,1,35.0,1,0,113803,53.1,C123,S
4,0,3,0,35.0,0,0,373450,8.05,,S


In [282]:
#LETS ENCODE THE TEST DATA SET
sex_encoder(test)

  data['Sex'] = data['Sex'].replace(mapper)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,0,34.5,0,0,330911,7.8292,,Q
1,3,1,47.0,1,0,363272,7.0000,,S
2,2,0,62.0,0,0,240276,9.6875,,Q
3,3,0,27.0,0,0,315154,8.6625,,S
4,3,1,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...
413,3,0,,0,0,A.5. 3236,8.0500,,S
414,1,1,39.0,0,0,PC 17758,108.9000,C105,C
415,3,0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,3,0,,0,0,359309,8.0500,,S


In [283]:
train["Age"].isnull().sum()

177

In [284]:
test["Age"].isnull().sum()

86

In [285]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='median')
train['Age'] = imputer.fit_transform(train[['Age']])
test['Age'] = imputer.fit_transform(test[['Age']])

In [286]:
train['Age'].head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [287]:
train['Age'].isnull().sum()

0

In [288]:
test['Age'].head()

0    34.5
1    47.0
2    62.0
3    27.0
4    22.0
Name: Age, dtype: float64

In [289]:
test['Age'].isnull().sum()

0

In [290]:
sns.kdeplot(x=train['Age'],data=train)

<Axes: xlabel='Pclass', ylabel='count'>

In [291]:
train['SibSp'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 891 entries, 0 to 890
Series name: SibSp
Non-Null Count  Dtype
--------------  -----
891 non-null    int64
dtypes: int64(1)
memory usage: 7.1 KB


In [292]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int64  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     204 non-null    object 
 9   Embarked  889 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


In [293]:
train['SibSp'].value_counts()

SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64

In [294]:
train['SibSp'].isnull().sum()

0

In [295]:
pd.crosstab(train['Survived'],train['SibSp'])

SibSp,0,1,2,3,4,5,8
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,398,97,15,12,15,5,7
1,210,112,13,4,3,0,0


In [296]:
pd.crosstab(train['SibSp'],train['Survived'])

Survived,0,1
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,398,210
1,97,112
2,15,13
3,12,4
4,15,3
5,5,0
8,7,0


In [297]:
train.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [298]:
train['Parch'].head()

0    0
1    0
2    0
3    0
4    0
Name: Parch, dtype: int64

In [299]:
train['Parch'].tail()

886    0
887    0
888    2
889    0
890    0
Name: Parch, dtype: int64

In [300]:
train['Parch'].value_counts()

Parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64

In [301]:
pd.crosstab(train['Parch'],train['Survived'])

Survived,0,1
Parch,Unnamed: 1_level_1,Unnamed: 2_level_1
0,445,233
1,53,65
2,40,40
3,2,3
4,4,0
5,4,1
6,1,0


In [302]:
pd.crosstab(train['Survived'],train['Parch'])

Parch,0,1,2,3,4,5,6
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,445,53,40,2,4,4,1
1,233,65,40,3,0,1,0


In [303]:
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [304]:
train.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,29.361582,0.523008,0.381594,32.204208
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,0.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [305]:
train['Parch'].describe()

count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64

In [306]:
train['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [307]:
sns.countplot(x=train['SibSp'],hue=train['Survived'],data=train)

<Axes: xlabel='Pclass', ylabel='count'>

In [308]:
sns.countplot(x=train['Parch'],hue=train['Survived'],data=train)

<Axes: xlabel='Pclass', ylabel='count'>

In [309]:
## Lets combine Parch and SibSp to make Family

In [310]:
train['Family'] = train['Parch'] + train['SibSp']

In [311]:
test['Family'] = test['Parch'] + test['SibSp']

In [312]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,0,3,0,22.0,1,0,A/5 21171,7.25,,S,1
1,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,1,1,1,35.0,1,0,113803,53.1,C123,S,1
4,0,3,0,35.0,0,0,373450,8.05,,S,0


In [313]:
sns.countplot(x=train['Family'],data=train)

<Axes: xlabel='Pclass', ylabel='count'>

In [314]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,0,3,0,22.0,1,0,A/5 21171,7.25,,S,1
1,1,1,1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,1,1,1,35.0,1,0,113803,53.1,C123,S,1
4,0,3,0,35.0,0,0,373450,8.05,,S,0


In [315]:
train.drop(['Ticket'],axis=1,inplace=True)

In [316]:
test.drop(['Ticket'],axis=1,inplace=True)

In [317]:
train['Fare'].isnull().sum()

0

In [318]:
test['Fare'].isnull().sum()

1

In [319]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='median')
test['Fare'] = imputer.fit_transform(test[['Fare']])

In [320]:
test['Fare'].isnull().sum()

0

In [321]:
train['Fare'].isnull().sum()

0

In [322]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Family
0,0,3,0,22.0,1,0,7.25,,S,1
1,1,1,1,38.0,1,0,71.2833,C85,C,1
2,1,3,1,26.0,0,0,7.925,,S,0
3,1,1,1,35.0,1,0,53.1,C123,S,1
4,0,3,0,35.0,0,0,8.05,,S,0


In [323]:
train.drop(['Cabin'],axis=1,inplace=True)

In [324]:
test.drop(['Cabin'],axis=1,inplace=True)

In [325]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family
0,0,3,0,22.0,1,0,7.25,S,1
1,1,1,1,38.0,1,0,71.2833,C,1
2,1,3,1,26.0,0,0,7.925,S,0
3,1,1,1,35.0,1,0,53.1,S,1
4,0,3,0,35.0,0,0,8.05,S,0


In [326]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family
0,3,0,34.5,0,0,7.8292,Q,0
1,3,1,47.0,1,0,7.0,S,1
2,2,0,62.0,0,0,9.6875,Q,0
3,3,0,27.0,0,0,8.6625,S,0
4,3,1,22.0,1,1,12.2875,S,2


In [327]:
train['Embarked'].isnull().sum()

2

In [328]:
test['Embarked'].isnull().sum()

0

In [329]:
train['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [330]:
#Lets Fill the Missing Values of Test with C

In [331]:
train['Embarked'].fillna(value="C",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Embarked'].fillna(value="C",inplace=True)


In [332]:
train.fillna({'Embarked':'C'},inplace=True)

In [333]:
train['Embarked'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 891 entries, 0 to 890
Series name: Embarked
Non-Null Count  Dtype 
--------------  ----- 
891 non-null    object
dtypes: object(1)
memory usage: 7.1+ KB


In [334]:
train['Embarked'].describe()

count     891
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [335]:
pd.crosstab(train['Embarked'],train['Survived'])

Survived,0,1
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,75,95
Q,47,30
S,427,217


In [336]:
pd.crosstab(train['Survived'],train['Embarked'])

Embarked,C,Q,S
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,75,47,427
1,95,30,217


In [337]:
### Lets Turn Embarked Into separate columns and convert to Numbers

def embarked_encoder(data):
    df=pd.get_dummies(data=data["Embarked"],prefix='Embarked')
    data=pd.concat([data,df],axis=1) 
    data.drop(["Embarked"],axis=1,inplace=True)
    return data
    
train=embarked_encoder(train)
test=embarked_encoder(test)

In [338]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,1,False,False,True
1,1,1,1,38.0,1,0,71.2833,1,True,False,False
2,1,3,1,26.0,0,0,7.925,0,False,False,True
3,1,1,1,35.0,1,0,53.1,1,False,False,True
4,0,3,0,35.0,0,0,8.05,0,False,False,True


In [339]:
### Lets turn the True to one and False to 0

In [342]:
def number_encoder(data):
    mapper={"False":0,"True":1}
    data['Embarked_C'] = data['Embarked_C'].replace(mapper)
    return data

In [343]:
number_encoder(train)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.2500,1,False,False,True
1,1,1,1,38.0,1,0,71.2833,1,True,False,False
2,1,3,1,26.0,0,0,7.9250,0,False,False,True
3,1,1,1,35.0,1,0,53.1000,1,False,False,True
4,0,3,0,35.0,0,0,8.0500,0,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,0,False,False,True
887,1,1,1,19.0,0,0,30.0000,0,False,False,True
888,0,3,1,28.0,1,2,23.4500,3,False,False,True
889,1,1,0,26.0,0,0,30.0000,0,True,False,False


In [344]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,1,False,False,True
1,1,1,1,38.0,1,0,71.2833,1,True,False,False
2,1,3,1,26.0,0,0,7.925,0,False,False,True
3,1,1,1,35.0,1,0,53.1,1,False,False,True
4,0,3,0,35.0,0,0,8.05,0,False,False,True


In [345]:
def number_encoderQ(data):
    mapper={"False":0,"True":1}
    data['Embarked_Q'] = data['Embarked_Q'].replace(mapper)
    return data

In [348]:
number_encoderQ(train)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.2500,1,False,False,True
1,1,1,1,38.0,1,0,71.2833,1,True,False,False
2,1,3,1,26.0,0,0,7.9250,0,False,False,True
3,1,1,1,35.0,1,0,53.1000,1,False,False,True
4,0,3,0,35.0,0,0,8.0500,0,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,0,False,False,True
887,1,1,1,19.0,0,0,30.0000,0,False,False,True
888,0,3,1,28.0,1,2,23.4500,3,False,False,True
889,1,1,0,26.0,0,0,30.0000,0,True,False,False


In [349]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.0,1,0,7.25,1,False,False,True
1,1,1,1,38.0,1,0,71.2833,1,True,False,False
2,1,3,1,26.0,0,0,7.925,0,False,False,True
3,1,1,1,35.0,1,0,53.1,1,False,False,True
4,0,3,0,35.0,0,0,8.05,0,False,False,True


In [350]:
train_x = train[["Pclass","Sex","Age","Family","Fare"]]
train_y = train[["Survived"]]

In [351]:
from sklearn.model_selection import train_test_split

In [367]:
tr_x,cv_x,tr_y,cv_y = train_test_split(train_x,train_y,test_size=0.30)

In [368]:
tr_x.head()

Unnamed: 0,Pclass,Sex,Age,Family,Fare
632,1,0,32.0,0,30.5
245,1,0,44.0,2,90.0
541,3,1,9.0,6,31.275
579,3,0,32.0,0,7.925
580,2,1,25.0,2,30.0


In [369]:
tr_y.head()

Unnamed: 0,Survived
632,1
245,0
541,0
579,1
580,1


In [370]:
tr_x["Family"].unique()

array([ 0,  2,  6,  5,  1,  4,  3,  7, 10])

# THE MACHINE LEARNING ALGORITHMS

In [371]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [372]:
rf = RandomForestClassifier ()

In [373]:
lgr = LogisticRegression ()

# Fitting The Model

In [374]:
rf.fit(tr_x,tr_y)

  return fit_method(estimator, *args, **kwargs)


In [375]:
lgr.fit(tr_x,tr_y)

  y = column_or_1d(y, warn=True)


# Accuracy of The Model

In [376]:
accuracy_rf = rf.score(cv_x,cv_y)

In [377]:
accuracy_rf

0.8246268656716418

In [378]:
accuracy_lgr = lgr.score(cv_x,cv_y)

In [379]:
accuracy_lgr

0.8059701492537313

# Lets Predict

In [380]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Embarked_C,Embarked_Q,Embarked_S
0,3,0,34.5,0,0,7.8292,0,False,True,False
1,3,1,47.0,1,0,7.0,1,False,False,True
2,2,0,62.0,0,0,9.6875,0,False,True,False
3,3,0,27.0,0,0,8.6625,0,False,False,True
4,3,1,22.0,1,1,12.2875,2,False,False,True


In [381]:
test_x = test[["Pclass","Sex","Age","Family","Fare"]]

In [382]:
test_x.head()

Unnamed: 0,Pclass,Sex,Age,Family,Fare
0,3,0,34.5,0,7.8292
1,3,1,47.0,1,7.0
2,2,0,62.0,0,9.6875
3,3,0,27.0,0,8.6625
4,3,1,22.0,2,12.2875


### Predict with The Random Forest

In [384]:
rf_prediction = rf.predict(test_x)

In [385]:
rf_prediction

array([0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,

###  This is the Survival prediction, so , Lets Get this values into the Survived Column

In [387]:
test["Survived_from_prediction"] = rf_prediction

In [388]:
test["Survived_from_prediction"].head()

0    0
1    0
2    1
3    1
4    1
Name: Survived_from_prediction, dtype: int64

In [389]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Embarked_C,Embarked_Q,Embarked_S,Survived_from_prediction
0,3,0,34.5,0,0,7.8292,0,False,True,False,0
1,3,1,47.0,1,0,7.0,1,False,False,True,0
2,2,0,62.0,0,0,9.6875,0,False,True,False,1
3,3,0,27.0,0,0,8.6625,0,False,False,True,1
4,3,1,22.0,1,1,12.2875,2,False,False,True,1


### Lets save into a CSV file

In [392]:
submission = pd.DataFrame({"Survived":rf_prediction})

In [393]:
submission

Unnamed: 0,Survived
0,0
1,0
2,1
3,1
4,1
...,...
413,0
414,1
415,0
416,0


In [394]:
submission.to_csv("Submission_02.csv",index=False)

## LETS SAVE OUR MODEL 

In [395]:
import pickle

In [396]:
pickle_out = open("titanic_model","wb")
pickle.dump(lgr,pickle_out)
pickle_out.close()