# Tutorial with the Titanic Dataset

In [1]:
# Importing necessary libraries 
import numpy as np # to work with numerical values from time to time
import pandas as pd# main library for cleaning and processing the dataset
import matplotlib.pyplot as plt # EDA (for visualizing the patterns through graphs)
import seaborn as sns # EDA (for visualizing the patterns through graphs) 

In [2]:
# Read both CSV files from the working directory in one statement:
# 'train.csv' becomes the `train` dataframe and 'test.csv' becomes the `test` dataframe.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# .shape is a (number of rows, number of columns) tuple.
# Each line is labelled with the dataset it actually describes
# (the second label previously said "train" while printing test.shape).
print('the shape of train dataset is', train.shape)
print('the shape of test dataset is', test.shape)

the shape of train dataset is (891, 12)
the shape of train dataset is (418, 11)


In [4]:
train.head(3) # first 3 rows of the train dataset (includes the Survived column)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [5]:
test.head(3) # first 3 rows of the test dataset (no Survived column here)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


### Understanding what we are supposed to do:


*What we have:*

The train dataset has an additional column, `Survived`, where 1 means the passenger survived and 0 means the passenger died.
  
*What we should do:*

The test dataset has no such column; we are expected to **predict** whether the passengers in the test dataset **survived or not**.

In [6]:
train.head(3) # show the first 3 rows of the train dataset again

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [7]:
# Per-column count of missing (NaN) values in each dataset.
# train is print()-ed explicitly because a cell only auto-displays its last expression.
print(train.isna().sum()) # missing values per column in train
test.isna().sum() # missing values per column in test (shown as the cell output)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
# Cabin has many missing values (687 of the 891 train rows), so let's look at the
# values that are present; value_counts() leaves out missing entries by default.
train['Cabin'].value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64

In [9]:
# Cabin is missing for most passengers and the values that do exist are mostly
# unique, so the column is dropped from both datasets.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because 'Cabin' has already been removed.
for df in (train, test): # same operation on both dataframes, modified in place
    df.drop(columns='Cabin', inplace=True, errors='ignore')

In [10]:
train.head(2) # Cabin should no longer appear among the train columns

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C


In [11]:
test.head(2) # Cabin should no longer appear among the test columns

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S


The Cabin column is now gone from both datasets.

# Splitting the text and taking necessary info from it

In [12]:
train.head() # first 5 rows; look at how the Name column is formatted

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S



We have a 'Name' column which is of course different for every single passenger *(meaning unique)*;

Such columns should be dropped because they are not useful;

But we can also see that they have their title in the text, such as Mr, Miss...

So we can first extract the title, then drop the 'Name' column;

Let's start!


In [13]:
# Names look like "Braund, Mr. Owen Harris": the title is the text after the
# first comma and before the following period.
# NOTE(review): the piece after the comma starts with a space, so every title is
# stored with a leading blank (' Mr', ' Miss', ...); append .str.strip() if exact
# matches such as 'Mr' are needed later.
train['title'] = (
    train['Name']
    .str.split(',').str[1]   # keep what follows the surname
    .str.split('.').str[0]   # keep what precedes the first period
)

In [14]:
train['title'].value_counts() # how many passengers carry each title

 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Mlle              2
 Major             2
 Col               2
 the Countess      1
 Capt              1
 Ms                1
 Sir               1
 Lady              1
 Mme               1
 Don               1
 Jonkheer          1
Name: title, dtype: int64

In [15]:
train.head() # the new title column now appears as the last column

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr


In [16]:
# The title has been extracted from Name above, so Name itself is removed from train (in place)
train.drop(columns='Name', inplace=True)

In [17]:
train.head() # Name is gone; title remains

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,S,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,S,Mr


In [18]:
# Repeat both steps for the test dataframe: pull the title out of Name
# (text after the first comma, before the following period), then drop Name.
test['title'] = (
    test['Name']
    .str.split(',').str[1]   # keep what follows the surname
    .str.split('.').str[0]   # keep what precedes the first period
)
test.drop(columns='Name', inplace=True)

In [19]:
test.head() # test now has a title column and no Name column

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,title
0,892,3,male,34.5,0,0,330911,7.8292,Q,Mr
1,893,3,female,47.0,1,0,363272,7.0,S,Mrs
2,894,2,male,62.0,0,0,240276,9.6875,Q,Mr
3,895,3,male,27.0,0,0,315154,8.6625,S,Mr
4,896,3,female,22.0,1,1,3101298,12.2875,S,Mrs


Done

### Age column

In [60]:
train.head() # current state of train before working on the Age column

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,S,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,S,Mr


# More insight to BoxPlots 