In [191]:
import pandas as pd

In [192]:
df = pd.read_csv('titanic/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# EDA (Exploratory Data Analysis)

In [193]:
df.info()
# cabin has a lot of missing values, so we will drop it

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [194]:
df.Name.value_counts()
# use the name to find the title and use it as new feature
# the title is used to determine the age

Name
Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick                         1
Name: count, Length: 891, dtype: int64

In [195]:
df.Sex.value_counts()
# change to 0 and 1

Sex
male      577
female    314
Name: count, dtype: int64

In [196]:
df.Ticket.value_counts()
# not useful
# but we can use the length of the ticket to find the ticket class
# because it has 891 entries and i think it gives me a msg from that, so we can use it as a feature

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

In [197]:
df.Cabin.value_counts()

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [198]:
df.Embarked.value_counts()
# change to numeric

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [199]:
df.describe()
# age has missing values, we can use the mean to fill it

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Data Cleaning

In [200]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [201]:
# drop the cabin column, because it has a lot of missing values as i said before
df.drop('Cabin', axis=1, inplace=True)

In [202]:
# use the name to find the title and use it as new feature
# the title is used to determine the age
df['Title'] = df.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
df.Title.value_counts()


Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

In [203]:
# we can see that there are a lot of titles, so we will group them, and the titles with less than 10 will be grouped as rare

rare_titles =  df.Title.value_counts().loc[lambda x: x < 10]
df['Title'] = df['Title'].replace(rare_titles.index, 'Rare')

In [204]:
df.Title.value_counts()

Title
Mr        517
Miss      182
Mrs       125
Master     40
Rare       27
Name: count, dtype: int64

In [205]:
# change the title to numeric
title_mask = {
    'Mr': 0,
    'Miss': 1,
    'Mrs': 2,
    'Master': 3,
    'Rare': 4
}
df['Title'] = df['Title'].map(title_mask)

In [206]:
df.Title.value_counts()

Title
0    517
1    182
2    125
3     40
4     27
Name: count, dtype: int64

In [207]:
# change sex to numeric

sex_mask = {
    'male':0,
    'female':1
}

df["Sex"] = df.Sex.map(sex_mask)

In [208]:
df.Sex.value_counts()

Sex
0    577
1    314
Name: count, dtype: int64

In [209]:
ticket_mask = {t_name:t_val for t_name,t_val in df.Ticket.value_counts().items()}
df.Ticket.value_counts()

Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64

In [210]:
df['Ticket'] = df.Ticket.map(ticket_mask)

In [211]:
# 547 adalah total orang yang tidak mempunyai tujuan yang sama
# 188 adalah total orang yang setiap orangnya mempunyai tujuan yang sama dengan 1 orang lainnya
#  63 adalah total orang yang setiap orangnya mempunyai tujuan yang sama dengan 2 orang lainnya
# dst
df.Ticket.value_counts()
# sebenarnya fitur kolom ini tidak terlalu diperlukan

Ticket
1    547
2    188
3     63
4     44
7     21
6     18
5     10
Name: count, dtype: int64

In [212]:
embarked_mask = {
    'S': 0,
    'C': 1,
    'Q': 2,
}

df['Embarked'] = df.Embarked.map(embarked_mask)

In [213]:
df.Embarked.value_counts()

Embarked
0.0    644
1.0    168
2.0     77
Name: count, dtype: int64

In [214]:
df.Age.fillna(df.Age.mean(), inplace=True)

In [216]:
df.Embarked.fillna(df.Embarked.mode(), inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    int64  
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    float64
 11  Title        891 non-null    int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 83.7+ KB
