In [4]:
file = 'https://drive.google.com/file/d/18b5oqoSnSQ7d28vnv8RMgan2tI1--uLs/view?usp=drive_link'

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [43]:
df = pd.read_csv('train.csv')

In [44]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
# make index start from 1
df.index = df.index + 1

In [15]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
# check data types and not non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 1 to 891
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [17]:
# isna(), isnull()
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [20]:
df.shape

(891, 12)

With handling missing values, we could drop off all rows with missing values, this method is faster but consider the data that could be lost in the process.

In [18]:
new_df = df.dropna()

In [19]:
new_df.shape

(183, 12)

In [None]:
new_df.isna().sum()

Using a statistical measure like the mean or median to fill up missing values is recommended for numeric data like missing age records.

In [21]:
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)

In [22]:
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [23]:
df['Cabin'].value_counts()

Unnamed: 0_level_0,count
Cabin,Unnamed: 1_level_1
B96 B98,4
G6,4
C23 C25 C27,4
C22 C26,3
F33,3
...,...
E34,1
C7,1
C54,1
E36,1


Cabin records are categorical: they are not numbers but distict categories. in this case, the mode could be suitable.

In [24]:
df['Cabin'].mode()

Unnamed: 0,Cabin
0,B96 B98
1,C23 C25 C27
2,G6


The column is multi-modal (more than one mode) in this case you may say it is 'tri-modal'.
`NOTE:`ploting a distribution will make this detail more visual.

In [None]:
df.Fare.max()

Certain columns could be dropped off, due to their volume of missing values.

In [25]:
df.drop('Cabin', axis=1, inplace=True)

In [26]:
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


It is important to consider certain relationships between the columns, they may give us a clue as to what is suitable to fill in the missing values with.

In [27]:
# fill up missing values: missing values in the embarked column
df[df.Embarked.isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
62,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
830,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


In [28]:
df['Embarked'].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
S,644
C,168
Q,77


In [32]:
df.Fare.value_counts().sort_index()

Unnamed: 0_level_0,count
Fare,Unnamed: 1_level_1
0.0000,15
4.0125,1
5.0000,1
6.2375,1
6.4375,1
...,...
227.5250,4
247.5208,2
262.3750,2
263.0000,4


Couuld passengers who paid the same amount 'Fare' may have also shared a common embarked port?Ticket numbers are also similar.

In [29]:
df[(df.Fare >= 80.0) & (df.Fare < 90.0)]['Embarked']#.dropna().mode()

Unnamed: 0,Embarked
35,C
62,
63,S
231,S
258,S
311,C
376,C
446,S
454,C
505,S


In [40]:
df[(df.Fare >= 80.0) & (df.Fare < 85.0)]['Embarked'].value_counts()

Unnamed: 0_level_0,count
Embarked,Unnamed: 1_level_1
C,5
S,3


In [41]:
df.Embarked.fillna('C', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Embarked.fillna('C', inplace=True)


In [42]:
df.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [None]:
# age
df['Age'].mean()

In [None]:
df['Age'].median()

In [None]:
median_age = df['Age'].median()
df['Age'].fillna(median_age)

Fowardfill and Backfill methods: consider the previous and subsequent values respectively and use them to fill up missing values accordingly.

In [45]:
# forward fill, backward fill
# take a sample of our age data
df_sample = df.sample(30)

In [46]:
df_sample.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,7
SibSp,0
Parch,0
Ticket,0
Fare,0


In [48]:
df_sample[df_sample['Age'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
250,251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S
126,127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q
790,791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
300,301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q
444,445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S
304,305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S
284,285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26.0,A19,S


In [52]:
# Apply foward fill
df_sample['Age']. bfill()

Unnamed: 0,Age
427,19.0
671,31.0
435,14.0
644,0.75
132,47.0
722,34.0
188,40.0
250,4.0
126,4.0
445,4.0


In [51]:
df_sample

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
427,428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louis...",female,19.0,0,0,250655,26.0,,S
671,672,0,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0,B71,S
435,436,1,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0,B96 B98,S
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
132,133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47.0,1,0,A/5. 3337,14.5,,S
722,723,0,2,"Gillespie, Mr. William Henry",male,34.0,0,0,12233,13.0,,S
188,189,0,3,"Bourke, Mr. John",male,40.0,1,1,364849,15.5,,Q
250,251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S
126,127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q
445,446,1,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S


Additional Resources:
[Techniques to Handle Missing Data](https://www.datacamp.com/tutorial/techniques-to-handle-missing-data-values)
