In [34]:
import pandas as pd
import numpy as np

When our data contains NaN values, we have a few options:

- Remove them
- Leave them
- Replace them with something else

In [35]:
# Location of the Titanic spreadsheet, relative to this notebook's directory.
path = "../../pandas-workout-data/data/titanic3.xls"

In [36]:
# Read the spreadsheet at `path` into a DataFrame and display it.
df = pd.read_excel(path)
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


Unlike many of the exercises in this book, this one has no obvious right or wrong answer. There are, of course, techniques for calculating values—such as the mean and mode for a column—but I hope you’ll consider not just how to make such calculations but also why you would do so and when it’s most appropriate.

Checking for the null values in the DataFrame


In [37]:
# Number of rows in the freshly loaded frame.
df.shape[0]

1309

In [38]:
# Boolean frame of the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,False,False,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,False,True,True,False
3,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,False,False,False,False,False,False,False,False,False,True,False,True,False,True
1305,False,False,False,False,True,False,False,False,False,True,False,True,True,True
1306,False,False,False,False,False,False,False,False,False,True,False,True,False,True
1307,False,False,False,False,False,False,False,False,False,True,False,True,True,True


In [39]:
# Column labels of df, returned as an Index object.
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [40]:
# Count the missing values in each column (True sums as 1).
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [41]:
# Keep only the column labels that contain at least one missing value.
df.columns[df.isna().any()]

Index(['age', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'], dtype='object')

Notice that the column names are stored in an Index object, which works similarly to a series object.

Deciding what to do with each NaN-containing column depends on various factors, including the type of data the column contains. Another factor is how many rows have null values. Two cases, fare and embarked, have one and two null rows, respectively. Given that our data frame has more than 1,300 rows, missing 1 or 2 of them won’t make a significant difference. So, I suggest that we remove those rows from the data frame.

In [42]:
# Drop every row that is missing fare or embarked; all other columns are
# ignored when deciding which rows to remove.
df = df.dropna(subset=['fare', 'embarked'], how='any')

In [43]:
# Row count after removing the rows with missing fare/embarked.
df.shape[0]

1306

In [44]:
# Preview the first five rows (five is head()'s default).
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


Using the mean age has some advantages: it won’t affect the mean age, although it will reduce the standard deviation. It’s not necessarily wrong, even though we know it’s not totally right. In another context, such as sales of a particular product in an online store, replacing missing values with the mean can sometimes work, especially if we have similar products with a similar sales history.

In any event, we can replace NaN in the age column as follows:

In [45]:
# Mean of the known ages. pandas skips NaN by default (spelled out here with
# skipna=True), so only the non-null values in the column contribute.
mean_age = df['age'].mean(skipna=True)

In [46]:
# Replace each missing age with the mean computed from the known ages.
df.loc[:, 'age'] = df['age'].fillna(value=mean_age)

In [47]:
# Columns that still contain NaN; age should no longer be listed.
df.columns[df.isna().any()]

Index(['cabin', 'boat', 'body', 'home.dest'], dtype='object')

In the end, we’ve replaced any NaN values in df['age'] with the mean of the existing values.

In [48]:
# First five rows after filling the age column.
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


Finally, we want to set the home.dest column similarly to what we did with the age column—but instead of using the mean, we’ll use the mode (i.e., the most common value). We’ll do this for two reasons: first, we can only calculate the mean from a numeric value, and the destination is a categorical/textual value. Second, given no other information, we may be able to assume that a passenger is going where most others are going. We may be wrong, but this is the least wrong choice we can make.

In [49]:
# mode() returns a Series of the most frequent value(s); label 0 is its first entry.
home_dest_modes = df['home.dest'].mode()
# NOTE: despite the "mean" in its name, this variable holds the mode; the name
# is kept because the following fill cell refers to it.
mean_homedest = home_dest_modes[0]
mean_homedest

'New York, NY'

In [50]:
# Replace each missing home.dest with the most common destination.
df.loc[:, 'home.dest'] = df['home.dest'].fillna(value=mean_homedest)

In [51]:
# Columns that still contain NaN; home.dest should no longer be listed.
df.columns[df.isna().sum().gt(0)]

Index(['cabin', 'boat', 'body'], dtype='object')

Beyond the exercise

In these tasks, we will do something I mentioned earlier: replace NaN values in the home.dest column with the most common value from that person’s embarked column. This will take several steps:

1. Create a series (most_common_destinations) in which the index contains the unique values from the embarked column and the values are the most common destination for each value of embarked.

In [52]:
# Reload the spreadsheet so this section starts again from the original data,
# discarding the drops and fills applied to df so far.
df = pd.read_excel(path)
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [53]:
# Frequency of each embarkation code; NaN is excluded (the default, made explicit).
df['embarked'].value_counts(dropna=True)

embarked
S    914
C    270
Q    123
Name: count, dtype: int64

In [54]:
# The index of value_counts() holds the distinct non-null embarked codes,
# ordered from most to least frequent; convert it to a NumPy array.
embarked_counts = df['embarked'].value_counts()
unique_values_embarked_column = embarked_counts.index.to_numpy()
unique_values_embarked_column

array(['S', 'C', 'Q'], dtype=object)

In [55]:
# Start with an empty list; one destination per embarked code is appended to it.
values_most_common_destination = list()

In [56]:
# Build the most common home.dest for each embarked code, in the same order as
# unique_values_embarked_column.
# The list is reset here so this cell is idempotent: re-running it on its own
# no longer appends a second copy of the results, which would make the list
# longer than the index it is later paired with.
values_most_common_destination = []
for embarked_value in unique_values_embarked_column:
    # Rows whose port of embarkation matches the current code.
    mask = df['embarked'] == embarked_value
    # Select rows and column in a single .loc call (no chained indexing).
    # value_counts() drops NaN and sorts by descending frequency, so the first
    # index label is the most common destination for this port.
    index_name = df.loc[mask, 'home.dest'].value_counts().index[0]
    values_most_common_destination.append(index_name)

In [57]:
# Display the collected destinations, one per embarked code.
values_most_common_destination

['New York, NY', 'New York, NY', 'Ireland Chicago, IL']

In [58]:
# Series mapping each embarked code (index) to its most common destination
# (value). The exercise text calls this series most_common_destinations.
new_serie = pd.Series(values_most_common_destination, index=unique_values_embarked_column)
new_serie

S           New York, NY
C           New York, NY
Q    Ireland Chicago, IL
dtype: object

Replace NaN values in the home.dest column with values from embarked. (Because values in embarked and home.dest are distinct, this is an OK middle step.)

In [59]:
# Number of missing home.dest values before any filling.
df['home.dest'].isna().sum()

np.int64(564)

In [60]:
# The chained assignment below raises a ChainedAssignmentError FutureWarning and a SettingWithCopyWarning:
#df.loc[:,'home.dest'][df['embarked'] == unique_values_embarked_column[0]] = df.loc[:,'home.dest'][df['embarked'] == unique_values_embarked_column[0]].replace(to_replace=np.nan, value=unique_values_embarked_column[0])

In [61]:
# Boolean Series: True for rows whose embarked code equals the first
# (most frequent) entry of unique_values_embarked_column.
mask = df['embarked'].eq(unique_values_embarked_column[0])
mask

0        True
1        True
2        True
3        True
4        True
        ...  
1304    False
1305    False
1306    False
1307    False
1308     True
Name: embarked, Length: 1309, dtype: bool

In [62]:
# df.loc[mask, 'home.dest'] = df.loc[mask, 'home.dest'].replace(
#     to_replace=np.nan,
#     value=unique_values_embarked_column[0])

In [63]:
# This is more efficient than the code above
#df.loc[mask, 'home.dest'] = df.loc[mask, 'home.dest'].fillna(unique_values_embarked_column[0]) 

In [None]:
# Missing home.dest count; unchanged here because no fill has been applied yet.
df['home.dest'].isna().sum()

np.int64(564)

In [65]:
# For every embarked code, fill the missing home.dest values of the matching
# rows with that code's most common destination.
for embarked_code, common_destination in new_serie.items():
    mask = df['embarked'] == embarked_code
    df.loc[mask, 'home.dest'] = df.loc[mask, 'home.dest'].fillna(common_destination)

In [66]:
# Missing home.dest values left after the per-port fill.
df['home.dest'].isna().sum()

np.int64(1)

In [67]:
# Show the mask entries for rows whose home.dest is still missing.
# NOTE: `mask` is whatever the fill loop assigned last, so this only reports
# whether those rows matched the final embarked code processed by the loop.
mask.loc[df['home.dest'].isna()]

168    False
Name: embarked, dtype: bool

In [69]:
# Inspect the row reported above as still missing home.dest. 168 is an index
# *label*, so look it up with .loc; .iloc would treat it as a position and only
# agrees with the label while the index is the default 0..n-1 range.
# In this row embarked is NaN, which is why no port-based destination could be
# assigned to home.dest.
df.loc[168]

pclass                         1
survived                       1
name         Icard, Miss. Amelie
sex                       female
age                         38.0
sibsp                          0
parch                          0
ticket                    113572
fare                        80.0
cabin                        B28
embarked                     NaN
boat                           6
body                         NaN
home.dest                    NaN
Name: 168, dtype: object

In [72]:
# Distinct values of the embarked column, NaN included. unique() is called on
# the Series here; a DataFrame does not offer this method.
embarked_column = df['embarked']
embarked_column.unique()

array(['S', 'C', nan, 'Q'], dtype=object)