# Missing Data

In [1]:
import pandas as pd
import seaborn

In [2]:
# Load seaborn's bundled Titanic example dataset as a DataFrame.
data = seaborn.load_dataset(name='titanic')

In [3]:
# Per-column non-null counts and dtypes; a count below the number of
# entries means that column has missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [4]:
# Take an independent copy of four columns so later edits leave `data` unchanged.
subset = data.loc[:, ['pclass', 'age', 'fare', 'deck']].copy()

### Omitting missing data
By default, `dropna()` drops every row that contains at least one missing value; pass `how='all'` to drop only the rows in which all values are missing.

In [7]:
# Keep only the rows in which every column is observed;
# axis=0 / how='any' spell out the dropna() defaults.
full_obs = subset.dropna(axis=0, how='any')
full_obs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184 entries, 1 to 889
Data columns (total 4 columns):
pclass    184 non-null int64
age       184 non-null float64
fare      184 non-null float64
deck      184 non-null category
dtypes: category(1), float64(2), int64(1)
memory usage: 6.3 KB


### Filling missing values

In [8]:
# Impute missing ages with the median of the observed ages
# (median() skips missing values by default).
median = subset['age'].median()
subset['age'] = subset['age'].fillna(median)

In [9]:
# Mean age in the original data (missing values are skipped by default).
data['age'].mean()

29.69911764705882

In [10]:
# Mean age in the subset, whose missing ages were filled with the median.
subset['age'].mean()

29.36158249158249

### Other methods of filling missing data

In [11]:
# Work on an independent copy of the age column so `data` stays untouched.
ages = data.loc[:, 'age'].copy()

In [14]:
# Backward fill: each missing age takes the next valid value that follows it.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in pandas 2.1+ and emits a FutureWarning.
ages.bfill().head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5    54.0
6    54.0
7     2.0
8    27.0
9    14.0
Name: age, dtype: float64

### You may also be interested in masks that detect null or non-null values

In [19]:
# Boolean Series: True where age is present, False where it is missing.
mask = data['age'].notnull()
mask.head(5)

0    True
1    True
2    True
3    True
4    True
Name: age, dtype: bool

In [20]:
# Tally the mask: True counts rows with an age, False counts rows missing one.
mask.value_counts()

True     714
False    177
Name: age, dtype: int64

In [21]:
# Invert the mask to select the rows whose age is missing.
data.loc[~mask]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
29,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
31,1,1,female,,1,0,146.5208,C,First,woman,False,B,Cherbourg,yes,False
32,1,3,female,,0,0,7.7500,Q,Third,woman,False,,Queenstown,yes,True
36,1,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,yes,True
42,0,3,male,,0,0,7.8958,C,Third,man,True,,Cherbourg,no,True
