# Missing Data

In [2]:
import numpy as np
import pandas as pd

### `np.nan` represents missing data.

In [5]:
# Example frame used throughout: column A has one missing value,
# column B has two, and column C has none.
df = pd.DataFrame(
    {
        'A': [1.0, 2.0, np.nan],
        'B': [4.0, np.nan, np.nan],
        'C': [7.0, 8.0, 9.0],
    }
)
df

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,,8.0
2,,,9.0


### Drop rows or columns with missing data.

In [6]:
# Drop every row that contains at least one missing value.
# The result is a new frame; df itself is not modified.
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C
0,1.0,4.0,7.0


In [9]:
# Drop every column that contains at least one missing value.
# The result is a new frame; df itself is not modified.
df.dropna(axis='columns')

Unnamed: 0,C
0,7.0
1,8.0
2,9.0


In [10]:
# Keep only the rows that have at least two non-missing values.
# thresh counts non-NaN entries, so with this three-column frame the
# effect is to drop rows holding two or more NaNs.
df.dropna(axis='index', thresh=2)

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,,8.0


In [11]:
# Keep only the columns that have at least two non-missing values.
# thresh counts non-NaN entries, so with this three-row frame the
# effect is to drop columns holding two or more NaNs.
df.dropna(axis='columns', thresh=2)

Unnamed: 0,A,C
0,1.0,7.0
1,2.0,8.0
2,,9.0


### Fill in missing values.

pandas offers a variety of ways to fill in missing values instead of chucking them out. Which is the best one for a given situation is a philosophical problem not addressed here.

In [12]:
# Redisplay the frame: the dropna() calls above returned new frames,
# so df still contains its missing values.
df

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,,8.0
2,,,9.0


In [13]:
# Substitute the placeholder string FNORD for every NaN.
# The filled frame is returned as a new object; df is left as it was.
df.fillna('FNORD')

Unnamed: 0,A,B,C
0,1,4,7.0
1,2,FNORD,8.0
2,FNORD,FNORD,9.0


In [15]:
# Replace each NaN with the mean of its own column.
# df.mean() skips NaN values, so they count toward neither the sum nor n
# (column A's mean is (1.0 + 2.0) / 2 = 1.5).
df.fillna(df.mean())

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,4.0,8.0
2,1.5,4.0,9.0


In [20]:
# Do the above, but only in one column.
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on df['A']. The in-place form acts on an
# intermediate object obtained by column selection: recent pandas versions
# warn about it as chained assignment, and with Copy-on-Write enabled it
# leaves df unchanged.
df['A'] = df['A'].fillna(df['A'].mean())
df

Unnamed: 0,A,B,C
0,1.0,4.0,7.0
1,2.0,,8.0
2,1.5,,9.0
