### Import Pandas and initialize the DataFrame

In [63]:
import pandas as pd
import numpy as np

table = {
    'name': ['James', 'Barra', 'Sarah', 'Bill', 'Peter', 'Chloe', 'Ben', 'Anna', 'Anna', np.nan],
    'gender': ['M', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'F', np.nan],
    'age': [27, np.nan, 34, 23, 27, 32, 34, 23, 23, np.nan],
    'weight': [75.1, np.nan, 63.5, 87.2, 75.1, 98.3, 63.5, 87.2, 87.2, np.nan],
    'height': ['Short', 'Short', 'Medium', 'Tall', 'Short', 'Short', 'Medium', 'Tall', 'Tall', np.nan],
    'income': [50, np.nan, 30, 35, np.nan, np.nan, 50, 40, 40, np.nan],
}

df = pd.DataFrame(table)
df

Unnamed: 0,name,gender,age,weight,height,income
0,James,M,27.0,75.1,Short,50.0
1,Barra,M,,,Short,
2,Sarah,F,34.0,63.5,Medium,30.0
3,Bill,M,23.0,87.2,Tall,35.0
4,Peter,M,27.0,75.1,Short,
5,Chloe,F,32.0,98.3,Short,
6,Ben,M,34.0,63.5,Medium,50.0
7,Anna,F,23.0,87.2,Tall,40.0
8,Anna,F,23.0,87.2,Tall,40.0
9,,,,,,


### Check missing/null values, and their count

In [64]:
df.isnull()

Unnamed: 0,name,gender,age,weight,height,income
0,False,False,False,False,False,False
1,False,False,True,True,False,True
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,True
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,True,True,True,True,True,True


In [65]:
df.isnull().sum()

name      1
gender    1
age       2
weight    2
height    1
income    4
dtype: int64

### Drop any rows containing missing data

In [66]:
df.dropna()

Unnamed: 0,name,gender,age,weight,height,income
0,James,M,27.0,75.1,Short,50.0
2,Sarah,F,34.0,63.5,Medium,30.0
3,Bill,M,23.0,87.2,Tall,35.0
6,Ben,M,34.0,63.5,Medium,50.0
7,Anna,F,23.0,87.2,Tall,40.0
8,Anna,F,23.0,87.2,Tall,40.0


### Drop rows whose every column is NaN

In [67]:
df.dropna(how='all')

Unnamed: 0,name,gender,age,weight,height,income
0,James,M,27.0,75.1,Short,50.0
1,Barra,M,,,Short,
2,Sarah,F,34.0,63.5,Medium,30.0
3,Bill,M,23.0,87.2,Tall,35.0
4,Peter,M,27.0,75.1,Short,
5,Chloe,F,32.0,98.3,Short,
6,Ben,M,34.0,63.5,Medium,50.0
7,Anna,F,23.0,87.2,Tall,40.0
8,Anna,F,23.0,87.2,Tall,40.0


### Drop rows that have MORE than 2 NaN

In [68]:
df.dropna(thresh=(len(df.columns) - 2))

Unnamed: 0,name,gender,age,weight,height,income
0,James,M,27.0,75.1,Short,50.0
2,Sarah,F,34.0,63.5,Medium,30.0
3,Bill,M,23.0,87.2,Tall,35.0
4,Peter,M,27.0,75.1,Short,
5,Chloe,F,32.0,98.3,Short,
6,Ben,M,34.0,63.5,Medium,50.0
7,Anna,F,23.0,87.2,Tall,40.0
8,Anna,F,23.0,87.2,Tall,40.0


### Drop rows that have NaN in a specific column

In [69]:
df.dropna(subset=['weight'])

Unnamed: 0,name,gender,age,weight,height,income
0,James,M,27.0,75.1,Short,50.0
2,Sarah,F,34.0,63.5,Medium,30.0
3,Bill,M,23.0,87.2,Tall,35.0
4,Peter,M,27.0,75.1,Short,
5,Chloe,F,32.0,98.3,Short,
6,Ben,M,34.0,63.5,Medium,50.0
7,Anna,F,23.0,87.2,Tall,40.0
8,Anna,F,23.0,87.2,Tall,40.0


### Replace NaN with a constant

In [70]:
df.fillna(0)

Unnamed: 0,name,gender,age,weight,height,income
0,James,M,27.0,75.1,Short,50.0
1,Barra,M,0.0,0.0,Short,0.0
2,Sarah,F,34.0,63.5,Medium,30.0
3,Bill,M,23.0,87.2,Tall,35.0
4,Peter,M,27.0,75.1,Short,0.0
5,Chloe,F,32.0,98.3,Short,0.0
6,Ben,M,34.0,63.5,Medium,50.0
7,Anna,F,23.0,87.2,Tall,40.0
8,Anna,F,23.0,87.2,Tall,40.0
9,0,0,0.0,0.0,0,0.0


### Impute mean, mode, and median

In [71]:
df['age'].fillna(df['age'].mean())

0    27.000
1    27.875
2    34.000
3    23.000
4    27.000
5    32.000
6    34.000
7    23.000
8    23.000
9    27.875
Name: age, dtype: float64

In [72]:
df['weight'].fillna(df['weight'].mode())

0    75.1
1     NaN
2    63.5
3    87.2
4    75.1
5    98.3
6    63.5
7    87.2
8    87.2
9     NaN
Name: weight, dtype: float64

In [73]:
df['income'].fillna(df['income'].median())

0    50.0
1    40.0
2    30.0
3    35.0
4    40.0
5    40.0
6    50.0
7    40.0
8    40.0
9    40.0
Name: income, dtype: float64

### Check for duplicates and drop them

In [76]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
9    False
dtype: bool

In [78]:
df.drop_duplicates()

Unnamed: 0,name,gender,age,weight,height,income
0,James,M,27.0,75.1,Short,50.0
1,Barra,M,,,Short,
2,Sarah,F,34.0,63.5,Medium,30.0
3,Bill,M,23.0,87.2,Tall,35.0
4,Peter,M,27.0,75.1,Short,
5,Chloe,F,32.0,98.3,Short,
6,Ben,M,34.0,63.5,Medium,50.0
7,Anna,F,23.0,87.2,Tall,40.0
9,,,,,,
