1. https://towardsdatascience.com/7-ways-to-handle-missing-values-in-machine-learning-1a6326adf79e
2. https://towardsdatascience.com/how-to-handle-missing-values-in-python-23407781b2b0

In [38]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [39]:
df=sns.load_dataset('titanic')

### Identify missing values in each column

In [40]:
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [41]:
round(df.isna().mean(),2) ## age has 22 % and deck has 77% missing values

survived       0.00
pclass         0.00
sex            0.00
age            0.20
sibsp          0.00
parch          0.00
fare           0.00
embarked       0.00
class          0.00
who            0.00
adult_male     0.00
deck           0.77
embark_town    0.00
alive          0.00
alone          0.00
dtype: float64

### Delete the rows has null values

In [42]:
df.dropna()  ## it will drop all the null values

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


###  Drop the columns has more than 70% null values

In [43]:
df.dropna(axis=1,thresh=int(df.shape[0]*0.3))  # thresh ==> how many non null values allowed

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


### Replace with mean, mode, median

In [44]:
df['age'].fillna(value=np.mean(df.age)).tail()

886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, dtype: float64

In [45]:
df['age'].fillna(np.median(df.age.dropna())).tail()

886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: age, dtype: float64

In [46]:
import statistics
df['age'].fillna(statistics.mode(df.age)).tail()

886    27.0
887    19.0
888    24.0
889    26.0
890    32.0
Name: age, dtype: float64

### Fill missing values by interpolation

In [47]:
df.age.interpolate(limit=None).tail()  ## interpolate by linear

886    27.0
887    19.0
888    22.5
889    26.0
890    32.0
Name: age, dtype: float64

In [48]:
pd.date_range(pd.to_datetime('01/01/2020'),pd.to_datetime('28/01/2020'))
new=df.head(28)
new.index=pd.date_range(pd.to_datetime('01/01/2020'),pd.to_datetime('28/01/2020'))

In [49]:
new.age.interpolate(method='time').tail() # interpolate by time

2020-01-24    28.0
2020-01-25     8.0
2020-01-26    38.0
2020-01-27    28.5
2020-01-28    19.0
Freq: D, Name: age, dtype: float64

### Replace with most frequent category

In [50]:
df['deck'].fillna(df['deck'].value_counts().index[0]).head()

0    C
1    C
2    C
3    C
4    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [51]:
df.apply(lambda x:x.fillna(x.value_counts().index[0])).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,C,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,C,Southampton,no,True


### Add another category or arbitrary value

In [86]:
df.age.fillna(999).head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [87]:
df.deck.astype('string').fillna('Z').head()

0    Z
1    C
2    Z
3    C
4    Z
Name: deck, dtype: string

### Add a column to capture the NA values

In [91]:
df.age.isna().astype(int)

0      0
1      0
2      0
3      0
4      0
      ..
886    0
887    0
888    1
889    0
890    0
Name: age, Length: 891, dtype: int32

### Random sampling

In [231]:
missing=pd.Series([np.nan]).astype('string')[0]

df.deck.astype('string').apply(lambda x: np.random.choice(df.deck.dropna()) if x is missing else x)

0      B
1      C
2      C
3      C
4      E
      ..
886    B
887    B
888    C
889    C
890    F
Name: deck, Length: 891, dtype: object

###  MICE

https://towardsdatascience.com/how-to-handle-missing-values-in-python-23407781b2b0

In [243]:
Xtrain=df.drop('survived',axis=1).select_dtypes(np.number)  ## it works only on numeric type data and we 
                                                            ## need to convert it into metrics 

In [250]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(Xtrain)
new=pd.DataFrame(imp.transform(Xtrain),columns=Xtrain.columns)
new.head()

Unnamed: 0,pclass,age,sibsp,parch,fare
0,3.0,22.0,1.0,0.0,7.25
1,1.0,38.0,1.0,0.0,71.2833
2,3.0,26.0,0.0,0.0,7.925
3,1.0,35.0,1.0,0.0,53.1
4,3.0,35.0,0.0,0.0,8.05


### Knn imputer

In [261]:
import numpy as np
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=2) #define the k nearest neighbors   
pd.DataFrame(imputer.fit_transform(df.select_dtypes(include=np.number)),columns=df.select_dtypes(include=np.number).columns)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0.0,3.0,22.0,1.0,0.0,7.2500
1,1.0,1.0,38.0,1.0,0.0,71.2833
2,1.0,3.0,26.0,0.0,0.0,7.9250
3,1.0,1.0,35.0,1.0,0.0,53.1000
4,0.0,3.0,35.0,0.0,0.0,8.0500
...,...,...,...,...,...,...
886,0.0,2.0,27.0,0.0,0.0,13.0000
887,1.0,1.0,19.0,0.0,0.0,30.0000
888,0.0,3.0,33.0,1.0,2.0,23.4500
889,1.0,1.0,26.0,0.0,0.0,30.0000


###  Use those Algorithms which support the Missing values 

1. random_forest
2. knn
3. naive bayes