# Missing Data

In [38]:
import numpy as np
import pandas as pd

In [39]:
# Toy dataset for the missing-data examples: five rows, four columns.
data = {
    'A': [1, 2, 3, 4, 5],                    # complete
    'B': [1, 2, 3, 4, 5],                    # complete
    'C': [1, 2, 3, np.nan, np.nan],          # last two values missing
    'D': [1, np.nan, np.nan, np.nan, 5],     # middle three values missing
}

In [40]:
# Build the DataFrame from the dict: each key becomes a column name.
# C and D display as floats (1.0, 2.0, ...) because they contain NaN.
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,
2,3,3,3.0,
3,4,4,,
4,5,5,,5.0


In [41]:
# Boolean mask with the same shape as df: True where a value is missing (NaN),
# False everywhere else.
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,False,False,False,True
3,False,False,True,True
4,False,False,True,False


In [42]:
# Pitfall: count() tallies non-null entries, and the True/False mask produced by
# isna() never contains nulls itself. The result is therefore just the number of
# rows (5) for every column -- NOT the number of missing values.
# Use .sum() on the mask to count NaNs per column.
df.isna().count()

A    5
B    5
C    5
D    5
dtype: int64

In [43]:
# True counts as 1 when summed, so this gives the number of missing values
# in each column.
df.isna().sum()

A    0
B    0
C    2
D    3
dtype: int64

In [44]:
# Per column: is at least one value missing?
df.isna().any()

A    False
B    False
C     True
D     True
dtype: bool

## Removing Missing Data

In [45]:
# Show the original frame again as the starting point for the removal examples.
df

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,
2,3,3,3.0,
3,4,4,,
4,5,5,,5.0


In [46]:
# dropna() works row-wise by default: every row that contains at least one NaN
# is removed. Only row 0 is fully populated here, so it is the only survivor.
# The result is a new DataFrame.
df.dropna()

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0


In [47]:
# df still contains its NaNs: the dropna() call returned a new frame and
# did not modify the original.
df

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,
2,3,3,3.0,
3,4,4,,
4,5,5,,5.0


In [51]:
# thresh=3 keeps only the rows that have at least 3 non-NaN values.
# Row 3 has just two (A and B), so it is the only row dropped.
df.dropna(thresh=3)

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,
2,3,3,3.0,
4,5,5,,5.0


## Filling Missing Data

In [52]:
# Show the original frame again as the starting point for the fill examples.
df

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,
2,3,3,3.0,
3,4,4,,
4,5,5,,5.0


In [53]:
# Replace every NaN with 0. The result is a new DataFrame; df keeps its NaNs.
df.fillna(0)

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,0.0
2,3,3,3.0,0.0
3,4,4,0.0,0.0
4,5,5,0.0,5.0


*These calls return a new DataFrame and leave `df` unchanged. To keep the result, assign it back (e.g. `df = df.fillna(0)`) or pass `inplace=True`.*

In [55]:
# df is unchanged: the fillna(0) result above was displayed but never assigned.
df

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,
2,3,3,3.0,
3,4,4,,
4,5,5,,5.0


In [57]:
# Per-column fill values: each key names a column, and its value replaces that
# column's NaNs. A and B contain no NaNs, so their entries have no visible
# effect on this frame.
values = {'A': 0, 'B': 100, 'C': 300, 'D': 400}
df.fillna(values)

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,400.0
2,3,3,3.0,400.0
3,4,4,300.0,400.0
4,5,5,300.0,5.0


In [58]:
# df is still unchanged; show it again before the mean-fill example.
df

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,
2,3,3,3.0,
3,4,4,,
4,5,5,,5.0


In [59]:
# Fill each column's NaNs with that column's mean. mean() skips NaN values,
# so C uses (1 + 2 + 3) / 3 = 2.0 and D uses (1 + 5) / 2 = 3.0.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1,1,1.0,1.0
1,2,2,2.0,3.0
2,3,3,3.0,3.0
3,4,4,2.0,3.0
4,5,5,2.0,5.0
