### Dealing with missing data in Pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dummy_df = pd.read_csv('./datasets/dummy_data.csv')
dummy_df.head()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
3,4,Sophia,,143.0
4,5,Bob,,


In [3]:
dummy_df.describe()

Unnamed: 0,Sno,Age,Height(cm)
count,9.0,6.0,7.0
mean,5.0,28.166667,154.857143
std,2.738613,2.316607,7.174691
min,1.0,25.0,143.0
25%,3.0,26.5,151.0
50%,5.0,28.5,156.0
75%,7.0,29.75,160.0
max,9.0,31.0,163.0


In [4]:
dummy_df.isnull()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,True,True
5,False,False,False,False
6,False,False,False,False
7,False,False,False,True
8,False,False,True,False


In [5]:
# isna() produces the same boolean mask as isnull() in the previous cell: True marks a missing value.
dummy_df.isna()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,True,True
5,False,False,False,False
6,False,False,False,False
7,False,False,False,True
8,False,False,True,False


In [6]:
# notna() is the element-wise opposite of isna(): True marks a value that is present.
dummy_df.notna()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True
3,True,True,False,True
4,True,True,False,False
5,True,True,True,True
6,True,True,True,True
7,True,True,True,False
8,True,True,False,True


In [7]:
# Summing the boolean mask column-wise gives the number of missing values per column.
dummy_df.isna().sum()

Sno           0
Name          0
Age           3
Height(cm)    2
dtype: int64

In [8]:
dummy_df.dtypes

Sno             int64
Name           object
Age           float64
Height(cm)    float64
dtype: object

In [9]:
str_dummy_df = pd.read_csv('./datasets/dummy_str_data.csv')
str_dummy_df.head()

Unnamed: 0,Sno,Device_name,Device_description,Single-Use
0,1,Synringe,Used to inject medicine,True
1,2,Ventilator,Used to help patients breath,False
2,3,Surgical Gloves,,True
3,4,Stethescopes,,
4,5,Vials container,,


In [10]:
str_dummy_df.dtypes

Sno                    int64
Device_name           object
Device_description    object
Single-Use            object
dtype: object

In [11]:
str_dummy_df.describe()

Unnamed: 0,Sno
count,5.0
mean,3.0
std,1.581139
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [12]:
str_dummy_df.isnull()

Unnamed: 0,Sno,Device_name,Device_description,Single-Use
0,False,False,False,False
1,False,False,False,False
2,False,False,True,False
3,False,False,True,True
4,False,False,True,True


In [13]:
str_dummy_df.isna().sum()

Sno                   0
Device_name           0
Device_description    3
Single-Use            2
dtype: int64

In [14]:
str_dummy_df

Unnamed: 0,Sno,Device_name,Device_description,Single-Use
0,1,Synringe,Used to inject medicine,True
1,2,Ventilator,Used to help patients breath,False
2,3,Surgical Gloves,,True
3,4,Stethescopes,,
4,5,Vials container,,


In [15]:
# NOTE(review): the loaded frame contains a real 'Unnamed: 0' column (see the dtypes output below),
# presumably an index that was saved into the CSV -- consider index_col=0 if it is not needed.
time_df = pd.read_csv('./datasets/dummy_time.csv')
time_df.head()

Unnamed: 0.1,Unnamed: 0,Sno,Name,Age,Height(cm),birthday
0,0,1,John,25.0,160.0,1994-01-01
1,1,2,Jimmy,26.0,163.0,
2,2,3,Felicia,28.0,154.0,1995-01-01
3,3,4,Sophia,,143.0,
4,4,5,Bob,,,1994-01-01


In [16]:
time_df.dtypes

Unnamed: 0      int64
Sno             int64
Name           object
Age           float64
Height(cm)    float64
birthday       object
dtype: object

In [17]:
time_df.describe()

Unnamed: 0.1,Unnamed: 0,Sno,Age,Height(cm)
count,9.0,9.0,6.0,7.0
mean,4.0,5.0,28.166667,154.857143
std,2.738613,2.738613,2.316607,7.174691
min,0.0,1.0,25.0,143.0
25%,2.0,3.0,26.5,151.0
50%,4.0,5.0,28.5,156.0
75%,6.0,7.0,29.75,160.0
max,8.0,9.0,31.0,163.0


In [18]:
time_df.isna().sum()

Unnamed: 0    0
Sno           0
Name          0
Age           3
Height(cm)    2
birthday      3
dtype: int64

In [19]:
# Convert the 'birthday' strings (dtype object) to datetime64[ns]; the missing entries show up as NaT.
time_df['birthday'] = pd.to_datetime(time_df['birthday'])
time_df.head()

Unnamed: 0.1,Unnamed: 0,Sno,Name,Age,Height(cm),birthday
0,0,1,John,25.0,160.0,1994-01-01
1,1,2,Jimmy,26.0,163.0,NaT
2,2,3,Felicia,28.0,154.0,1995-01-01
3,3,4,Sophia,,143.0,NaT
4,4,5,Bob,,,1994-01-01


Missing values in a datetime column are represented as `NaT`.

In [20]:
time_df.dtypes

Unnamed: 0             int64
Sno                    int64
Name                  object
Age                  float64
Height(cm)           float64
birthday      datetime64[ns]
dtype: object

### Dealing with missing values

* Ignoring (dropping) the rows with missing values
* Imputing the missing values (mean, median or mode)

### Ignoring the row with missing data

In [21]:
dummy_df

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
3,4,Sophia,,143.0
4,5,Bob,,
5,6,Billy,30.0,156.0
6,7,Kate,31.0,160.0
7,8,Will,29.0,
8,9,Scott,,148.0


In [22]:
dummy_df.isna()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,True,True
5,False,False,False,False
6,False,False,False,False
7,False,False,False,True
8,False,False,True,False


### Dropping rows is not ideal, as much of the data in the DataFrame is lost

In [23]:
dummy_df.dropna()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
5,6,Billy,30.0,156.0
6,7,Kate,31.0,160.0


In [24]:
dummy_majority_df = pd.read_csv('./datasets/dummy_missing_majority.csv')
dummy_majority_df

Unnamed: 0,Sno,Name,Age,Height(cm),Marks(100),Country,City
0,1,John,25.0,160.0,80.0,USA,New York
1,2,Jimmy,26.0,163.0,,UK,London
2,3,Felicia,28.0,154.0,,USA,Miami
3,4,Sophia,,143.0,,,
4,5,Bob,,,,,
5,6,Billy,30.0,156.0,,France,Paris
6,7,Kate,31.0,160.0,,Italy,Rome
7,8,Will,29.0,,,Russia,Moscow
8,9,Scott,,148.0,,,


### most of the data is gone

In [25]:
dummy_majority_df.dropna()

Unnamed: 0,Sno,Name,Age,Height(cm),Marks(100),Country,City
0,1,John,25.0,160.0,80.0,USA,New York


In [26]:
# thresh=3 keeps only the rows that have at least 3 non-missing values.
dummy_majority_df.dropna(thresh=3)

Unnamed: 0,Sno,Name,Age,Height(cm),Marks(100),Country,City
0,1,John,25.0,160.0,80.0,USA,New York
1,2,Jimmy,26.0,163.0,,UK,London
2,3,Felicia,28.0,154.0,,USA,Miami
3,4,Sophia,,143.0,,,
5,6,Billy,30.0,156.0,,France,Paris
6,7,Kate,31.0,160.0,,Italy,Rome
7,8,Will,29.0,,,Russia,Moscow
8,9,Scott,,148.0,,,


In [27]:
# With axis=1 the threshold applies to columns: keep only columns with at least 3 non-missing values.
dummy_majority_df.dropna(axis=1, thresh=3)

Unnamed: 0,Sno,Name,Age,Height(cm),Country,City
0,1,John,25.0,160.0,USA,New York
1,2,Jimmy,26.0,163.0,UK,London
2,3,Felicia,28.0,154.0,USA,Miami
3,4,Sophia,,143.0,,
4,5,Bob,,,,
5,6,Billy,30.0,156.0,France,Paris
6,7,Kate,31.0,160.0,Italy,Rome
7,8,Will,29.0,,Russia,Moscow
8,9,Scott,,148.0,,


In [28]:
dummy_majority_df

Unnamed: 0,Sno,Name,Age,Height(cm),Marks(100),Country,City
0,1,John,25.0,160.0,80.0,USA,New York
1,2,Jimmy,26.0,163.0,,UK,London
2,3,Felicia,28.0,154.0,,USA,Miami
3,4,Sophia,,143.0,,,
4,5,Bob,,,,,
5,6,Billy,30.0,156.0,,France,Paris
6,7,Kate,31.0,160.0,,Italy,Rome
7,8,Will,29.0,,,Russia,Moscow
8,9,Scott,,148.0,,,


In [29]:
# axis=1 without a threshold drops every column that contains at least one missing value.
dummy_majority_df.dropna(axis=1)

Unnamed: 0,Sno,Name
0,1,John
1,2,Jimmy
2,3,Felicia
3,4,Sophia
4,5,Bob
5,6,Billy
6,7,Kate
7,8,Will
8,9,Scott


### dropping columns with 40% empty values

In [31]:
# `thresh` is the minimum number of NON-missing values a column needs in order to be kept,
# so it must be derived from the tolerated number of missing values, not used as the
# missing fraction itself. Columns with more than 40% missing values are dropped.
n_rows = len(dummy_majority_df)
max_missing = int(0.4 * n_rows)  # largest number of missing values a kept column may have
dummy_majority_df.dropna(axis=1, thresh=n_rows - max_missing)

Unnamed: 0,Sno,Name,Age,Height(cm),Country,City
0,1,John,25.0,160.0,USA,New York
1,2,Jimmy,26.0,163.0,UK,London
2,3,Felicia,28.0,154.0,USA,Miami
3,4,Sophia,,143.0,,
4,5,Bob,,,,
5,6,Billy,30.0,156.0,France,Paris
6,7,Kate,31.0,160.0,Italy,Rome
7,8,Will,29.0,,Russia,Moscow
8,9,Scott,,148.0,,


## 2. Imputing Values

* Filling with generic values

In [33]:
dummy_df

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
3,4,Sophia,,143.0
4,5,Bob,,
5,6,Billy,30.0,156.0
6,7,Kate,31.0,160.0
7,8,Will,29.0,
8,9,Scott,,148.0


In [35]:
# filling the missing values with `-1` below
dummy_df.fillna(-1)

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
3,4,Sophia,-1.0,143.0
4,5,Bob,-1.0,-1.0
5,6,Billy,30.0,156.0
6,7,Kate,31.0,160.0
7,8,Will,29.0,-1.0
8,9,Scott,-1.0,148.0


In [37]:
# Backward fill: each missing value is replaced by the next valid value further down the column
# (a missing value with no valid value after it stays NaN).
dummy_df.bfill()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
3,4,Sophia,30.0,143.0
4,5,Bob,30.0,156.0
5,6,Billy,30.0,156.0
6,7,Kate,31.0,160.0
7,8,Will,29.0,148.0
8,9,Scott,,148.0


In [39]:
# Forward fill: each missing value is replaced by the last valid value above it in the column.
dummy_df.ffill()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
3,4,Sophia,28.0,143.0
4,5,Bob,28.0,143.0
5,6,Billy,30.0,156.0
6,7,Kate,31.0,160.0
7,8,Will,29.0,160.0
8,9,Scott,29.0,148.0


### Filling with central tendencies

In [40]:
mean_age = dummy_df['Age'].mean()
mean_age

28.166666666666668

In [41]:
mean_height = dummy_df['Height(cm)'].mean()
mean_height

154.85714285714286

In [44]:
# Map each column name to its own fill value so Age and Height(cm) are each imputed with their own mean.
values = {'Age': mean_age, 'Height(cm)': mean_height}
dummy_df.fillna(value=values)

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
3,4,Sophia,28.166667,143.0
4,5,Bob,28.166667,154.857143
5,6,Billy,30.0,156.0
6,7,Kate,31.0,160.0
7,8,Will,29.0,154.857143
8,9,Scott,28.166667,148.0


## Imputing values based on condition

In [45]:
weight_df = pd.read_csv('./datasets/dummy_age_weight.csv')
weight_df

Unnamed: 0,Gender,Weight(kg)
0,Male,70.0
1,Female,55.0
2,Male,65.0
3,Female,
4,Female,60.0
5,Male,
6,Female,52.0
7,Female,53.0
8,Male,85.0
9,Male,75.0


In [49]:
weight_df.groupby('Gender')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f3948c47350>

### Iterating over the GroupBy object yields `(group key, DataFrame)` tuples

In [50]:
# Each item produced by iterating a GroupBy object is a (group key, sub-DataFrame) tuple.
gender_groups = weight_df.groupby('Gender')
for group in gender_groups:
    print(group)

('Female',     Gender  Weight(kg)
1   Female        55.0
3   Female         NaN
4   Female        60.0
6   Female        52.0
7   Female        53.0
11  Female        68.0)
('Male',    Gender  Weight(kg)
0    Male        70.0
2    Male        65.0
5    Male         NaN
8    Male        85.0
9    Male        75.0
10   Male         NaN)


In [52]:
 weight_df.groupby('Gender').transform(lambda x:x.fillna(x.mean()))

Unnamed: 0,Weight(kg)
0,70.0
1,55.0
2,65.0
3,57.6
4,60.0
5,73.75
6,52.0
7,53.0
8,85.0
9,75.0


In [54]:
# Manual cross-check of the male group mean, using the non-missing male weights.
male_weights = [70, 65, 85, 75]
male_mean = sum(male_weights) / len(male_weights)
male_mean

73.75

In [56]:
# Manual cross-check of the female group mean, using the non-missing female weights.
female_weights = [55, 60, 52, 53, 68]
female_mean = sum(female_weights) / len(female_weights)
female_mean

57.6