### Dealing with missing data in Pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the sample dataset (Sno, Name, Age, Height(cm)) from a path relative to the notebook.
dummy_df = pd.read_csv('./datasets/dummy_data.csv')
# Preview the first rows; blank cells in the output are missing values.
dummy_df.head()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,1,John,25.0,160.0
1,2,Jimmy,26.0,163.0
2,3,Felicia,28.0,154.0
3,4,Sophia,,143.0
4,5,Bob,,


In [3]:
# Summary statistics for the numeric columns only (Name is left out).
# `count` is the number of non-missing values, so it differs per column:
# 9 for Sno, 6 for Age, 7 for Height(cm).
dummy_df.describe()

Unnamed: 0,Sno,Age,Height(cm)
count,9.0,6.0,7.0
mean,5.0,28.166667,154.857143
std,2.738613,2.316607,7.174691
min,1.0,25.0,143.0
25%,3.0,26.5,151.0
50%,5.0,28.5,156.0
75%,7.0,29.75,160.0
max,9.0,31.0,163.0


In [23]:
# Boolean mask with the same shape as the frame: True wherever a value is missing.
dummy_df.isnull()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,True,True
5,False,False,False,False
6,False,False,False,False
7,False,False,False,True
8,False,False,True,False


In [22]:
# `isna()` produces the same mask as `isnull()` for this frame
# (pandas documents `isnull` as an alias of `isna`).
dummy_df.isna()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,False,False,True,True
5,False,False,False,False
6,False,False,False,False
7,False,False,False,True
8,False,False,True,False


In [24]:
# Inverse mask: True wherever a value is present, False where it is missing.
dummy_df.notna()

Unnamed: 0,Sno,Name,Age,Height(cm)
0,True,True,True,True
1,True,True,True,True
2,True,True,True,True
3,True,True,False,True
4,True,True,False,False
5,True,True,True,True
6,True,True,True,True
7,True,True,True,False
8,True,True,False,True


In [21]:
# Summing the boolean mask column-wise counts each True as 1,
# giving the number of missing values per column.
dummy_df.isna().sum()

Sno           0
Name          0
Age           3
Height(cm)    2
dtype: int64

In [5]:
# Column dtypes as inferred by read_csv.
# Age and Height(cm) come back as float64 — presumably because they contain
# missing values, which pandas stores as the float NaN; Sno has none and is int64.
dummy_df.dtypes

Sno             int64
Name           object
Age           float64
Height(cm)    float64
dtype: object

In [6]:
# Load a second sample dataset made of text and True/False columns.
str_dummy_df = pd.read_csv('./datasets/dummy_str_data.csv')
# Preview the first rows; blank cells in the output are missing values.
str_dummy_df.head()

Unnamed: 0,Sno,Device_name,Device_description,Single-Use
0,1,Synringe,Used to inject medicine,True
1,2,Ventilator,Used to help patients breath,False
2,3,Surgical Gloves,,True
3,4,Stethescopes,,
4,5,Vials container,,


In [7]:
# Column dtypes as inferred by read_csv.
# Single-Use holds True/False values but is reported as object rather than bool —
# presumably because of its missing entries (TODO confirm).
str_dummy_df.dtypes

Sno                    int64
Device_name           object
Device_description    object
Single-Use            object
dtype: object

In [8]:
# Only Sno appears in the summary: it is the sole numeric column,
# and the object columns are skipped by the default describe().
str_dummy_df.describe()

Unnamed: 0,Sno
count,5.0
mean,3.0
std,1.581139
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [9]:
# Missing-value mask for the text/boolean frame: True wherever a value is missing.
str_dummy_df.isnull()

Unnamed: 0,Sno,Device_name,Device_description,Single-Use
0,False,False,False,False
1,False,False,False,False
2,False,False,True,False
3,False,False,True,True
4,False,False,True,True


In [14]:
# Number of missing values per column (each True in the mask counts as 1).
str_dummy_df.isna().sum()

Sno                   0
Device_name           0
Device_description    3
Single-Use            2
dtype: int64

In [11]:
# Display the whole frame.
# NOTE(review): the output matches the earlier `str_dummy_df.head()` preview row for row,
# so this cell looks redundant — confirm whether it is still needed.
str_dummy_df

Unnamed: 0,Sno,Device_name,Device_description,Single-Use
0,1,Synringe,Used to inject medicine,True
1,2,Ventilator,Used to help patients breath,False
2,3,Surgical Gloves,,True
3,4,Stethescopes,,
4,5,Vials container,,


In [15]:
# Load a third sample dataset that adds a `birthday` date column.
# NOTE(review): `time_df.dtypes` lists an `Unnamed: 0` int64 column running 0..8,
# which looks like a row index that was saved into the CSV. If it is not needed,
# passing `index_col=0` to read_csv would keep it out of the data — confirm against
# how the file was written before changing this.
time_df = pd.read_csv('./datasets/dummy_time.csv')
# Preview the first rows; blank cells in the output are missing values.
time_df.head()

Unnamed: 0.1,Unnamed: 0,Sno,Name,Age,Height(cm),birthday
0,0,1,John,25.0,160.0,1994-01-01
1,1,2,Jimmy,26.0,163.0,
2,2,3,Felicia,28.0,154.0,1995-01-01
3,3,4,Sophia,,143.0,
4,4,5,Bob,,,1994-01-01


In [16]:
# Column dtypes straight after loading: `birthday` is still object (plain text),
# not a datetime type.
time_df.dtypes

Unnamed: 0      int64
Sno             int64
Name           object
Age           float64
Height(cm)    float64
birthday       object
dtype: object

In [17]:
# Summary statistics for the numeric columns; the object columns
# (Name, birthday) are not included.
time_df.describe()

Unnamed: 0.1,Unnamed: 0,Sno,Age,Height(cm)
count,9.0,9.0,6.0,7.0
mean,4.0,5.0,28.166667,154.857143
std,2.738613,2.738613,2.316607,7.174691
min,0.0,1.0,25.0,143.0
25%,2.0,3.0,26.5,151.0
50%,4.0,5.0,28.5,156.0
75%,6.0,7.0,29.75,160.0
max,8.0,9.0,31.0,163.0


In [18]:
# Number of missing values per column; `birthday` has 3 missing entries.
time_df.isna().sum()

Unnamed: 0    0
Sno           0
Name          0
Age           3
Height(cm)    2
birthday      3
dtype: int64

In [19]:
# Parse the text `birthday` column into datetimes and rebind `time_df` to the result.
# The column keeps its position; missing entries show up as NaT in the preview.
time_df = time_df.assign(birthday=pd.to_datetime(time_df['birthday']))
time_df.head()

Unnamed: 0.1,Unnamed: 0,Sno,Name,Age,Height(cm),birthday
0,0,1,John,25.0,160.0,1994-01-01
1,1,2,Jimmy,26.0,163.0,NaT
2,2,3,Felicia,28.0,154.0,1995-01-01
3,3,4,Sophia,,143.0,NaT
4,4,5,Bob,,,1994-01-01


Missing values in a datetime column are represented as `NaT` ("Not a Time").

In [20]:
# Re-check the dtypes after the conversion: `birthday` is now datetime64[ns].
time_df.dtypes

Unnamed: 0             int64
Sno                    int64
Name                  object
Age                  float64
Height(cm)           float64
birthday      datetime64[ns]
dtype: object