# Session 1: Data Cleaning

## Handling missing data in a Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small numeric Series with a single missing entry (np.nan) to experiment on.
nos = pd.Series(data=[10, 20, 30, np.nan, 40])
nos

0    10.0
1    20.0
2    30.0
3     NaN
4    40.0
dtype: float64

## isna(), isnull(), notna(), notnull()

In [3]:
# Element-wise boolean mask: True where the value is missing (NaN).
nos.isna()

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [4]:
# isnull() is an alias of isna(); it yields the same mask.
nos.isnull()

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [5]:
# Inverse mask: True where a value is present (not NaN).
nos.notna()

0     True
1     True
2     True
3    False
4     True
dtype: bool

In [6]:
# notnull() is an alias of notna(); it yields the same mask.
nos.notnull()

0     True
1     True
2     True
3    False
4     True
dtype: bool

In [7]:
# Summing the boolean mask counts the missing values (True counts as 1).
nos.isna().sum()

1

In [8]:
# Number of values that are present (non-missing).
nos.notna().sum()

4

## dropna()

In [9]:
# Returns a new Series without the NaN entry; `nos` itself is left unchanged.
nos.dropna()

0    10.0
1    20.0
2    30.0
4    40.0
dtype: float64

In [10]:
nos

0    10.0
1    20.0
2    30.0
3     NaN
4    40.0
dtype: float64

## Handling null values in a DataFrame

In [11]:
# 4x5 frame mixing partially missing, fully missing and complete rows.
# Both np.nan and None are treated as missing values.
df = pd.DataFrame(
    data=[
        [10, 20, 30, np.nan, None],
        [np.nan, np.nan, np.nan, 40, 50],
        [np.nan] * 5,
        [10, 30, 50, 70, 90],
    ]
)

In [12]:
df

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [13]:
# Cell-wise boolean mask with the same shape as df: True marks a missing value.
df.isna()

Unnamed: 0,0,1,2,3,4
0,False,False,False,True,True
1,True,True,True,False,False
2,True,True,True,True,True
3,False,False,False,False,False


In [14]:
# sum() on the mask adds up each column, giving the number of missing values per column.
df.isna().sum()

0    2
1    2
2    2
3    2
4    2
dtype: int64

In [15]:
# With default arguments, every row containing at least one NaN is dropped;
# only the fully populated row 3 remains. A new frame is returned, df is not modified.
df.dropna()

Unnamed: 0,0,1,2,3,4
3,10.0,30.0,50.0,70.0,90.0


In [16]:
df

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [17]:
# how='any': drop a row if any of its values is NaN (same result as the default call).
df.dropna(how='any')

Unnamed: 0,0,1,2,3,4
3,10.0,30.0,50.0,70.0,90.0


In [18]:
# how='all': drop a row only if all of its values are NaN, so only row 2 is removed.
df.dropna(how='all')

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
3,10.0,30.0,50.0,70.0,90.0


In [19]:
# Dropping columns: axis=1 drops every column that contains at least one NaN.
# Each column of df has missing values, so no columns remain and only the index is left.

df.dropna(axis=1)

0
1
2
3


In [20]:
df

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [21]:
# Drop only columns that are entirely NaN; no column qualifies, so df comes back unchanged.
df.dropna(axis=1,how='all')

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [22]:
# Using threshold: thresh=N keeps only the rows that have at least N non-NaN values.
# Row 2 has no non-NaN value at all, so thresh=1 removes it.
df.dropna(thresh=1)

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
3,10.0,30.0,50.0,70.0,90.0


In [23]:
# Rows 0, 1 and 3 each have at least 2 non-NaN values, so they are kept.
df.dropna(thresh=2)

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
3,10.0,30.0,50.0,70.0,90.0


In [24]:
# Row 1 has only 2 non-NaN values, so it no longer meets the threshold of 3.
df.dropna(thresh=3)

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
3,10.0,30.0,50.0,70.0,90.0


In [25]:
# Only row 3 (5 non-NaN values) has at least 4 non-NaN values.
df.dropna(thresh=4)

Unnamed: 0,0,1,2,3,4
3,10.0,30.0,50.0,70.0,90.0


## Filling values

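Missing values can be filled in two ways: (1) with an explicit value (a constant, a per-column mapping, or a statistic such as the mean), or (2) with a fill method that propagates neighbouring values (forward fill / backward fill).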

In [26]:
# Using value
df

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [27]:
# Replace every NaN with the constant 100; a new frame is returned, df is not modified.
df.fillna(value=100)

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,100.0,100.0
1,100.0,100.0,100.0,40.0,50.0
2,100.0,100.0,100.0,100.0,100.0
3,10.0,30.0,50.0,70.0,90.0


In [28]:
df

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [29]:
# Same call with the fill value passed positionally instead of via value=.
df.fillna(100)

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,100.0,100.0
1,100.0,100.0,100.0,40.0,50.0
2,100.0,100.0,100.0,100.0,100.0
3,10.0,30.0,50.0,70.0,90.0


In [30]:
# Filling column-wise: the dict maps each column label to the value used for that column's NaNs.

df.fillna({0:100,1:200,2:300,3:400,4:500})


Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,400.0,500.0
1,100.0,200.0,300.0,40.0,50.0
2,100.0,200.0,300.0,400.0,500.0
3,10.0,30.0,50.0,70.0,90.0


In [31]:
df

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [32]:
# Fill the missing values of row 1 with 50.
# The result is a new Series for that row; df itself is not modified.
df.loc[1].fillna(value=50)

0    50.0
1    50.0
2    50.0
3    40.0
4    50.0
Name: 1, dtype: float64

In [33]:
# Fill the missing values of rows 1 and 2 with 100.
# The result is a new frame containing only those two rows; df itself is not modified.

df.loc[[1,2]].fillna(100)

Unnamed: 0,0,1,2,3,4
1,100.0,100.0,100.0,40.0,50.0
2,100.0,100.0,100.0,100.0,100.0


In [34]:
# Using method

df

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [35]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# Leading NaNs (row 0, columns 3 and 4) have nothing above them and stay NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'), which emits a FutureWarning.
df.ffill()


Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,10.0,20.0,30.0,40.0,50.0
2,10.0,20.0,30.0,40.0,50.0
3,10.0,30.0,50.0,70.0,90.0


In [36]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill'), which emits a FutureWarning.
df.bfill()


Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,40.0,50.0
1,10.0,30.0,50.0,40.0,50.0
2,10.0,30.0,50.0,70.0,90.0
3,10.0,30.0,50.0,70.0,90.0


In [37]:
# Using statistics: df.mean() yields one mean per column (computed from the non-NaN values),
# and each column's NaNs are filled with that column's own mean.

df.fillna(value=df.mean())

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,55.0,70.0
1,10.0,25.0,40.0,40.0,50.0
2,10.0,25.0,40.0,55.0,70.0
3,10.0,30.0,50.0,70.0,90.0


In [38]:
# Fill each column's NaNs with that column's median. Every column here has only two
# observed values, so the median equals the mean and the result matches the mean-filled frame.
df.fillna(value=df.median())

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,55.0,70.0
1,10.0,25.0,40.0,40.0,50.0
2,10.0,25.0,40.0,55.0,70.0
3,10.0,30.0,50.0,70.0,90.0


## Data Transformation

In [39]:
# Two integer columns with some fully repeated rows, e.g. (1, 10) and (2, 20),
# used for the duplicate-handling examples.
df1 = pd.DataFrame(
    data={
        "C": [1, 2, 3, 4, 1, 2, 1, 4],
        "D": [10, 20, 30, 40, 10, 20, 40, 50],
    }
)

In [40]:
df1

Unnamed: 0,C,D
0,1,10
1,2,20
2,3,30
3,4,40
4,1,10
5,2,20
6,1,40
7,4,50


In [41]:
# True for a row that repeats an earlier row in full (all columns equal);
# the first occurrence itself is marked False.
df1.duplicated()

0    False
1    False
2    False
3    False
4     True
5     True
6    False
7    False
dtype: bool

In [42]:
# Remove fully repeated rows (4 and 5), keeping the first occurrence; original index labels are kept.
df1.drop_duplicates()

Unnamed: 0,C,D
0,1,10
1,2,20
2,3,30
3,4,40
6,1,40
7,4,50


In [43]:
# subset: only the listed columns are compared when looking for duplicates.
# Here rows are de-duplicated on column 'C' alone, keeping the first row for each value of C.

df1.drop_duplicates(subset=['C'])

Unnamed: 0,C,D
0,1,10
1,2,20
2,3,30
3,4,40


In [44]:
# De-duplicate on column 'D' alone, keeping the first row for each value of D.
df1.drop_duplicates(subset=['D'])

Unnamed: 0,C,D
0,1,10
1,2,20
2,3,30
3,4,40
7,4,50


### replace

In [45]:
df

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10.0,30.0,50.0,70.0,90.0


In [46]:
# replace() can also target NaN: this gives the same result as df.fillna(100).
df.replace(np.nan,100)

Unnamed: 0,0,1,2,3,4
0,10.0,20.0,30.0,100.0,100.0
1,100.0,100.0,100.0,40.0,50.0
2,100.0,100.0,100.0,100.0,100.0
3,10.0,30.0,50.0,70.0,90.0


In [47]:
# Replace every occurrence of the value 10 with 10000; NaNs are left as they are.
df.replace(10,10000)

Unnamed: 0,0,1,2,3,4
0,10000.0,20.0,30.0,,
1,,,,40.0,50.0
2,,,,,
3,10000.0,30.0,50.0,70.0,90.0


## Detecting and Filtering Outliers

In [48]:
np.random.seed(100)

In [49]:
# Seed in the same cell that draws the numbers, so that re-running this cell on its own
# always regenerates exactly the frame shown below (otherwise the result depends on how
# many random draws happened since the seed was last set).
np.random.seed(100)
# 10x4 frame of standard-normal samples used for the outlier examples.
df2 = pd.DataFrame(np.random.randn(10, 4))
df2

Unnamed: 0,0,1,2,3
0,-1.749765,0.34268,1.153036,-0.252436
1,0.981321,0.514219,0.22118,-1.070043
2,-0.189496,0.255001,-0.458027,0.435163
3,-0.583595,0.816847,0.672721,-0.104411
4,-0.53128,1.029733,-0.438136,-1.118318
5,1.618982,1.541605,-0.251879,-0.842436
6,0.184519,0.937082,0.731,1.361556
7,-0.326238,0.055676,0.2224,-1.443217
8,-0.756352,0.816454,0.750445,-0.455947
9,1.189622,-1.690617,-1.356399,-1.232435


In [54]:
# Keep every row that has at least one value whose magnitude exceeds 1.5.
df2.loc[df2.abs().gt(1.5).any(axis=1)]

Unnamed: 0,0,1,2,3
0,-1.749765,0.34268,1.153036,-0.252436
5,1.618982,1.541605,-0.251879,-0.842436
9,1.189622,-1.690617,-1.356399,-1.232435


## Dummy Variables: One-Hot Encoding

In [55]:
# Single categorical column with four distinct labels, two of them repeated.
df3 = pd.DataFrame(data={"Name": list("abcdab")})
df3

Unnamed: 0,Name
0,a
1,b
2,c
3,d
4,a
5,b


In [56]:
# One indicator column per distinct value of 'Name' (named Name_<value>);
# in the output shown here the indicators are booleans.
pd.get_dummies(df3)

Unnamed: 0,Name_a,Name_b,Name_c,Name_d
0,True,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,True,False,False,False
5,False,True,False,False


In [57]:
# dtype=int encodes the indicators as 0/1 integers instead of booleans.
pd.get_dummies(df3,dtype=int)

Unnamed: 0,Name_a,Name_b,Name_c,Name_d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
