# Data Cleaning 

In [1]:
import numpy as np
import pandas as pd

## Falsy, NaN and Infinite Values in NumPy

In [2]:
# One example of each built-in value that evaluates to False in a boolean context.
falsy_values = (
    0,      # integer zero
    False,  # boolean False
    None,   # the null object
    '',     # empty string
    [],     # empty list
    {},     # empty dict
)

In [3]:
any(falsy_values)

False

In [4]:
# Float array with two missing entries; the reductions below show how NaN
# propagates through sum/min/max.
a = np.array(
    [1, 2, 3, np.nan, np.nan, 4]
)

### Tests on NaN 

In [5]:
a.sum()

nan

In [8]:
a.min()

nan

In [9]:
a.max()

nan

In [11]:
# Requesting a float dtype makes NumPy store None as NaN instead of
# falling back to an object array.
a = np.array(
    [1, 2, 3, np.nan, None, 4],
    dtype=float,
)

In [12]:
a

array([ 1.,  2.,  3., nan, nan,  4.])

In [13]:
a.sum()

nan

In [14]:
a.mean()

nan

### Tests on Infinite

In [15]:
np.inf

inf

In [16]:
3 + np.inf

inf

In [17]:
np.inf / 3

inf

In [18]:
np.inf / np.inf

nan

### Combined Tests

In [20]:
np.isnan(np.nan)

True

In [21]:
np.isinf(np.inf)

True

In [22]:
np.isinf(np.nan)

False

In [23]:
np.isfinite(np.nan), np.isfinite(np.inf)

(False, False)

### Excluding NaN and Infinite Values

In [24]:
np.isnan(np.array([1, 2, 3, np.nan, np.inf, 4]))

array([False, False, False,  True, False, False])

In [25]:
a[~np.isnan(a)]

array([1., 2., 3., 4.])

In [26]:
a[np.isfinite(a)]

array([1., 2., 3., 4.])

In [27]:
a[np.isfinite(a)].sum()


10.0

In [37]:
a[np.isfinite(a)].min()

1.0

In [38]:
a[~np.isnan(a)].mean()

2.5

## Utility Functions in pandas

### NaN (Null) Values

In [29]:
pd.isnull(np.nan)

True

In [30]:
pd.isnull(None)

True

In [31]:
pd.isna(np.nan)

True

In [33]:
pd.isna(None)

True

### Opposite Queries 

In [41]:
pd.notnull(np.nan)

False

In [34]:
pd.notnull(None)

False

In [40]:
pd.notna(np.nan)

False

In [42]:
pd.notna(None)

False

### Tests on Series

In [44]:
pd.isnull(pd.Series([1, 10, None, np.nan]))

0    False
1    False
2     True
3     True
dtype: bool

In [45]:
pd.notnull(pd.Series([1, 10, None, np.nan]))

0     True
1     True
2    False
3    False
dtype: bool

In [176]:
pd.Series([1, np.nan]).isnull().all() 
# .all() is True only when every element of the boolean mask is True, i.e. when every value is null

False

In [177]:
pd.Series([1, np.nan]).isnull().any()
# .any() is True when at least one element of the boolean mask is True, i.e. when at least one value is null

True

### Tests on DataFrame

In [49]:
# Small labelled frame with missing values (np.nan and None) in Column X and
# Column Y; Column Z has no missing values.
pd.DataFrame({
    'Column X': [4, 6, np.nan],
    'Column Y': [np.nan, None, 2],
    'Column Z': [3, 9, 7]
},
   index= ['Row A', 'Row B', 'Row C']                   
)

Unnamed: 0,Column X,Column Y,Column Z
Row A,4.0,,3
Row B,6.0,,9
Row C,,2.0,7


In [50]:
# pd.isnull on a DataFrame returns a boolean frame of the same shape:
# True where the cell is missing (np.nan or None), False otherwise.
pd.isnull(pd.DataFrame({
    'Column X': [4, 6, np.nan],
    'Column Y': [np.nan, None, 2],
    'Column Z': [3, 9, 7]
},
   index= ['Row A', 'Row B', 'Row C']                   
))

Unnamed: 0,Column X,Column Y,Column Z
Row A,False,True,False
Row B,False,True,False
Row C,True,False,False


### Queries on NaN (Null) Values

In [64]:
pd.Series([1, 2, np.nan]).sum()

3.0

In [53]:
pd.Series([1, 2, np.nan]).mean()

1.5

In [54]:
pd.Series([1, 2, np.nan]).count()

2

### Tests on Regular and Opposite Queries

In [69]:
# Three missing entries (None and np.nan are both treated as null) followed
# by two real values.
series = pd.Series(
    data=[None, np.nan, None, 5, 3],
)

In [70]:
pd.isnull(series)

0     True
1     True
2     True
3    False
4    False
dtype: bool

In [71]:
pd.notnull(series)

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [72]:
pd.isnull(series).sum()

3

In [73]:
pd.notnull(series).sum()

2

In [74]:
series.sum()

8.0

In [75]:
series[pd.notnull(series)]

3    5.0
4    3.0
dtype: float64

In [76]:
series[pd.isnull(series)]

0   NaN
1   NaN
2   NaN
dtype: float64

In [79]:
series.isnull()

0     True
1     True
2     True
3    False
4    False
dtype: bool

In [80]:
series.notnull()

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [82]:
series[series.isnull()]

0   NaN
1   NaN
2   NaN
dtype: float64

In [81]:
series[series.notnull()]

3    5.0
4    3.0
dtype: float64

### Dropping NaN (Null) Values from Series and DataFrame

In [84]:
series

0    NaN
1    NaN
2    NaN
3    5.0
4    3.0
dtype: float64

In [86]:
series.dropna()

3    5.0
4    3.0
dtype: float64

In [93]:
# Sample frame for the dropna/fillna demonstrations below: only 'Row Y' is
# fully populated, and every column contains at least one missing value.
dataFrame = pd.DataFrame(
    data={
        'Column K': [None, 6, 41, np.nan],
        'Column L': [np.nan, None, 2, 63],
        'Column M': [86, 12, 9, None],
    },
    index=['Row W', 'Row X', 'Row Y', 'Row Z'],
)

In [94]:
dataFrame

Unnamed: 0,Column K,Column L,Column M
Row W,,,86.0
Row X,6.0,,12.0
Row Y,41.0,2.0,9.0
Row Z,,63.0,


In [96]:
dataFrame.shape

(4, 3)

In [97]:
dataFrame.isnull()

Unnamed: 0,Column K,Column L,Column M
Row W,True,True,False
Row X,False,True,False
Row Y,False,False,False
Row Z,True,False,True


In [100]:
dataFrame.isnull().count()

Column K    4
Column L    4
Column M    4
dtype: int64

In [101]:
dataFrame.isnull().sum()

Column K    2
Column L    2
Column M    1
dtype: int64

In [105]:
dataFrame.dropna() # With the defaults, dropna drops every row that contains at least one NaN, so only fully populated rows are returned.

Unnamed: 0,Column K,Column L,Column M
Row Y,41.0,2.0,9.0


In [107]:
dataFrame.dropna(axis=1) # With axis=1, dropna drops every column that contains at least one NaN.
# The result has no columns because every column in this frame contains a NaN.

Row W
Row X
Row Y
Row Z


In [113]:
dataFrame.dropna(how = 'all') # how='all' drops a row only when every value in it is NaN; no row here is entirely NaN, so nothing is dropped.

Unnamed: 0,Column K,Column L,Column M
Row W,,,86.0
Row X,6.0,,12.0
Row Y,41.0,2.0,9.0
Row Z,,63.0,


In [112]:
dataFrame.dropna(how = 'any') # how='any' drops a row if it contains any NaN; this is the default behaviour that we saw earlier.

Unnamed: 0,Column K,Column L,Column M
Row Y,41.0,2.0,9.0


In [131]:
dataFrame.dropna(thresh = 0)
# thresh sets the minimum number of non-null values a row (or a column, with axis=1) must have in order to be kept.
# With thresh=0 every row meets the requirement, so nothing is dropped.

Unnamed: 0,Column K,Column L,Column M
Row W,,,86.0
Row X,6.0,,12.0
Row Y,41.0,2.0,9.0
Row Z,,63.0,


In [130]:
dataFrame.dropna(thresh = 2) # Keeps only the rows that have at least 2 non-null values.

Unnamed: 0,Column K,Column L,Column M
Row X,6.0,,12.0
Row Y,41.0,2.0,9.0


In [138]:
dataFrame.dropna(thresh=2, axis='columns') # Same rule applied to columns: keeps the columns with at least 2 non-null values (all of them here).

Unnamed: 0,Column K,Column L,Column M
Row W,,,86.0
Row X,6.0,,12.0
Row Y,41.0,2.0,9.0
Row Z,,63.0,


### Filling NaN (Null) Values

In [139]:
series

0    NaN
1    NaN
2    NaN
3    5.0
4    3.0
dtype: float64

In [142]:
series.fillna(0) # Replaces every null value with 0

0    0.0
1    0.0
2    0.0
3    5.0
4    3.0
dtype: float64

In [150]:
series.ffill() # Forward fill: each null takes the last non-null value above it. The leading nulls stay NaN because nothing precedes them.

0    NaN
1    NaN
2    NaN
3    5.0
4    3.0
dtype: float64

In [151]:
series.bfill() # Backward fill: each null takes the next non-null value below it.

0    5.0
1    5.0
2    5.0
3    5.0
4    3.0
dtype: float64

In [154]:
pd.Series([np.nan, 3, np.nan, 9]).ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [155]:
pd.Series([np.nan, 3, np.nan, 9]).bfill()

0    3.0
1    3.0
2    9.0
3    9.0
dtype: float64

### Filling NaN (Null) Values in DataFrame

In [156]:
dataFrame

Unnamed: 0,Column K,Column L,Column M
Row W,,,86.0
Row X,6.0,,12.0
Row Y,41.0,2.0,9.0
Row Z,,63.0,


In [158]:
dataFrame.fillna({'Column K': 1, 'Column L': 2, 'Column M': 3})
# The dict maps each column name to the value used to fill the nulls in that column.

Unnamed: 0,Column K,Column L,Column M
Row W,1.0,2.0,86.0
Row X,6.0,2.0,12.0
Row Y,41.0,2.0,9.0
Row Z,1.0,63.0,3.0


In [161]:
dataFrame.ffill(axis= 0)
# Forward fill along axis 0: each null takes the last non-null value from the rows above it in the same column.

Unnamed: 0,Column K,Column L,Column M
Row W,,,86.0
Row X,6.0,,12.0
Row Y,41.0,2.0,9.0
Row Z,41.0,63.0,9.0


In [163]:
dataFrame.bfill(axis= 0)
# Backward fill along axis 0: each null takes the next non-null value from the rows below it in the same column.

Unnamed: 0,Column K,Column L,Column M
Row W,6.0,2.0,86.0
Row X,6.0,2.0,12.0
Row Y,41.0,2.0,9.0
Row Z,,63.0,


### Counting and Checking the Null Values

In [165]:
series.dropna().count()
# Counts the non-null values that remain after the nulls are dropped (not the number of values that were dropped).

2

In [169]:
len(series)

5

In [178]:
series.count() # Excludes NaN values

2