# Data Cleaning

### Handling Missing Values

In [3]:
import pandas as pd
import numpy as np

In [7]:
# np.nan is NumPy's "Not a Number" value; pandas treats it as a missing (null) value.
q = np.nan

In [5]:
# Scalar check: returns a single bool, True here because q is NaN.
pd.isnull(q) 

True

In [6]:
# pd.isna() performs the same check as pd.isnull(); the two names are interchangeable.
pd.isna(q)

True

***Note: `pd.isnull(q)` checks whether `q` is null, while `pd.notnull(q)` checks the opposite, i.e. whether `q` is not null.***

In [8]:
# On a Series the check is element-wise: the result is a boolean Series with the same index.
pd.isnull(pd.Series([1, 2, 3, np.nan, 4, np.nan]))

0    False
1    False
2    False
3     True
4    False
5     True
dtype: bool

In [10]:
# On a DataFrame the check is element-wise as well: the result is a boolean
# frame with the same rows and columns, True wherever the value is NaN.
pd.isnull(pd.DataFrame({
    'Column 1': [1, 2, 3, np.nan, 4, np.nan], 
    'Column 2': [np.nan, 2, 3, np.nan, 4, 1],
    'Column 3': [np.nan, np.nan, 3, np.nan, 4, np.nan],
}))

Unnamed: 0,Column 1,Column 2,Column 3
0,False,True,True
1,False,False,True
2,False,False,False
3,True,True,True
4,False,False,False
5,True,False,True


In [11]:
# count() reports, per column, how many entries are NOT null
# (so NaN cells are excluded from the tally).
pd.DataFrame({
    'Column 1': [1, 2, 3, np.nan, 4, np.nan], 
    'Column 2': [np.nan, 2, 3, np.nan, 4, 1],
    'Column 3': [np.nan, np.nan, 3, np.nan, 4, np.nan],
}).count()

Column 1    4
Column 2    4
Column 3    2
dtype: int64

In [13]:
# sum() skips NaN by default, so each column total covers only its non-null
# entries instead of turning into NaN.
pd.DataFrame({
    'Column 1': [1, 2, 3, np.nan, 4, np.nan], 
    'Column 2': [np.nan, 2, 3, np.nan, 4, 1],
    'Column 3': [np.nan, np.nan, 3, np.nan, 4, np.nan],
}).sum()

Column 1    10.0
Column 2    10.0
Column 3     7.0
dtype: float64

In [14]:
# Sample Series reused by the following cells; index positions 3 and 5 are missing.
s = pd.Series([1, 2, 3, np.nan, 4, np.nan])

In [15]:
# Method form of pd.isnull(s): element-wise boolean mask, True where the value is NaN.
s.isnull()

0    False
1    False
2    False
3     True
4    False
5     True
dtype: bool

In [16]:
# Careful: the mask itself contains only True/False (no NaN), so count() counts
# every entry. This is the length of s, NOT the number of non-null values.
pd.notnull(s).count()

6

In [17]:
# True counts as 1 and False as 0 when summed, so this is the number of non-null values.
pd.notnull(s).sum()

4

In [18]:
# Boolean-mask indexing: keep only the entries where the mask is True (the non-null values).
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
4    4.0
dtype: float64

![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

### Dropping null values in Series

In [20]:
# dropna() returns a new Series without the NaN entries; the surviving values
# keep their original index labels.
s.dropna()

0    1.0
1    2.0
2    3.0
4    4.0
dtype: float64

![purple-divider](https://user-images.githubusercontent.com/7065401/52071927-c1cd7100-2562-11e9-908a-dde91ba14e59.png)

### Dropping null values in DataFrame

In [21]:
# Sample frame reused by the following cells: every column has missing values,
# and only rows 2 and 4 are complete in all three columns.
df = pd.DataFrame({
    'Column 1': [1, 2, 3, np.nan, 4, np.nan], 
    'Column 2': [np.nan, 2, 3, np.nan, 4, 1],
    'Column 3': [np.nan, np.nan, 3, np.nan, 4, np.nan],
})

In [22]:
# Display the frame to see where the missing entries fall.
df

Unnamed: 0,Column 1,Column 2,Column 3
0,1.0,,
1,2.0,2.0,
2,3.0,3.0,3.0
3,,,
4,4.0,4.0,4.0
5,,1.0,


In [23]:
# Element-wise null mask with the same rows and columns as df.
df.isnull()

Unnamed: 0,Column 1,Column 2,Column 3
0,False,True,True
1,False,False,True
2,False,False,False
3,True,True,True
4,False,False,False
5,True,False,True


In [24]:
# Careful: the mask holds only True/False, so count() tallies every row in each
# column (the number of rows), not the number of null values.
df.isnull().count()

Column 1    6
Column 2    6
Column 3    6
dtype: int64

In [25]:
# Summing the mask (True == 1) gives the number of null values per column.
df.isnull().sum()

Column 1    2
Column 2    2
Column 3    4
dtype: int64

***We can also call `df.dropna()` on a DataFrame, just as on a Series. By default, every row that contains at least one NaN is dropped. An example is shown below:***

In [26]:
# Default behaviour: drop every row that has at least one NaN. The result is a
# new frame; the surviving rows keep their original index labels.
df.dropna()

Unnamed: 0,Column 1,Column 2,Column 3
2,3.0,3.0,3.0
4,4.0,4.0,4.0


In [30]:
# Add a fully populated column so the frame contains one column without any NaN.
# The Series carries its own name, which doubles as the column label.
new_column = pd.Series(data=[5, 6, 7, 8, 10, 20], name='Column 4')
df[new_column.name] = new_column
df

Unnamed: 0,Column 1,Column 2,Column 3,Column 4
0,1.0,,,5
1,2.0,2.0,,6
2,3.0,3.0,3.0,7
3,,,,8
4,4.0,4.0,4.0,10
5,,1.0,,20


***By passing `axis="columns"` to `dropna()`, we drop columns instead of rows: every column that contains at least one NaN value is removed.***

In [31]:
# axis="columns" makes dropna() remove columns (rather than rows) that contain any NaN.
df.dropna(axis="columns") 

Unnamed: 0,Column 4
0,5
1,6
2,7
3,8
4,10
5,20


# Cleaning the NOT NULL values

In [33]:
# Sample frame with no NaN at all, but with entries that are not valid data:
# 'X' and '?' in Sex, and 1999 in Age (presumably an impossible age; confirm intent).
df = pd.DataFrame(
    data={
        'Sex': ['M', 'F', 'F', 'X', '?'],
        'Age': [31, 21, 24, 59, 1999],
    },
)
df

Unnamed: 0,Sex,Age
0,M,31
1,F,21
2,F,24
3,X,59
4,?,1999


In this example, the values in the Sex column should be M for male and F for female. But the column also contains X and ?,
which are invalid, so we must fix them.