In [1]:
import numpy as np
import pandas as pd

# Data Cleaning & Preparation

In [4]:
# Sample DataFrame used throughout the notebook.
# Some cells are deliberately left missing (np.nan / None) so that the
# null-handling methods below have something to work on.
df = pd.DataFrame(
    data={
        "Name": ["Onkar", "Amit", "Sara", "Neha"],
        "Age": [21, np.nan, 23, np.nan],              # two missing ages
        "City": ["Pune", "Mumbai", None, "Nagpur"],   # one missing city
        "Score": [88, 92, None, 85],                  # one missing score
    },
)

df

Unnamed: 0,Name,Age,City,Score
0,Onkar,21.0,Pune,88.0
1,Amit,,Mumbai,92.0
2,Sara,23.0,,
3,Neha,,Nagpur,85.0


## 1. Detect missing values

Points:  
1. `.isnull()` -> Returns a True/False DataFrame: True where the value is null, False where the cell holds data.  
2. `.isnull().sum()` -> Give Count of null values in each column.
3. `.isnull().sum().sum()` -> Gives total null value count in entire DataFrame

**Note:** `.notnull()` is the opposite of `.isnull()`.  

In [6]:
df.isnull()
# Returns a DataFrame of the same shape filled with True/False.
# True  - the value in that cell is null (missing).
# False - the value in that cell is not null.

Unnamed: 0,Name,Age,City,Score
0,False,False,False,False
1,False,True,False,False
2,False,False,True,True
3,False,True,False,False


In [9]:
df.isnull().sum()
# Count of null values in each column (each True counts as 1 when summed)

Name     0
Age      2
City     1
Score    1
dtype: int64

In [10]:
# Total number of null values in the entire DataFrame:
# the first .sum() counts nulls per column, the second adds those counts together.
df.isnull().sum().sum()

np.int64(4)

In [16]:
# .notnull() is the inverse of .isnull(): True marks the cells that hold data.
# Shown together: the boolean mask, the non-null count per column,
# and the non-null count for the whole DataFrame.
has_value = df.notnull()
has_value, has_value.sum(), has_value.sum().sum()

(   Name    Age   City  Score
 0  True   True   True   True
 1  True  False   True   True
 2  True   True  False  False
 3  True  False   True   True,
 Name     4
 Age      2
 City     3
 Score    3
 dtype: int64,
 np.int64(12))

## 2. Remove missing values -> `.dropna()`

### Points:
1. `.dropna()` -> Drops rows that contain any null value
2. `.dropna(how='all')` -> Drops only those rows that are entirely empty
3. `.dropna(subset=[...])` -> Drops rows that have a null value in any of the specified columns

#### Note: 
`.dropna()` returns a new DataFrame; the original is left unchanged.  
To modify the DataFrame in place instead, use `.dropna(inplace=True)`

In [19]:
# Drops every row that contains at least one null value.
# The result is a new DataFrame; df itself is not modified.
df.dropna()

Unnamed: 0,Name,Age,City,Score
0,Onkar,21.0,Pune,88.0


In [20]:
# Drops only the rows in which every value is null.
# No row of df is entirely empty, so all four rows are kept.
df.dropna(how='all')

Unnamed: 0,Name,Age,City,Score
0,Onkar,21.0,Pune,88.0
1,Amit,,Mumbai,92.0
2,Sara,23.0,,
3,Neha,,Nagpur,85.0


In [21]:
# Drops the rows that have a null value in the listed column(s), here "Age".
# Nulls in other columns (e.g. City/Score for Sara) do not cause a row to be dropped.
df.dropna(subset=["Age"])

Unnamed: 0,Name,Age,City,Score
0,Onkar,21.0,Pune,88.0
2,Sara,23.0,,


## 3. Fill missing values -> `.fillna()`

### Points:
1. `.fillna(0)` -> fill with a constant value
2. `.fillna("XXXXXX")` -> fill with a string  
Use a fill value that matches the column's type (e.g. a number for a numeric column).  
*Real-world examples:*
    1. `df[...].fillna(df[...].mean())` - fill with the mean
    2. `df[...].fillna(df[...].median())` - fill with the median
3. `.ffill()` -> forward fill (use the previous value)
4. `.bfill()` -> backward fill (use the next value)

In [35]:
# Fill every null with a constant value.
# Note: the same constant is applied to all columns, so the string column
# City also receives 0 here.
df.fillna(0)

Unnamed: 0,Name,Age,City,Score
0,Onkar,21.0,Pune,88.0
1,Amit,0.0,Mumbai,92.0
2,Sara,23.0,0,0.0
3,Neha,0.0,Nagpur,85.0


In [26]:
# Replace the nulls of a numeric column with that column's mean.
# The original column is displayed next to the filled one for comparison.
age = df["Age"]
age, age.fillna(age.mean())

(0    21.0
 1     NaN
 2    23.0
 3     NaN
 Name: Age, dtype: float64,
 0    21.0
 1    22.0
 2    23.0
 3    22.0
 Name: Age, dtype: float64)

In [28]:
# Replace the nulls of a numeric column with that column's median.
# The original column is displayed next to the filled one for comparison.
age = df["Age"]
age, age.fillna(age.median())

(0    21.0
 1     NaN
 2    23.0
 3     NaN
 Name: Age, dtype: float64,
 0    21.0
 1    22.0
 2    23.0
 3    22.0
 Name: Age, dtype: float64)

In [30]:
# Replace the nulls of a string/object column with a placeholder string.
# The original column is displayed next to the filled one for comparison.
city = df["City"]
city, city.fillna("XXXXXXX")

(0      Pune
 1    Mumbai
 2      None
 3    Nagpur
 Name: City, dtype: object,
 0       Pune
 1     Mumbai
 2    XXXXXXX
 3     Nagpur
 Name: City, dtype: object)

In [33]:
# Forward fill: each null takes the previous non-null value in its column.
df.ffill()

Unnamed: 0,Name,Age,City,Score
0,Onkar,21.0,Pune,88.0
1,Amit,21.0,Mumbai,92.0
2,Sara,23.0,Mumbai,92.0
3,Neha,23.0,Nagpur,85.0


In [39]:
# Backward fill: each null takes the next non-null value in its column.
# A null in the last row has no next value, so it stays null.
df.bfill()

Unnamed: 0,Name,Age,City,Score
0,Onkar,21.0,Pune,88.0
1,Amit,23.0,Mumbai,92.0
2,Sara,23.0,Nagpur,85.0
3,Neha,,Nagpur,85.0


In [40]:
# Fill in place: inplace=True modifies df itself instead of returning a new DataFrame.
# From this point on df no longer contains its original nulls, so re-running
# earlier cells without recreating df will give different results.
df.ffill(inplace=True)
df

Unnamed: 0,Name,Age,City,Score
0,Onkar,21.0,Pune,88.0
1,Amit,21.0,Mumbai,92.0
2,Sara,23.0,Mumbai,92.0
3,Neha,23.0,Nagpur,85.0


In [42]:
df.isnull().sum().sum()
# 0 because df was forward-filled in place, so no null values remain

np.int64(0)