# Dataset

In [11]:
import pandas as pd

# Small student table with deliberately missing entries (None) in every
# column, used as the shared example for the cells that follow.
df = pd.DataFrame(
    data={
        'Name': [None, 'Alex', 'Bob', None, 'Charlie', 'Dave', 'Eve'],
        'Age': [21, None, 20, None, 30, 24, 25],
        'Gender': ['Female', 'Male', 'Female', None, None, 'Male', 'Female'],
        'Grade': [80, 90, 85, None, None, 95, None],
        'Final Exam Score': [95, 88, 90, None, None, 91, 80],
    }
)

df

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,,21.0,Female,80.0,95.0
1,Alex,,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
3,,,,,
4,Charlie,30.0,,,
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,,80.0


## `fillna():`

- Replace all NaN elements with 0s.

In [12]:
# Every missing cell, in every column, is replaced by the same scalar.
df.fillna(value=0)

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,0,21.0,Female,80.0,95.0
1,Alex,0.0,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
3,0,0.0,0,0.0,0.0
4,Charlie,30.0,0,0.0,0.0
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,0.0,80.0


- Pass a dictionary (column name → fill value) to use a different fill value for each column

In [13]:
# Keys are column names; each value is the replacement used for that column.
fill_values = {
    'Name': 'XYZ',
    'Age': 18,
    'Gender': 'Unknown',
    'Grade': 50,
    'Final Exam Score': 45,
}
df.fillna(value=fill_values)

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,XYZ,21.0,Female,80.0,95.0
1,Alex,18.0,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
3,XYZ,18.0,Unknown,50.0,45.0
4,Charlie,30.0,Unknown,50.0,45.0
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,50.0,80.0


- `Forward fill:`
    - Propagate non-null values forward
    - Used to fill missing values with the value from the previous row

In [14]:
# Notice that the 1st row of the 'Name' column is still missing: forward fill
# copies the previous row's value, and the first row has no previous row.
# `df.ffill()` replaces `df.fillna(method='ffill')`; the `method` argument of
# `fillna` is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,,21.0,Female,80.0,95.0
1,Alex,21.0,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
3,Bob,20.0,Female,85.0,90.0
4,Charlie,30.0,Female,85.0,90.0
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,95.0,80.0


- `Backward fill:`
    - Propagate non-null values backward
    - Used to fill missing values with the value from the next row

In [15]:
# Notice that the last row of the 'Grade' column is still missing: backward
# fill copies the next row's value, and the last row has no next row.
# `df.bfill()` replaces `df.fillna(method='bfill')`; the `method` argument of
# `fillna` is deprecated in recent pandas releases.
df.bfill()

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,Alex,21.0,Female,80.0,95.0
1,Alex,20.0,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
3,Charlie,30.0,Male,95.0,91.0
4,Charlie,30.0,Male,95.0,91.0
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,,80.0


- axis argument:
    - By default, axis is 0 or `rows`, i.e., missing values are filled from the previous or next `row`
    - Setting axis to 1 or `columns` fills missing values from the previous or next `column` instead

In [16]:
# Backward fill along each row: a missing cell takes the value of the next
# column to its right. Cells with no valid value to their right stay missing.
# `df.bfill(...)` replaces `df.fillna(method='bfill', ...)`; the `method`
# argument of `fillna` is deprecated in recent pandas releases.
df.bfill(axis='columns')

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,21.0,21.0,Female,80.0,95.0
1,Alex,Male,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
3,,,,,
4,Charlie,30.0,,,
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,80.0,80.0


## `interpolate():`

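- `Linear:`
    - This method uses linear interpolation to fill missing values based on the values of the surrounding rows
    - It is the default method
    - Only numeric columns are interpolated; `Name` and `Gender` are left unchanged. A missing value after the last valid one (e.g. the last `Grade`) has no following value to interpolate towards, so it is filled with the last valid value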

In [17]:
# Interpolate the numeric columns only. Calling interpolate() on a frame that
# also holds object (string) columns such as 'Name' and 'Gender' is deprecated
# in recent pandas, and those columns cannot be interpolated anyway.
# Work on a copy so the original `df` stays intact for the following cells.
numeric_cols = df.select_dtypes(include='number').columns
df_interpolated = df.copy()
df_interpolated[numeric_cols] = df[numeric_cols].interpolate(method='linear')
df_interpolated

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,,21.0,Female,80.0,95.0
1,Alex,20.5,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
3,,25.0,,88.333333,90.333333
4,Charlie,30.0,,91.666667,90.666667
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,95.0,80.0


## `dropna():`

- Drop rows which have `at least 1` NA value

In [18]:
# how='any' is the default, so this is equivalent to a bare df.dropna():
# a row is removed as soon as one of its cells is missing.
df.dropna(how='any')

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
2,Bob,20.0,Female,85.0,90.0
5,Dave,24.0,Male,95.0,91.0


- Drop rows only if `all` columns have NA value

In [19]:
# A row is removed only when every one of its cells is missing.
df.dropna(axis='index', how='all')

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,,21.0,Female,80.0,95.0
1,Alex,,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
4,Charlie,30.0,,,
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,,80.0


- Drop rows with `fewer than 3 non-NA` values

In [20]:
# thresh is the minimum number of non-missing cells a row needs to be kept.
df.dropna(axis='index', thresh=3)

Unnamed: 0,Name,Age,Gender,Grade,Final Exam Score
0,,21.0,Female,80.0,95.0
1,Alex,,Male,90.0,88.0
2,Bob,20.0,Female,85.0,90.0
5,Dave,24.0,Male,95.0,91.0
6,Eve,25.0,Female,,80.0
