In [1]:
import pandas as pd

# Load the raw dataset from its CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="raw_data.csv")

In [2]:
# Cleaning Data : Handling Missing Values

# isna() and its alias isnull() both build a DataFrame of booleans,
# one flag per cell of df:
#   True  -> the cell is missing (NaN)
#   False -> the cell holds a value
df.isna()

Unnamed: 0,id,name,age,country,gender,income
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,True,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,False
6,False,True,False,False,False,False
7,False,False,False,True,False,False
8,False,False,False,False,False,False
9,False,False,True,False,False,False


In [3]:
# Cleaning Data : Handling Missing Values

# Missing values per column: sum() adds up the boolean mask column by
# column, with each True counting as 1
# Shows at a glance which columns contain gaps
df.isna().sum()

id         0
name       1
age        3
country    1
gender     1
income     1
dtype: int64

In [4]:
# Cleaning Data : Handling Missing Values

# Removing missing values
# dropna() returns a NEW DataFrame by default; `df` itself is not modified.

# dropna() : Drops rows that contain at least one missing (NaN) value
# axis=0 is default (rows)
# The result is bound to a name because a notebook cell only displays its
# LAST expression: a bare `df.dropna()` here would be computed and discarded.
rows_without_nan = df.dropna()

# dropna(axis=1)
# Drops columns that contain at least one missing (NaN) value
cols_without_nan = df.dropna(axis=1)

# Last expression of the cell -> this is the frame that gets displayed.
# Inspect `rows_without_nan` in its own cell to see the row-wise result.
cols_without_nan

Unnamed: 0,id
0,1
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [5]:
# Cleaning Data : Handling Missing Values

# Work on an independent copy so that `df` keeps the raw values
cleaned_data = df.copy()

# Imputing the 'age' column

# Average age; Series.mean() skips NaN entries by default
age_mean = cleaned_data["age"].mean()

# Swap every NaN in 'age' for that average, then show the column
cleaned_data = cleaned_data.assign(
    age=lambda frame: frame["age"].fillna(age_mean)
)
cleaned_data["age"]

0     29.00
1     29.00
2     32.75
3     32.75
4     34.00
5     27.00
6     45.00
7     38.00
8     29.00
9     32.75
10    31.00
Name: age, dtype: float64

In [6]:
# Cleaning Data : Handling Missing Values

# Average income over the rows where it is known (mean() skips NaN)
salary_mean = cleaned_data["income"].mean()

# A single fillna call with a per-column mapping:
#   'income' -> NaN becomes the mean income
#   'name'   -> NaN becomes the placeholder string "Name"
# Columns that are not listed in the mapping are left as they are.
cleaned_data = cleaned_data.fillna({"income": salary_mean, "name": "Name"})

cleaned_data

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,32.75,Canada,Female,62000.0
3,3,Alex,32.75,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,57600.0
5,5,Li Wei,27.0,China,Male,51000.0
6,6,Name,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,32.75,Mexico,Male,45000.0


In [7]:
# Label-based slice of 'country': rows 4 through 8, both ends included
country_data = df["country"].loc[4:8]
print(country_data)

# Forward Fill (ffill) :
# Each NaN is replaced by the closest non-NaN value above it,
# i.e. the last valid entry is carried forward
country_data = country_data.ffill()
country_data

4    Spain
5    China
6    India
7      NaN
8      USA
Name: country, dtype: object


4    Spain
5    China
6    India
7    India
8      USA
Name: country, dtype: object

In [8]:
# Label-based slice of 'income': rows 4 through 8, both ends included
income_data = df["income"].loc[4:8]
print(income_data)

# Backward Fill (bfill) :
# Each NaN is replaced by the closest non-NaN value below it,
# i.e. the next valid entry is pulled backward
income_data = income_data.bfill()
income_data

4        NaN
5    51000.0
6    73000.0
7    68000.0
8    62000.0
Name: income, dtype: float64


4    51000.0
5    51000.0
6    73000.0
7    68000.0
8    62000.0
Name: income, dtype: float64