# 🧩 Notebook 02: Data Cleaning and Preprocessing with pandas

In [1]:
# Notebook import setup
import sys
from pathlib import Path

# Assumes the kernel's working directory sits one level below the project root,
# so the parent directory is the one that contains the `scripts` package.
PROJECT_ROOT = Path.cwd().parent
# The membership check keeps re-runs of this cell from appending the same entry again.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from scripts import utils_io

In [2]:
# Standard imports
import pandas as pd
import numpy as np
from pathlib import Path

# Cleaning utilities from scripts folder
from scripts.cleaning_utils import clean_dataframe, detect_outliers_iqr

# Data path: directory of the CSV files, relative to the notebook's working directory
ASSETS_DIR = Path("../assets")

## Superstore Cleaned CSV

### 🧹 Load Superstore Cleaned CSV

In [3]:
# Load cleaned Superstore data from the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
df = utils_io.load_csv(str(ASSETS_DIR / "superstore_cleaned.csv"))
df.head()

Unnamed: 0,order_id,customer_id,customer_name,segment,region,order_date,ship_date,category,sub_category,product_name,sales,quantity,discount,profit
0,ORD-10000,CUST-9476,Mr. Michael Lopez,Home Office,Central,2020-01-01,2020-01-03,Furniture,Bookcases,Bookcases Model 1,1292.63,5,0.3,22.94
1,ORD-10001,CUST-9162,Robert Liu,Home Office,South,2020-01-02,2020-01-04,Furniture,Bookcases,Bookcases Model 2,1947.16,2,0.3,205.52
2,ORD-10002,CUST-3824,Nicole Bowman,Consumer,South,2020-01-03,2020-01-05,Furniture,Bookcases,Bookcases Model 3,1774.42,3,0.2,23.84
3,ORD-10003,CUST-8888,Stephen Flores,Consumer,Central,2020-01-04,2020-01-06,Furniture,Bookcases,Bookcases Model 4,591.01,8,0.0,64.85
4,ORD-10004,CUST-9980,Stephen Rodriguez,Corporate,South,2020-01-05,2020-01-07,Furniture,Bookcases,Bookcases Model 5,1969.55,4,0.2,58.99


### 🔍 Missing Value Analysis

In [4]:
# Per-column count of missing values and their share of all rows (in percent)
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
missing_df = pd.concat(
    [missing, missing_pct],
    axis=1,
    keys=["Missing", "Missing (%)"],
)
# Show only the columns that actually contain missing values
missing_df.loc[missing_df["Missing"] > 0]

Unnamed: 0,Missing,Missing (%)


### 🧼 Handle Missing Values

In [5]:
# Parse both date columns. errors='coerce' turns unparseable values into NaT
# instead of raising, so they are removed below together with missing dates.
date_cols = ['order_date', 'ship_date']
for col in date_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Drop rows lacking either date. The explicit .copy() gives later cells an
# independent frame to assign columns on, rather than a filtered selection of
# the previous `df`.
# Alternative: forward-fill with df.ffill() (fillna(method='ffill') is deprecated).
df = df.dropna(subset=date_cols).copy()

### ✂️ Clean String Columns

In [6]:
# Normalize text columns: trim, lowercase, and collapse runs of whitespace
string_cols = df.select_dtypes(include=['object', 'category']).columns

for col in string_cols:
    normalized = (
        df[col]
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace(r"\s+", " ", regex=True)
    )
    # astype(str) turns missing values into the literal text "nan"; put real
    # missing values back so later null checks still see them.
    df[col] = normalized.where(df[col].notna())

### 📄 Remove Duplicates

In [7]:
# Report how many rows are exact repeats of an earlier row
duplicate_count = df.duplicated().sum()
print("Total duplicate rows:", duplicate_count)

# Keep the first occurrence of each row and discard the repeats
df = df.drop_duplicates(keep="first")

Total duplicate rows: 0


### 🚨 Outlier Detection (IQR Method)

In [8]:
# Count IQR outliers per numeric column with the shared helper that the other
# datasets in this notebook already use, instead of re-deriving the fences inline.
# NOTE(review): assumes detect_outliers_iqr applies the same 1.5 * IQR fences as
# the inline computation it replaces — confirm in scripts/cleaning_utils.py.
numeric_cols = ['sales', 'profit', 'quantity', 'discount']

for col in numeric_cols:
    outliers = detect_outliers_iqr(df, col)
    print(f"{col}: {len(outliers)} outliers")

sales: 0 outliers
profit: 371 outliers
quantity: 0 outliers
discount: 0 outliers


### Save Final Cleaned Superstore Data

In [9]:
# Save the final cleaned dataset into the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
utils_io.save_csv(df, str(ASSETS_DIR / "superstore_final.csv"))

## Weather Data

In [10]:
# Load the weather data from the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
weather_df = utils_io.load_csv(str(ASSETS_DIR / "weather_cleaned.csv"))

In [11]:
# Inspect structure, dtypes and per-column missing counts
weather_df.info()
# sep="\n" starts the counts on their own line; with the default separator the
# first row is printed on the same line as the label and loses its alignment.
print("Missing values:", weather_df.isnull().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           10000 non-null  object
 1   temperature_c  10000 non-null  int64 
 2   humidity       10000 non-null  int64 
 3   condition      10000 non-null  object
dtypes: int64(2), object(2)
memory usage: 312.6+ KB
Missing values: date             0
temperature_c    0
humidity         0
condition        0
dtype: int64


In [12]:
# Clean: run the shared cleaning helper imported from scripts.cleaning_utils.
# NOTE(review): drop_na_cols presumably drops rows with a missing 'date' — confirm in the helper.
# The result rebinds weather_df, so the frame as loaded is not kept after this cell.
weather_df = clean_dataframe(weather_df, drop_na_cols=['date'])

In [13]:
# Outliers: count what detect_outliers_iqr returns for temperature_c.
# The result is only counted; weather_df is not reassigned in this cell.
outliers_temp = detect_outliers_iqr(weather_df, 'temperature_c')
print("Temperature outliers:", len(outliers_temp))

Temperature outliers: 0


In [14]:
# Save the cleaned weather data into the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
utils_io.save_csv(weather_df, str(ASSETS_DIR / "weather_final.csv"))

## Bank Loan Data

In [15]:
# Load the bank loan dataset from the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
loan_df = utils_io.load_csv(str(ASSETS_DIR / "loan_cleaned.csv"))

In [16]:
# Inspect structure, dtypes and per-column missing counts
loan_df.info()
# sep="\n" starts the counts on their own line; with the default separator the
# first row is printed on the same line as the label and loses its alignment.
print("Missing values:", loan_df.isnull().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    10000 non-null  int64 
 1   customer_name  10000 non-null  object
 2   age            10000 non-null  int64 
 3   income         10000 non-null  int64 
 4   loan_amount    10000 non-null  int64 
 5   loan_purpose   10000 non-null  object
 6   approved       10000 non-null  object
dtypes: int64(4), object(3)
memory usage: 547.0+ KB
Missing values: customer_id      0
customer_name    0
age              0
income           0
loan_amount      0
loan_purpose     0
approved         0
dtype: int64


In [17]:
# Clean: run the shared cleaning helper imported from scripts.cleaning_utils.
# NOTE(review): drop_na_cols presumably drops rows missing 'income' or 'loan_amount' — confirm in the helper.
# The result rebinds loan_df, so the frame as loaded is not kept after this cell.
loan_df = clean_dataframe(loan_df, drop_na_cols=['income', 'loan_amount'])

In [18]:
# Outliers: count what detect_outliers_iqr returns for income and for loan_amount.
# The results are only counted; loan_df is not reassigned in this cell.
outliers_income = detect_outliers_iqr(loan_df, 'income')
outliers_loan = detect_outliers_iqr(loan_df, 'loan_amount')
print(f"Outliers — Income: {len(outliers_income)}, Loan: {len(outliers_loan)}")

Outliers — Income: 0, Loan: 0


In [19]:
# Save the cleaned loan data into the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
utils_io.save_csv(loan_df, str(ASSETS_DIR / "loan_final.csv"))

## Multi-Sheet Bank Loans (East)

In [20]:
# Load the East-region loan sheet from the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
loan_east_df = utils_io.load_csv(str(ASSETS_DIR / "loan_cleaned_east.csv"))

In [21]:
# Inspect structure, dtypes and per-column missing counts
loan_east_df.info()
# sep="\n" starts the counts on their own line; with the default separator the
# first row is printed on the same line as the label and loses its alignment.
print("Missing values:", loan_east_df.isnull().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    10000 non-null  int64 
 1   customer_name  10000 non-null  object
 2   age            10000 non-null  int64 
 3   income         10000 non-null  int64 
 4   loan_amount    10000 non-null  int64 
 5   loan_purpose   10000 non-null  object
 6   approved       10000 non-null  object
 7   region         10000 non-null  object
dtypes: int64(4), object(4)
memory usage: 625.1+ KB
Missing values: customer_id      0
customer_name    0
age              0
income           0
loan_amount      0
loan_purpose     0
approved         0
region           0
dtype: int64


In [22]:
# Clean: run the shared cleaning helper imported from scripts.cleaning_utils.
# NOTE(review): drop_na_cols presumably drops rows missing 'income' or 'loan_amount' — confirm in the helper.
# The result rebinds loan_east_df, so the frame as loaded is not kept after this cell.
loan_east_df = clean_dataframe(loan_east_df, drop_na_cols=['income', 'loan_amount'])

In [23]:
# Outliers: count what detect_outliers_iqr returns for income in the East sheet.
# The result is only counted; loan_east_df is not reassigned in this cell.
outliers_income_east = detect_outliers_iqr(loan_east_df, 'income')
print("East Region – Income Outliers:", len(outliers_income_east))

East Region – Income Outliers: 0


In [24]:
# Save the cleaned East-region loans into the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
utils_io.save_csv(loan_east_df, str(ASSETS_DIR / "loan_final_east.csv"))

## Multi-Sheet Bank Loans (West)

In [25]:
# Load the West-region loan sheet from the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
loan_west_df = utils_io.load_csv(str(ASSETS_DIR / "loan_cleaned_west.csv"))

In [26]:
# Inspect structure, dtypes and per-column missing counts
loan_west_df.info()
# sep="\n" starts the counts on their own line; with the default separator the
# first row is printed on the same line as the label and loses its alignment.
print("Missing values:", loan_west_df.isnull().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    10000 non-null  int64 
 1   customer_name  10000 non-null  object
 2   age            10000 non-null  int64 
 3   income         10000 non-null  int64 
 4   loan_amount    10000 non-null  int64 
 5   loan_purpose   10000 non-null  object
 6   approved       10000 non-null  object
 7   region         10000 non-null  object
dtypes: int64(4), object(4)
memory usage: 625.1+ KB
Missing values: customer_id      0
customer_name    0
age              0
income           0
loan_amount      0
loan_purpose     0
approved         0
region           0
dtype: int64


In [27]:
# Clean: run the shared cleaning helper imported from scripts.cleaning_utils.
# NOTE(review): drop_na_cols presumably drops rows missing 'income' or 'loan_amount' — confirm in the helper.
# The result rebinds loan_west_df, so the frame as loaded is not kept after this cell.
loan_west_df = clean_dataframe(loan_west_df, drop_na_cols=['income', 'loan_amount'])

In [28]:
# Outliers: count what detect_outliers_iqr returns for income in the West sheet.
# The result is only counted; loan_west_df is not reassigned in this cell.
outliers_income_west = detect_outliers_iqr(loan_west_df, 'income')
print("West Region – Income Outliers:", len(outliers_income_west))

West Region – Income Outliers: 0


In [29]:
# Save the cleaned West-region loans into the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
utils_io.save_csv(loan_west_df, str(ASSETS_DIR / "loan_final_west.csv"))

## Multi-Sheet Bank Loans (North)

In [30]:
# Load the North-region loan sheet from the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
loan_north_df = utils_io.load_csv(str(ASSETS_DIR / "loan_cleaned_north.csv"))

In [31]:
# Inspect structure, dtypes and per-column missing counts
loan_north_df.info()
# sep="\n" starts the counts on their own line; with the default separator the
# first row is printed on the same line as the label and loses its alignment.
print("Missing values:", loan_north_df.isnull().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    10000 non-null  int64 
 1   customer_name  10000 non-null  object
 2   age            10000 non-null  int64 
 3   income         10000 non-null  int64 
 4   loan_amount    10000 non-null  int64 
 5   loan_purpose   10000 non-null  object
 6   approved       10000 non-null  object
 7   region         10000 non-null  object
dtypes: int64(4), object(4)
memory usage: 625.1+ KB
Missing values: customer_id      0
customer_name    0
age              0
income           0
loan_amount      0
loan_purpose     0
approved         0
region           0
dtype: int64


In [32]:
# Clean: run the shared cleaning helper imported from scripts.cleaning_utils.
# NOTE(review): drop_na_cols presumably drops rows missing 'income' or 'loan_amount' — confirm in the helper.
# The result rebinds loan_north_df, so the frame as loaded is not kept after this cell.
loan_north_df = clean_dataframe(loan_north_df, drop_na_cols=['income', 'loan_amount'])

In [33]:
# Outliers: count what detect_outliers_iqr returns for income in the North sheet.
# The result is only counted; loan_north_df is not reassigned in this cell.
outliers_income_north = detect_outliers_iqr(loan_north_df, 'income')
print("North Region – Income Outliers:", len(outliers_income_north))

North Region – Income Outliers: 0


In [34]:
# Save the cleaned North-region loans into the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
utils_io.save_csv(loan_north_df, str(ASSETS_DIR / "loan_final_north.csv"))

## Multi-Sheet Bank Loans (South)

In [35]:
# Load the South-region loan sheet from the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
loan_south_df = utils_io.load_csv(str(ASSETS_DIR / "loan_cleaned_south.csv"))

In [36]:
# Inspect structure, dtypes and per-column missing counts
loan_south_df.info()
# sep="\n" starts the counts on their own line; with the default separator the
# first row is printed on the same line as the label and loses its alignment.
print("Missing values:", loan_south_df.isnull().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    10000 non-null  int64 
 1   customer_name  10000 non-null  object
 2   age            10000 non-null  int64 
 3   income         10000 non-null  int64 
 4   loan_amount    10000 non-null  int64 
 5   loan_purpose   10000 non-null  object
 6   approved       10000 non-null  object
 7   region         10000 non-null  object
dtypes: int64(4), object(4)
memory usage: 625.1+ KB
Missing values: customer_id      0
customer_name    0
age              0
income           0
loan_amount      0
loan_purpose     0
approved         0
region           0
dtype: int64


In [37]:
# Clean: run the shared cleaning helper imported from scripts.cleaning_utils.
# NOTE(review): drop_na_cols presumably drops rows missing 'income' or 'loan_amount' — confirm in the helper.
# The result rebinds loan_south_df, so the frame as loaded is not kept after this cell.
loan_south_df = clean_dataframe(loan_south_df, drop_na_cols=['income', 'loan_amount'])

In [38]:
# Outliers: count what detect_outliers_iqr returns for income in the South sheet.
# The result is only counted; loan_south_df is not reassigned in this cell.
outliers_income_south = detect_outliers_iqr(loan_south_df, 'income')
print("South Region – Income Outliers:", len(outliers_income_south))

South Region – Income Outliers: 0


In [39]:
# Save the cleaned South-region loans into the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
utils_io.save_csv(loan_south_df, str(ASSETS_DIR / "loan_final_south.csv"))

## COVID Data

In [40]:
# Load the COVID dataset from the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
covid_df = utils_io.load_csv(str(ASSETS_DIR / "covid_cleaned.csv"))

In [41]:
# Inspect structure, dtypes and per-column missing counts
covid_df.info()
# sep="\n" starts the counts on their own line; with the default separator the
# first row is printed on the same line as the label and loses its alignment.
print("Missing values:", covid_df.isnull().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          10000 non-null  object
 1   country       10000 non-null  object
 2   variant       10000 non-null  object
 3   new_cases     10000 non-null  int64 
 4   new_deaths    10000 non-null  int64 
 5   hospitalized  10000 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 468.9+ KB
Missing values: date            0
country         0
variant         0
new_cases       0
new_deaths      0
hospitalized    0
dtype: int64


In [42]:
# Clean: run the shared cleaning helper imported from scripts.cleaning_utils.
# NOTE(review): drop_na_cols presumably drops rows with a missing 'date' — confirm in the helper.
# The result rebinds covid_df, so the frame as loaded is not kept after this cell.
covid_df = clean_dataframe(covid_df, drop_na_cols=['date'])

In [43]:
# Outliers: count what detect_outliers_iqr returns for new_cases and for new_deaths.
# The results are only counted; covid_df is not reassigned in this cell.
outliers_cases = detect_outliers_iqr(covid_df, 'new_cases')
outliers_deaths = detect_outliers_iqr(covid_df, 'new_deaths')
print(f"COVID Outliers — Cases: {len(outliers_cases)}, Deaths: {len(outliers_deaths)}")

COVID Outliers — Cases: 56, Deaths: 89


In [44]:
# Save the cleaned COVID data into the configured assets directory.
# str() keeps the helper receiving a plain string path, as it did before.
utils_io.save_csv(covid_df, str(ASSETS_DIR / "covid_final.csv"))

## ✅ Summary of Notebook 02: Data Cleaning

We have now:

- Cleaned and standardized **all 5 datasets** (Superstore, weather, bank loans, the four regional bank-loan sheets, and COVID)
- Removed duplicates and handled missing values
- Cleaned strings across object/categorical columns
- Detected (but did not delete) outliers using the IQR method
- Saved final cleaned files in `../assets/` for downstream analysis

📦 Next up: Aggregation, grouping, and reshaping in `03_aggregation_grouping.ipynb`