In [22]:
import pandas as pd
from pathlib import Path
import sys

# Import the project's cleaning helpers.
# NOTE(review): appending "src" to sys.path makes modules inside src/ importable
# as top-level names (e.g. `cleaning`). The `src.cleaning` import below instead
# needs the directory that contains src/ to be on sys.path -- presumably the
# notebook's working directory. Confirm which of the two forms is intended.
sys.path.append("src")
from src.cleaning import fill_missing_median, drop_missing, normalize_data

# Data layout: raw inputs are read from data/raw, cleaned outputs go to data/processed.
BASE = Path("data")
RAW = BASE / "raw"
PROCESSED = BASE / "processed"
PROCESSED.mkdir(parents=True, exist_ok=True)  # re-runnable: no error if it already exists

# Pick the most recently modified raw CSV (the one produced in Stage 4).
# Path.glob() yields matches in no guaranteed order, so taking the first match
# does not reliably give the latest file; select by modification time instead
# (file name breaks ties so the choice is deterministic).
raw_csv_files = list(RAW.glob("*.csv"))
if not raw_csv_files:
    # Fail with a clear message instead of a bare StopIteration from next().
    raise FileNotFoundError(f"No CSV files found in {RAW.resolve()}")
RAW_FILE = max(raw_csv_files, key=lambda path: (path.stat().st_mtime, path.name))
df_raw = pd.read_csv(RAW_FILE, low_memory=False)

# NOTE(review): in the recorded run the selected file is named api_yfinance_GOOG_*,
# yet the loaded frame has id/date/temperature/humidity/price/category columns --
# verify that the intended file is the one being picked up.
print("Using raw file:", RAW_FILE)
df_raw.info()
df_raw.head()

Using raw file: data/raw/api_yfinance_GOOG_20250827-0320.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           10 non-null     int64  
 1   date         10 non-null     object 
 2   temperature  7 non-null      float64
 3   humidity     7 non-null      float64
 4   price        9 non-null      object 
 5   category     10 non-null     object 
dtypes: float64(2), int64(1), object(3)
memory usage: 608.0+ bytes


Unnamed: 0,id,date,temperature,humidity,price,category
0,1,2023-01-01,25.3,45.0,100.0,A
1,2,2023-01-02,,55.0,105.0,B
2,3,2023-01-03,27.1,,,A
3,4,2023-01-04,29.0,60.0,112.0,B
4,5,2023-01-05,,65.0,115.0,C


In [23]:
# Cleaning pipeline written as three piped stages. Each intermediate frame keeps
# its own name so it can be inspected after the cell has run.

# Stage 1: fill missing values in numeric columns with the column median.
df1 = df_raw.pipe(fill_missing_median)

# Stage 2: drop every row that still has at least one missing value
# (simple and easy to explain).
df2 = df1.pipe(drop_missing, how="any")

# Stage 3: normalize numeric columns ('standard' per the reading; 'minmax' is
# the alternative if preferred).
# NOTE(review): the recorded diagnostics show `id` ending with mean 0 / std 1,
# i.e. the identifier column is normalized too -- confirm that is intended.
# NOTE(review): `price` was read as object dtype, so it is presumably skipped by
# the numeric fill/normalize steps; the single dropped row is consistent with
# its one missing value -- confirm.
df_clean = df2.pipe(normalize_data, method="standard")

In [24]:
# Row/column counts before and after cleaning.
print("Original shape:", df_raw.shape)
print("Cleaned  shape:", df_clean.shape)

# Per-column missing-value counts: raw frame first, then the cleaned frame.
missing_original = df_raw.isna().sum()
print("\nMissing per column (original):")
print(missing_original)

missing_cleaned = df_clean.isna().sum()
print("\nMissing per column (cleaned):")
print(missing_cleaned)

# Spot-check: compare mean and population std (ddof=0) of the first numeric
# column of the raw frame against the same column after cleaning.
num_cols = df_raw.select_dtypes(include="number").columns
if len(num_cols) > 0:
    col = num_cols[0]
    print(f"\nExample column: {col}")

    raw_mean = df_raw[col].mean()
    raw_std = df_raw[col].std(ddof=0)
    print(f"Original mean/std: {raw_mean:.3f} / {raw_std:.3f}")

    clean_mean = df_clean[col].mean()
    clean_std = df_clean[col].std(ddof=0)
    print(f"Cleaned  mean/std: {clean_mean:.3f} / {clean_std:.3f}")

Original shape: (10, 6)
Cleaned  shape: (9, 6)

Missing per column (original):
id             0
date           0
temperature    3
humidity       3
price          1
category       0
dtype: int64

Missing per column (cleaned):
id             0
date           0
temperature    0
humidity       0
price          0
category       0
dtype: int64

Example column: id
Original mean/std: 5.500 / 2.872
Cleaned  mean/std: 0.000 / 1.000


In [25]:
# Persist the cleaned frame under the processed-data directory, without the
# index column, and report where it was written.
out_path = PROCESSED.joinpath("cleaned.csv")
df_clean.to_csv(out_path, index=False)
print("Saved:", out_path)

Saved: data/processed/cleaned.csv
