# 02 — Preprocessing

Clean and split the Amazon Products dataset with the reusable functions in `src/preprocess.py`. This notebook mirrors Step 3 of the assignment so results are reproducible and inspectable.


In [None]:
from pathlib import Path
import pandas as pd
import sys

# Resolve every project path relative to the kernel's current working directory.
# NOTE(review): this takes the parent of the working directory as the project
# root, i.e. it assumes the notebook is launched from a folder one level below
# the root — confirm before running it from anywhere else.
PROJECT_ROOT = Path.cwd().parent
RAW_PATH = PROJECT_ROOT / "data" / "raw" / "amazon_products_clean.csv"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# Put `<PROJECT_ROOT>/src` on the import path. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
_src_dir = str(PROJECT_ROOT / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Display the resolved locations so a wrong root is visible immediately.
RAW_PATH, PROCESSED_DIR


In [None]:
# Read the CSV at RAW_PATH into a DataFrame, then preview its first five rows.
raw_df = pd.read_csv(filepath_or_buffer=RAW_PATH)
raw_df.head(n=5)


In [None]:
# Quick look at the raw data: frame shape, number of distinct values in the
# 'category' column, and the first five rows of its value counts.
(
    raw_df.shape,
    raw_df["category"].nunique(),
    raw_df["category"].value_counts().head(),
)


In [None]:
from preprocess import PreprocessConfig, apply_cleaning, make_splits

# Cleaning options handed to `apply_cleaning`; the exact effect of each flag is
# defined in `preprocess.py`, which is not visible from this notebook.
cfg = PreprocessConfig(
    lowercase=True,
    remove_numbers=True,
    remove_stopwords=False,
    lemmatize=False,
)

# A copy of raw_df is passed in — presumably so the raw frame stays untouched if
# apply_cleaning modifies its argument (TODO confirm against preprocess.py).
clean_df = apply_cleaning(raw_df.copy(), cfg)
clean_df.head()


In [None]:
# Split the cleaned frame into three parts, passing 'category' as the
# stratification column and reusing the seed stored on the config object.
train_df, val_df, test_df = make_splits(
    clean_df,
    stratify_col='category',
    seed=cfg.seed,
)

# Show the size of each part.
tuple(len(part) for part in (train_df, val_df, test_df))


In [None]:
# Write the cleaned frame and each split to PROCESSED_DIR as `<name>.csv`,
# recording where every file was written.
output_paths = dict()
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)  # create the folder (and parents) if missing
frames = dict(
    processed_full=clean_df,
    train=train_df,
    val=val_df,
    test=test_df,
)
for name, frame in frames.items():
    path = PROCESSED_DIR.joinpath(f'{name}.csv')
    frame.to_csv(path, index=False)  # index=False: do not write the row index
    output_paths[name] = path

# Display the name -> file path mapping.
output_paths
