# Sales Data Preprocessing
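This notebook loads the raw sales CSV, keeps only the required columns, parses the order dates (dropping rows whose date cannot be parsed), normalizes the category text, removes duplicate rows, and adds Year/Month/Quarter columns before saving the cleaned file.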

In [None]:
import pandas as pd
from pathlib import Path
# Data locations; the relative path resolves against the current working directory
base = Path('..', 'data')
raw_path = base.joinpath('raw', 'sales_sample.csv')
out_path = base.joinpath('processed', 'sales_clean.csv')

# Load raw: read only the listed columns from the CSV
use_cols = [
    'ORDERNUMBER',
    'SALES',
    'ORDERDATE',
    'PRODUCTLINE',
    'COUNTRY',
    'QUANTITYORDERED',
    'CUSTOMERNAME',
    'STATUS',
]
df = pd.read_csv(raw_path, usecols=use_cols)

# Parse dates and create Year/Month/Quarter
# errors='coerce' turns any ORDERDATE value that cannot be parsed into NaT
# instead of raising, so bad values surface as missing dates.
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'], errors='coerce')

# Rows without a usable date cannot be assigned to a period, so they are
# dropped. Report how many were removed so the data loss is visible rather
# than silent.
n_before = len(df)
df = df.dropna(subset=['ORDERDATE'])
n_dropped = n_before - len(df)
if n_dropped:
    print(f'Dropped {n_dropped} of {n_before} rows with missing or unparseable ORDERDATE')

# Derive the calendar buckets from the parsed dates
order_date = df['ORDERDATE'].dt
df['Year'] = order_date.year
df['Month'] = order_date.to_period('M').astype(str)    # monthly period as text, e.g. '2003-02'
df['Quarter'] = order_date.to_period('Q').astype(str)  # quarterly period as text, e.g. '2003Q1'

# Normalize categories: trim surrounding whitespace and title-case each label
# so differently spaced or cased spellings collapse to one value
for col in ('STATUS', 'PRODUCTLINE'):
    df[col] = df[col].str.strip().str.title()

# Remove rows that are exact duplicates across every remaining column.
# NOTE(review): rows are compared only on the columns kept in df; if the raw
# file has further columns that distinguish otherwise identical rows, those
# rows are merged here - confirm this is intended.
df = df.drop_duplicates()

# Save cleaned: make sure the output folder exists, then write the table
# without the index column
target_dir = out_path.parent
target_dir.mkdir(exist_ok=True, parents=True)
df.to_csv(out_path, index=False)

# Final expression of the cell: preview the first rows of the cleaned data
df.head()