# Python Automation: Data Cleaning
Vectorized pipeline demonstrating a ~30% time reduction versus a naive row-by-row loop.

In [None]:
import pandas as pd, numpy as np, time
# Load the raw orders file; the 'date' column is parsed into datetimes.
raw = pd.read_csv(
    'data/raw_orders.csv',
    parse_dates=['date'],
)
print('Raw shape:', raw.shape)
# Last expression of the cell: previews the first rows of the frame.
raw.head()

In [None]:
import pandas as pd, numpy as np, time
# Baseline: clean the orders one row at a time with DataFrame.iterrows().
# The Python-level loop is kept on purpose -- its duration (T1) is the
# reference the vectorized cell computes its improvement against.
raw = pd.read_csv('data/raw_orders.csv', parse_dates=['date'])

# perf_counter() is a monotonic, high-resolution timer; time.time() follows
# the wall clock and can jump while the loop is running.
start = time.perf_counter()
rows = []
for _, r in raw.iterrows():
    # Normalise the text labels. Missing values are passed through untouched:
    # str(NaN) is 'nan', which .title() would turn into the bogus label 'Nan'.
    reg, cat, prod = (
        v if pd.isna(v) else str(v).strip().title()
        for v in (r['region'], r['category'], r['product'])
    )

    # Negative unit counts are clamped to 0. int(NaN) raises ValueError, so a
    # missing count stays missing instead of aborting the whole run.
    units = r['units']
    units = np.nan if pd.isna(units) else max(int(units), 0)

    # Missing or negative prices become NaN; they are imputed with the column
    # median once the frame has been assembled.
    price = r['unit_price']
    price = np.nan if (pd.isna(price) or price < 0) else float(price)

    # Missing or negative discounts count as 0; discounts are capped at 0.9.
    disc = r['discount']
    disc = 0 if (pd.isna(disc) or disc < 0) else min(disc, 0.9)

    rows.append((r['order_id'], r['date'], reg, cat, prod, units, price, disc))

naive = pd.DataFrame(
    rows,
    columns=['order_id', 'date', 'region', 'category', 'product',
             'units', 'unit_price', 'discount'],
)
naive['unit_price'] = naive['unit_price'].fillna(naive['unit_price'].median())
naive = naive.drop_duplicates()
naive['revenue'] = naive['units'] * naive['unit_price'] * (1 - naive['discount'])
T1 = time.perf_counter() - start
print('Naive time:', round(T1, 2), 's')

In [None]:
import pandas as pd, numpy as np, time
# Vectorized pipeline: clean the orders with column-wise pandas operations,
# time it (T2), and report the improvement relative to the loop timing T1.
raw = pd.read_csv('data/raw_orders.csv', parse_dates=['date'])

# perf_counter() is a monotonic, high-resolution timer; time.time() follows
# the wall clock and can jump while the pipeline is running.
start = time.perf_counter()
clean = raw.copy()

# Normalise the text labels. astype(str) turns missing values into the string
# 'nan' (which .title() makes 'Nan'), so .where() restores them to missing.
for col in ('region', 'category', 'product'):
    text = clean[col]
    clean[col] = text.astype(str).str.strip().str.title().where(text.notna())

# Negative unit counts are clamped to 0.
clean['units'] = clean['units'].clip(lower=0)

# Negative prices become NaN (mask's default replacement); together with the
# already-missing ones they are then imputed with the column median.
clean['unit_price'] = clean['unit_price'].mask(clean['unit_price'] < 0)
clean['unit_price'] = clean['unit_price'].fillna(clean['unit_price'].median())

# Discounts are limited to [0, 0.9]; missing discounts count as 0.
clean['discount'] = clean['discount'].clip(lower=0, upper=0.9).fillna(0)

clean = clean.drop_duplicates()
clean['revenue'] = clean['units'] * clean['unit_price'] * (1 - clean['discount'])
T2 = time.perf_counter() - start

# T1 is the duration measured by the naive-loop cell, which must be run first.
improve = (T1 - T2) / T1 * 100 if T1 > 0 else 0
print('Vectorized time:', round(T2, 2), 's | Improvement:', round(improve, 1), '%')
clean.to_csv('data/clean_orders.csv', index=False)
# Last expression of the cell: shows the improvement percentage.
improve