# 01 – Data Cleaning
**Purpose:**
1. Load raw data
2. Profile & inspect basic stats
3. Clean nulls, duplicates, bad (future-dated) timestamps, and negative quantities
4. Save cleaned dataset & generate schema report


In [24]:
import json
from pathlib import Path

import pandas as pd

# Project layout, resolved relative to the parent of the current working directory.
ROOT = Path.cwd().parent
RAW_DIR = ROOT.joinpath("data", "raw")
CLEAN_DIR = ROOT.joinpath("data", "clean")
REPORTS_DIR = ROOT.joinpath("reports")

# Output folders are created up front; existing folders are left alone.
for out_dir in (CLEAN_DIR, REPORTS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)



In [25]:
# First look at the raw transactions file: column names plus the first rows.
transactions = pd.read_csv(RAW_DIR.joinpath("transactions_log.csv"))
print(transactions.columns.to_list())
display(transactions.head())


['TransactionID', 'CustomerID', 'Date', 'SKU', 'Quantity']


Unnamed: 0,TransactionID,CustomerID,Date,SKU,Quantity
0,PO-2024-10000,B2B-2734,2024-01-01,1128,2
1,PO-2024-10000,B2B-2734,2024-01-01,1275,2
2,PO-2024-10001,B2B-2154,2024-01-01,1078,19
3,PO-2024-10002,B2B-2398,2024-01-01,1270,1
4,PO-2024-10002,B2B-2398,2024-01-01,1143,3


In [26]:
# Load the three raw tables; only the transactions file has its `Date` column parsed on read.
transactions = pd.read_csv(RAW_DIR / "transactions_log.csv", parse_dates=["Date"])
customers, products = (
    pd.read_csv(RAW_DIR / csv_name)
    for csv_name in ("customers.csv", "products_catalog.csv")
)

# Preview the rows, then print the dtype / non-null summary.
display(transactions.head())
transactions.info()


Unnamed: 0,TransactionID,CustomerID,Date,SKU,Quantity
0,PO-2024-10000,B2B-2734,2024-01-01,1128,2
1,PO-2024-10000,B2B-2734,2024-01-01,1275,2
2,PO-2024-10001,B2B-2154,2024-01-01,1078,19
3,PO-2024-10002,B2B-2398,2024-01-01,1270,1
4,PO-2024-10002,B2B-2398,2024-01-01,1143,3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   TransactionID  50000 non-null  object        
 1   CustomerID     50000 non-null  object        
 2   Date           50000 non-null  datetime64[ns]
 3   SKU            50000 non-null  int64         
 4   Quantity       50000 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 1.9+ MB


In [27]:
# Shape and summary statistics across every column (numeric and non-numeric).
print("Rows, Cols:", transactions.shape)
print(transactions.describe(include="all"))

# Share of null values per column, expressed as a percentage.
missing = transactions.isna().mean().mul(100)
print("\nMissing % per column:", missing, sep="\n")


Rows, Cols: (50000, 5)
        TransactionID CustomerID                        Date           SKU  \
count           50000      50000                       50000  50000.000000   
unique          10954        800                         NaN           NaN   
top     PO-2028-19291   B2B-2754                         NaN           NaN   
freq                8        144                         NaN           NaN   
mean              NaN        NaN  2026-10-20 07:38:15.936000   1174.371020   
min               NaN        NaN         2024-01-01 00:00:00   1001.000000   
25%               NaN        NaN         2025-05-15 00:00:00   1065.000000   
50%               NaN        NaN         2026-10-19 00:00:00   1173.000000   
75%               NaN        NaN         2028-03-22 00:00:00   1270.000000   
max               NaN        NaN         2029-08-12 00:00:00   1370.000000   
std               NaN        NaN                         NaN    115.750807   

            Quantity  
count   50000.000

In [22]:
# Clean the transactions table in three passes: exact duplicates, future-dated
# rows, negative quantities. Each removal is counted directly from its own mask
# so no count can absorb rows dropped by a different filter.
# Names consumed by later cells: `before`, `bad_ts`, `neg_qty`, `transactions`.
before = len(transactions)

# Pass 1 - exact duplicate rows (all columns equal).
transactions = transactions.drop_duplicates()
dups_removed = before - len(transactions)

# Pass 2 - rows dated after the moment this cell runs.
# NOTE(review): the cutoff is wall-clock time, so the result changes from run to
# run; the recorded run removed 35,767 of 50,000 rows this way. Confirm that
# dropping these rows is intended, or pin the cutoff to a fixed date.
cutoff = pd.Timestamp.today()
bad_ts = transactions["Date"] > cutoff
transactions = transactions[~bad_ts]

# Pass 3 - negative quantities. Filtering on the same mask that is counted keeps
# the accounting exact; rows with a null Quantity are not negative and are kept
# (the previous `>= 0` filter dropped them silently and they were then reported
# as duplicates).
neg_mask = transactions["Quantity"] < 0
neg_qty = int(neg_mask.sum())
transactions = transactions[~neg_mask]

# Every ingested row must be either kept or attributed to exactly one removal.
assert before == len(transactions) + dups_removed + int(bad_ts.sum()) + neg_qty

print("Total rows before:", before)
print("Duplicates removed:", dups_removed)
print("Future dates removed:", bad_ts.sum())
print("Negative quantities removed:", neg_qty)


Total rows before: 50000
Duplicates removed: 0
Future dates removed: 35767
Negative quantities removed: 0


In [28]:
# Persist the cleaned transactions and write the schema validation report.
# Relies on `before`, `bad_ts` and `neg_qty` from the cleaning cell.
transactions.to_csv(CLEAN_DIR / "cleaned_transactions_v1.csv", index=False)

report = [
    "# Schema Validation Report",
    f"- Total rows ingested: {before:,}",
    f"- Rows after cleaning: {len(transactions):,}",
    f"- Duplicates removed: {before - len(transactions) - bad_ts.sum() - neg_qty:,}",
    f"- Future timestamps removed: {bad_ts.sum():,}",
    f"- Negative quantities removed: {neg_qty:,}"
]

# The report lines were previously built but never written anywhere, although
# the message below claims a report was generated. Write them out explicitly.
# NOTE(review): the file name `schema_report.md` is a new choice - confirm it.
with open(REPORTS_DIR / "schema_report.md", "w", encoding="utf-8") as f:
    f.write("\n".join(report) + "\n")

print("Cleaned data saved and report generated.")


Cleaned data saved and report generated.


In [33]:
# Summarise the transactions cleaning step and write it to the reports folder.
# Relies on `before`, `bad_ts`, `neg_qty` and `transactions` from the cleaning
# cell, and on `REPORTS_DIR` from the configuration cell (it is no longer
# re-imported / redefined here).
total_before = before
total_after = len(transactions)
future_removed = bad_ts.sum()
neg_removed = neg_qty
# Duplicates are whatever was removed and is not explained by the other two filters.
dups_removed = total_before - total_after - future_removed - neg_removed

trans_report = [
    "# Transactions Cleaning Report",
    f"- Total rows ingested: {total_before:,}",
    f"- Total rows after cleaning: {total_after:,}",
    f"- Duplicates removed: {dups_removed:,}",
    f"- Future‐dated rows removed: {future_removed:,}",
    f"- Negative‐quantity rows removed: {neg_removed:,}"
]

# The report text contains non-ASCII hyphens (U+2010); without an explicit
# encoding the write depends on the platform default and can raise
# UnicodeEncodeError (e.g. cp1252 has no U+2010).
with open(REPORTS_DIR / "transactions_report.md", "w", encoding="utf-8") as f:
    f.write("\n".join(trans_report))


In [34]:
# Sanity check on the duplicate count.
# `before - len(transactions)` alone is the total removed by ALL filters, so the
# rows removed for future dates and negative quantities are subtracted to leave
# only the duplicates. Relies on `before`, `bad_ts`, `neg_qty` from the cleaning cell.
dup_before = before
dup_after = len(transactions)
true_dups_removed = dup_before - dup_after - int(bad_ts.sum()) - int(neg_qty)
print("True duplicates removed:", true_dups_removed)  # should be 0


True duplicates removed: 0


In [29]:
# Reload the customers table and profile it: columns, preview, dtypes, null share.
customers = pd.read_csv(RAW_DIR.joinpath("customers.csv"))
print(customers.columns.to_list())
display(customers.head())
customers.info()

# Percentage of null values in each column.
missing = customers.isna().mean().mul(100)
print("\nMissing % per column:", missing, sep="\n")


['CustomerID', 'Business_Category', 'Business_Size', 'Customer_Since']


Unnamed: 0,CustomerID,Business_Category,Business_Size,Customer_Since
0,B2B-2013,Legal Services,Medium,2023-01-26
1,B2B-2310,Tech Startup,Medium,2022-03-08
2,B2B-2782,Tech Startup,Small,2023-09-01
3,B2B-2215,Consulting,Small,2022-10-27
4,B2B-2436,Education,Small,2022-06-24


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         800 non-null    object
 1   Business_Category  800 non-null    object
 2   Business_Size      800 non-null    object
 3   Customer_Since     800 non-null    object
dtypes: object(4)
memory usage: 25.1+ KB

Missing % per column:
CustomerID           0.0
Business_Category    0.0
Business_Size        0.0
Customer_Since       0.0
dtype: float64


In [30]:
# Parse the sign-up date with an explicit ISO format.
customers["Customer_Since"] = pd.to_datetime(
    customers["Customer_Since"],
    format="%Y-%m-%d",
)

# The two business descriptors become categoricals.
for cat_col in ("Business_Category", "Business_Size"):
    customers[cat_col] = customers[cat_col].astype("category")


In [31]:
# Count rows whose CustomerID repeats an earlier row.
dup_id_count = customers.duplicated(subset=["CustomerID"]).sum()
print("Duplicate CustomerIDs:", dup_id_count)

# List the category levels found in each categorical column.
size_levels = customers["Business_Size"].cat.categories.tolist()
category_levels = customers["Business_Category"].cat.categories.tolist()
print("\nBusiness_Size levels:", size_levels)
print("Business_Category levels:", category_levels)


Duplicate CustomerIDs: 0

Business_Size levels: ['Medium', 'Small']
Business_Category levels: ['Construction', 'Consulting', 'Education', 'Legal Services', 'Tech Startup']


In [35]:
# Persist the cleaned customers table and write its report.
# Repeated CustomerIDs are dropped here (first occurrence kept) so that the
# "rows dropped" figure in the report is measured rather than hard-coded.
customers_before = len(customers)
customers = customers.drop_duplicates(subset=["CustomerID"])
dup_customers_removed = customers_before - len(customers)

customers.to_csv(CLEAN_DIR / "cleaned_customers_v1.csv", index=False)

customers_report = [
    "# Customers Table",
    "- Parsed `Customer_Since` to datetime",
    f"- Business_Category levels: {customers['Business_Category'].cat.categories.tolist()}",
    f"- Business_Size levels: {customers['Business_Size'].cat.categories.tolist()}",
    f"- Duplicate CustomerID rows dropped: {dup_customers_removed:,}",
]

# Opened with "w" instead of "a": append mode added another copy of this section
# every time the cell was re-run. Explicit encoding keeps the output
# platform-independent.
# NOTE(review): file name is capitalised unlike the other reports - kept as-is.
with open(REPORTS_DIR / "Customers_report.md", "w", encoding="utf-8") as f:
    f.write("\n".join(customers_report) + "\n")


In [36]:
# Reload the product catalogue and profile it: columns, preview, dtypes, null share.
products = pd.read_csv(RAW_DIR.joinpath("products_catalog.csv"))
print(products.columns.to_list())
display(products.head())
products.info()

# Percentage of null values in each column.
missing = products.isna().mean().mul(100)
print("\nMissing % per column:", missing, sep="\n")


['SKU', 'Rev_GL_Class', 'Sub_Category', 'Item_Description', 'Brand', 'Unit_Price', 'Attributes']


Unnamed: 0,SKU,Rev_GL_Class,Sub_Category,Item_Description,Brand,Unit_Price,Attributes
0,1001,OFFICE SUPPLIES,Paper,"PaperOne Paper - A4, 70",PaperOne,37.32,"{""Size"": ""A4"", ""Weight_gsm"": 70}"
1,1002,OFFICE SUPPLIES,Paper,"PaperOne Paper - A4, 80",PaperOne,36.51,"{""Size"": ""A4"", ""Weight_gsm"": 80}"
2,1003,OFFICE SUPPLIES,Paper,"PaperOne Paper - A3, 70",PaperOne,34.99,"{""Size"": ""A3"", ""Weight_gsm"": 70}"
3,1004,OFFICE SUPPLIES,Paper,"PaperOne Paper - A3, 80",PaperOne,61.56,"{""Size"": ""A3"", ""Weight_gsm"": 80}"
4,1005,OFFICE SUPPLIES,Paper,"Double A Paper - A4, 70",Double A,68.41,"{""Size"": ""A4"", ""Weight_gsm"": 70}"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SKU               370 non-null    int64  
 1   Rev_GL_Class      370 non-null    object 
 2   Sub_Category      370 non-null    object 
 3   Item_Description  370 non-null    object 
 4   Brand             370 non-null    object 
 5   Unit_Price        370 non-null    float64
 6   Attributes        370 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 20.4+ KB

Missing % per column:
SKU                 0.0
Rev_GL_Class        0.0
Sub_Category        0.0
Item_Description    0.0
Brand               0.0
Unit_Price          0.0
Attributes          0.0
dtype: float64


In [38]:
# Price must be numeric; any unparseable value raises instead of being coerced.
products["Unit_Price"] = pd.to_numeric(products["Unit_Price"], errors="raise")

# The three classification columns become categoricals.
for cat_col in ("Rev_GL_Class", "Sub_Category", "Brand"):
    products[cat_col] = products[cat_col].astype("category")


In [39]:
# Keep the first row for each SKU and record how many rows were discarded.
before_p = products.shape[0]
products = products.drop_duplicates(subset=["SKU"])
dup_removed_p = before_p - products.shape[0]


In [45]:
# Persist the product catalogue and write its cleaning report.
# Relies on `before_p` and `dup_removed_p` from the de-duplication cell.
products.to_csv(CLEAN_DIR / "cleaned_products_v1.csv", index=False)

prod_report = [
    "# Products Catalog Cleaning Report",
    f"- Total rows ingested: {before_p:,}",
    f"- Total rows after cleaning: {len(products):,}",
    f"- Duplicate SKUs removed: {dup_removed_p:,}",
    "- Final dtypes for key columns:",
]

# The attribute columns are not part of the raw catalogue and no earlier cell
# creates them, so indexing them unconditionally raised KeyError on a fresh
# top-to-bottom run. Report the dtype when the column exists, and say so when
# it does not.
key_cols = ("Size", "Weight_gsm", "Color", "Page_Yield")
for col in key_cols:
    dtype_txt = products[col].dtype if col in products.columns else "column not present"
    prod_report.append(f"  - {col}: {dtype_txt}")

prod_report.append("- Missing values per column:")
for col, pct in (products.isna().mean() * 100).items():
    prod_report.append(f"  - {col}: {pct:.2f}%")

# Explicit encoding keeps the report platform-independent.
with open(REPORTS_DIR / "products_report.md", "w", encoding="utf-8") as f:
    f.write("\n".join(prod_report))


In [41]:

# Reload the raw catalogue and apply the dtype conversions, printing the dtypes
# before and after.
# Reloading from raw throws away the SKU de-duplication done earlier, and the
# frame produced here is the one written to the cleaned CSV later on - so the
# duplicate SKUs are dropped again immediately after the read.
products = pd.read_csv(RAW_DIR / "products_catalog.csv")
products = products.drop_duplicates(subset=["SKU"])

print("Before conversions:\n", products.dtypes, "\n")

# Numeric price (unparseable values raise), categoricals for the low-cardinality
# classification columns, and pandas' string dtype for the free-text description.
products["Unit_Price"]       = pd.to_numeric(products["Unit_Price"], errors="raise")
products["Rev_GL_Class"]     = products["Rev_GL_Class"].astype("category")
products["Sub_Category"]     = products["Sub_Category"].astype("category")
products["Brand"]            = products["Brand"].astype("category")
products["Item_Description"] = products["Item_Description"].astype("string")

print("After conversions:\n", products.dtypes)


Before conversions:
 SKU                   int64
Rev_GL_Class         object
Sub_Category         object
Item_Description     object
Brand                object
Unit_Price          float64
Attributes           object
dtype: object 

After conversions:
 SKU                          int64
Rev_GL_Class              category
Sub_Category              category
Item_Description    string[python]
Brand                     category
Unit_Price                 float64
Attributes                  object
dtype: object


In [44]:
# Type the product attribute columns (Size, Weight_gsm, Color, Page_Yield).
#
# These columns are not in the raw catalogue; the preview of the raw file shows
# them packed as JSON text in `Attributes` (e.g. {"Size": "A4", "Weight_gsm": 70}).
# No visible cell unpacks that JSON, so on a fresh top-to-bottom run the column
# look-ups below raised KeyError. Any attribute column that is missing is
# therefore derived from `Attributes` first; keys absent for a product become
# missing values.
# NOTE(review): Color / Page_Yield are assumed to be JSON keys of other
# products - only Size and Weight_gsm appear in the previewed rows.
attr_cols = ["Size", "Weight_gsm", "Color", "Page_Yield"]
missing_attr_cols = [c for c in attr_cols if c not in products.columns]
if missing_attr_cols:
    parsed_attrs = products["Attributes"].map(
        # Non-string cells (e.g. NaN) carry no attributes.
        lambda raw: json.loads(raw) if isinstance(raw, str) else {}
    )
    attrs = pd.DataFrame(parsed_attrs.tolist(), index=products.index).reindex(columns=attr_cols)
    for col in missing_attr_cols:
        products[col] = attrs[col]

products["Size"] = products["Size"].astype("category")
products["Color"] = products["Color"].astype("category")
products["Unit_Price"] = pd.to_numeric(products["Unit_Price"], errors="raise")
products["Page_Yield"] = pd.to_numeric(products["Page_Yield"], errors="raise")

# Nullable integer dtype so products without a paper weight keep a missing value.
products["Weight_gsm"] = pd.to_numeric(products["Weight_gsm"], errors="coerce").astype("Int64")
print(products.dtypes[["Size","Weight_gsm","Color","Page_Yield"]])


Size          category
Weight_gsm       Int64
Color         category
Page_Yield     float64
dtype: object


In [50]:
# Fill missing attribute values, then persist the catalogue.
#
# Size and Color are already categoricals at this point, and filling a
# categorical with a value that is not one of its categories raises. Casting
# back to object first lets "Unknown" be filled in before re-categorising.
for attr_col in ("Size", "Color"):
    products[attr_col] = (
        products[attr_col].astype("object").fillna("Unknown").astype("category")
    )

# NOTE(review): 0 is used as a stand-in for "not applicable" in both numeric
# attributes; downstream consumers must not read it as a real weight / yield.
products["Weight_gsm"] = pd.to_numeric(products["Weight_gsm"], errors="coerce").fillna(0).astype("Int64")
products["Page_Yield"] = pd.to_numeric(products["Page_Yield"], errors="coerce").fillna(0).astype(int)

products.to_csv(CLEAN_DIR / "cleaned_products_v1.csv", index=False)


In [49]:
# Remove columns that contain no values at all, then overwrite the cleaned file.
products = products.dropna(axis="columns", how="all")
products.to_csv(CLEAN_DIR.joinpath("cleaned_products_v1.csv"), index=False)
