In [None]:
# NHSRC PHC SUPPLY CHAIN - DATA CLEANING PIPELINE
from pathlib import Path

import numpy as np
import pandas as pd

# Notebook banner: the pipeline title followed by a 50-character rule.
print("üßπ NHSRC PHC DATA CLEANING PIPELINE", "=" * 50, sep="\n")

In [None]:
# 1. IMPORT AND BASIC READ
# Loads the raw inventory extract and the medicine master list from disk and
# previews the first rows of each. `display` is the IPython/Jupyter built-in.
print("üìä STEP 1: Importing Data")
print("-" * 40)

# Raw string literals are required for these Windows paths: in a normal string
# the "\U" of "C:\Users" starts a \UXXXXXXXX unicode escape and Python rejects
# the cell with a SyntaxError before anything runs.
inventory = pd.read_csv(r"C:\Users\wicra\Projects\phc-supply-chain-ai\data\sample_inventory.csv")
master = pd.read_csv(r"C:\Users\wicra\Projects\phc-supply-chain-ai\data\medicine_master.csv")

print("üì¶ INVENTORY DATA:")
print(f"Records: {len(inventory):,}")
print(f"Columns: {list(inventory.columns)}")
display(inventory.head())

print("\nüè• MASTER DATA:")
print(f"Records: {len(master):,}")
print(f"Columns: {list(master.columns)}")
display(master.head())

In [None]:
# 2. STANDARDIZE COLUMN NAMES
# Rewrites the column labels of both frames into a trimmed, lower-case,
# underscore-separated form and prints the labels before and after.
print("\nüìù STEP 2: Standardizing Column Names")
print("-" * 40)


def _to_snake_case(columns):
    """Return *columns* stripped, lower-cased and with spaces replaced by underscores."""
    trimmed = columns.str.strip()
    lowered = trimmed.str.lower()
    return lowered.str.replace(" ", "_")


print("BEFORE standardization:")
for label, frame in (("Inventory", inventory), ("Master", master)):
    print(f"{label} columns: {list(frame.columns)}")

# Apply the same normalization to both tables
inventory.columns = _to_snake_case(inventory.columns)
master.columns = _to_snake_case(master.columns)

print("\nAFTER standardization:")
for label, frame in (("Inventory", inventory), ("Master", master)):
    print(f"{label} columns: {list(frame.columns)}")

In [None]:
# 3. FIX DATE FORMATS
# Parses the two date columns of the inventory frame into pandas datetimes and
# prints their dtypes before and after, plus the covered date range.
print("\nüìÖ STEP 3: Fixing Date Formats")
print("-" * 40)

date_columns = ['date', 'batch_expiry_date']

print("Date columns before conversion:")
for column in date_columns:
    print(f"{column} dtype: {inventory[column].dtype}")

# Parse each column with pandas' default date inference
for column in date_columns:
    inventory[column] = pd.to_datetime(inventory[column])

print("\nDate columns after conversion:")
for column in date_columns:
    print(f"{column} dtype: {inventory[column].dtype}")

earliest = inventory['date'].min()
latest = inventory['date'].max()
print(f"Date range: {earliest} to {latest}")

In [None]:
# 4. HANDLE MISSING VALUES
# Imputes gaps in three inventory columns, each with its own rule, and prints
# the per-column missing counts before and after.
print("\nüîç STEP 4: Handling Missing Values")
print("-" * 40)

print("Missing values BEFORE handling:")
missing_before = inventory.isna().sum()
print(missing_before[missing_before > 0])

# Handle missing values based on column type
# IDs: nothing is dropped or filled here.
# Units Used: a missing value is treated as zero consumption.
inventory['units_used'] = inventory['units_used'].fillna(0)

# Stock: forward fill within each SKU, in the frame's current row order.
# GroupBy.ffill() replaces groupby(...).fillna(method='ffill'), which is
# deprecated since pandas 2.1 and removed in pandas 3.0.
# A SKU's leading gaps (no earlier value to carry forward) stay missing.
# NOTE(review): the grouping ignores facility_id, so a value can be carried
# over from another facility's row of the same SKU - confirm this is intended.
inventory['on_hand'] = inventory.groupby('sku_id')['on_hand'].ffill()

# Lead Time: fill with the SKU's own median; a SKU with no lead time at all
# has a NaN median and therefore stays missing.
inventory['lead_time_days'] = inventory.groupby('sku_id')['lead_time_days'].transform(
    lambda x: x.fillna(x.median())
)

print("\nMissing values AFTER handling:")
missing_after = inventory.isna().sum()
print(missing_after[missing_after > 0])

In [None]:
# 5. STANDARDIZE SKU -> MASTER MERGE CHECK
# Left-joins the VED/FSN categories from the master onto every inventory row,
# reports inventory rows whose sku_id has no master record, and rebinds
# `inventory` to the enriched frame.
print("\nüîó STEP 5: SKU-Master Merge Validation")
print("-" * 40)

master_lookup = master[['sku_id', 'ved_category', 'fsn_category']]

# A sku_id that appears more than once in the master would duplicate every
# matching inventory row during the merge, so keep only its first occurrence.
duplicate_master_rows = int(master_lookup.duplicated(subset='sku_id').sum())
if duplicate_master_rows:
    print(f"Dropping {duplicate_master_rows} duplicate sku_id rows from master before merging")
    master_lookup = master_lookup.drop_duplicates(subset='sku_id', keep='first')

# Merge with master data; the indicator column records whether a match was found
merged = inventory.merge(master_lookup, on="sku_id", how="left", indicator='_master_match')

# Check for missing master records. The indicator is used instead of a null
# ved_category so that a SKU present in the master with a blank category is
# not reported as having no master record.
missing_master = merged[merged['_master_match'] == 'left_only']
print(f"Records missing master data: {len(missing_master)}")

if len(missing_master) > 0:
    print("Missing master records:")
    display(missing_master[['sku_id', 'sku_name']].drop_duplicates())
else:
    print("‚úÖ All SKUs have corresponding master data")

# Use the merged data (without the helper indicator column)
merged = merged.drop(columns='_master_match')
inventory = merged

In [None]:
# 6. OUTLIER DETECTION
# Adds an `is_outlier` flag per row and prints how many rows were flagged,
# per SKU and in total. Flagged rows are kept in the frame.
print("\nüö® STEP 6: Outlier Detection")
print("-" * 40)


def _above_three_sigma(usage):
    """Flag values greater than the group's mean plus three standard deviations."""
    cutoff = usage.mean() + 3 * usage.std()
    return usage > cutoff


# The threshold is computed separately for every SKU
usage_by_sku = inventory.groupby('sku_id')['units_used']
inventory['is_outlier'] = usage_by_sku.transform(_above_three_sigma)

outlier_summary = inventory.groupby('sku_id')['is_outlier'].sum()
skus_with_outliers = outlier_summary[outlier_summary > 0]
print("Outlier count by SKU:")
for sku, count in skus_with_outliers.items():
    print(f"  {sku}: {count} outliers")

total_outliers = inventory['is_outlier'].sum()
print(f"\nTotal outliers flagged: {total_outliers} (not removed - important for outbreak signals)")

In [None]:
# 7. COMPUTE DERIVED FIELDS
# Adds three columns to the inventory frame: days_cover, expiry_days_remaining
# and expiry_risk_bucket, then prints a short summary of them.
print("\nüßÆ STEP 7: Computing Derived Fields")
print("-" * 40)

# Days Cover: on-hand stock divided by the trailing mean of the last 7 usage
# records. The window is evaluated separately for every facility/SKU series in
# date order; a rolling window over the whole frame would average the usage of
# unrelated SKUs and facilities. min_periods=1 lets the first rows of a series
# use the records available so far instead of producing NaN.
rolling_usage = (
    inventory.sort_values('date', kind='stable')
    .groupby(['facility_id', 'sku_id'])['units_used']
    .transform(lambda usage: usage.rolling(7, min_periods=1).mean())
)
# The division aligns on the index, so the date-sorted result is matched back
# to the original rows (assumes the frame's index labels are unique).
inventory['days_cover'] = inventory['on_hand'] / rolling_usage.replace(0, np.nan)
# Zero average usage yields NaN above; such rows get a placeholder of 365 days.
# NOTE(review): rows with a missing on_hand also end up as 365 here.
inventory['days_cover'] = inventory['days_cover'].fillna(365)

# Expiry days remaining: whole days between the record date and the batch expiry
inventory['expiry_days_remaining'] = (inventory['batch_expiry_date'] - inventory['date']).dt.days

# Expiry risk bucket. Intervals are right-closed: (-inf, 30], (30, 90],
# (90, 180], (180, inf). The open-ended outer edges make sure long-expired or
# very long-dated batches still receive a bucket instead of NaN.
inventory['expiry_risk_bucket'] = pd.cut(
    inventory['expiry_days_remaining'],
    bins=[-np.inf, 30, 90, 180, np.inf],
    labels=["CRITICAL (<30d)", "HIGH (30‚Äì90d)", "MEDIUM (90‚Äì180d)", "LOW (>180d)"]
)

print("‚úÖ Derived fields computed:")
print(f"  - Days Cover: Stock adequacy metric")
print(f"  - Expiry Days Remaining: {inventory['expiry_days_remaining'].min()} to {inventory['expiry_days_remaining'].max()} days")
print(f"  - Expiry Risk Bucket: NHSRC compliance categories")

# Show expiry risk distribution
expiry_dist = inventory['expiry_risk_bucket'].value_counts()
print("\nüìä Expiry Risk Bucket Distribution:")
for risk, count in expiry_dist.items():
    print(f"  {risk}: {count} records")

In [None]:
# 8. DATA QUALITY SUMMARY
# Collects headline figures about the cleaned inventory frame into the
# module-level `report` dict and prints one line per entry.
print("\nüìã STEP 8: Data Quality Summary")
print("-" * 40)


def _value_counts_dict(column):
    """Return the value counts of an inventory column as a plain dict."""
    return inventory[column].value_counts().to_dict()


first_date = inventory['date'].min().date()
last_date = inventory['date'].max().date()

# Keys are inserted in the order they should be printed
report = {}
report["total_records"] = len(inventory)
report["date_range"] = f"{first_date} to {last_date}"
report["unique_facilities"] = inventory['facility_id'].nunique()
report["unique_skus"] = inventory['sku_id'].nunique()
report["missing_values_total"] = inventory.isna().sum().sum()
report["outlier_count"] = int(inventory['is_outlier'].sum())
report["ved_distribution"] = _value_counts_dict('ved_category')
report["fsn_distribution"] = _value_counts_dict('fsn_category')
report["expiry_risk_distribution"] = _value_counts_dict('expiry_risk_bucket')

print("üìà DATA QUALITY REPORT:")
for key in report:
    print(f"  {key}: {report[key]}")

In [None]:
# 9. SAVE CLEANED DATASET
# Writes the cleaned inventory frame to data/cleaned_inventory.csv (relative to
# the current working directory) and prints basic facts about the result.
print("\nüíæ STEP 9: Saving Cleaned Dataset")
print("-" * 40)

output_path = Path("data/cleaned_inventory.csv")
# to_csv does not create missing folders, so make sure the target exists first
output_path.parent.mkdir(parents=True, exist_ok=True)
inventory.to_csv(output_path, index=False)

print("‚úÖ Cleaned data saved to: data/cleaned_inventory.csv")
print("üìÅ File info:")
print(f"  - Records: {len(inventory):,}")
print(f"  - Columns: {len(inventory.columns)}")
# Report the size of the written file rather than the in-memory DataFrame size
print(f"  - Size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")

# Display first 5 rows of cleaned data
print("\nüîπ FIRST 5 ROWS OF CLEANED_INVENTORY.CSV:")
display(inventory.head())

In [None]:
# 10. GENERATE DATA QUALITY REPORT
# Renders the figures collected in `report` as a markdown document and writes
# it to docs/data_quality_report.md (relative to the current working directory).
print("\nüìÑ STEP 10: Generating Data Quality Report")
print("-" * 40)


def _markdown_counts(counts):
    """Render a {label: count} mapping as one markdown bullet per entry."""
    if not counts:
        return "- (no data)"
    return "\n".join(f"- {label}: {count}" for label, count in counts.items())


# value_counts() skips missing values, so any gap between the total row count
# and the VED counts is the number of rows without a VED category from master.
records_without_master = report['total_records'] - sum(report['ved_distribution'].values())
if records_without_master == 0:
    master_note = "All SKUs have corresponding master data"
else:
    master_note = f"{records_without_master} records have no master category data (ved_category missing)"

report_content = f"""# NHSRC PHC Supply Chain - Data Quality Report

## Dataset Overview
- **Total Records**: {report['total_records']:,}
- **Date Range**: {report['date_range']}
- **Unique Facilities**: {report['unique_facilities']}
- **Unique SKUs**: {report['unique_skus']}

## Data Quality Metrics
- **Missing Values**: {report['missing_values_total']}
- **Outliers Flagged**: {report['outlier_count']}

## Category Distributions

### VED Category Distribution
{_markdown_counts(report['ved_distribution'])}

### FSN Category Distribution  
{_markdown_counts(report['fsn_distribution'])}

### Expiry Risk Distribution
{_markdown_counts(report['expiry_risk_distribution'])}

## Cleaning Operations Applied
1. Column name standardization
2. Date format conversion
3. Missing value imputation
4. Outlier detection (flagged, not removed)
5. Derived field computation
6. Master data validation

## Notes
- Outliers are flagged but not removed to preserve outbreak signals
- {master_note}
- Data is NHSRC-compliant and ready for analysis

**Generated**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

# Save report. The folder is created if needed, and the encoding is explicit
# because the report contains non-ASCII characters that the platform default
# encoding may not be able to write.
report_path = Path('docs/data_quality_report.md')
report_path.parent.mkdir(parents=True, exist_ok=True)
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report_content)

print("‚úÖ Data quality report saved to: docs/data_quality_report.md")