In [5]:
# ENHANCED PHC SUPPLY CHAIN DATA GENERATOR
# Creates realistic data with proper patterns for ML forecasting
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

print("üè• ENHANCED PHC SUPPLY CHAIN DATA GENERATOR")
print("=" * 50)

# Enhanced medicine list with realistic consumption patterns
medicine_master = [
    # Vital, Fast-moving (High priority, high usage, predictable patterns)
    {"sku_id": "MED001", "sku_name": "Paracetamol Tablet 500mg", "VED_category": "Vital", "FSN_category": "Fast", "unit": "Tablet", "base_consumption": 25, "seasonality": 1.3, "weekend_effect": 0.8},
    {"sku_id": "MED002", "sku_name": "Amoxicillin Capsule 250mg", "VED_category": "Vital", "FSN_category": "Fast", "unit": "Capsule", "base_consumption": 20, "seasonality": 1.4, "weekend_effect": 0.7},
    {"sku_id": "MED003", "sku_name": "Oral Rehydration Salts", "VED_category": "Vital", "FSN_category": "Fast", "unit": "Packet", "base_consumption": 15, "seasonality": 1.5, "weekend_effect": 0.9},
    
    # Essential, Fast-moving (Medium priority, high usage)
    {"sku_id": "MED004", "sku_name": "Ibuprofen Tablet 200mg", "VED_category": "Essential", "FSN_category": "Fast", "unit": "Tablet", "base_consumption": 18, "seasonality": 1.2, "weekend_effect": 0.85},
    {"sku_id": "MED005", "sku_name": "Cetirizine Tablet 5mg", "VED_category": "Essential", "FSN_category": "Fast", "unit": "Tablet", "base_consumption": 12, "seasonality": 1.1, "weekend_effect": 0.9},
    
    # Vital, Slow-moving (High priority, low usage, irregular patterns)
    {"sku_id": "MED006", "sku_name": "Insulin Injection 40 IU/ml", "VED_category": "Vital", "FSN_category": "Slow", "unit": "Vial", "base_consumption": 3, "seasonality": 1.0, "weekend_effect": 1.0},
    {"sku_id": "MED007", "sku_name": "Adrenaline Injection 1 mg/ml", "VED_category": "Vital", "FSN_category": "Slow", "unit": "Ampoule", "base_consumption": 1, "seasonality": 1.0, "weekend_effect": 1.0},
    
    # Essential, Slow-moving (Medium priority, low usage)
    {"sku_id": "MED008", "sku_name": "Omeprazole Capsule 20mg", "VED_category": "Essential", "FSN_category": "Slow", "unit": "Capsule", "base_consumption": 5, "seasonality": 1.0, "weekend_effect": 0.95},
    {"sku_id": "MED009", "sku_name": "Metformin Tablet 500mg", "VED_category": "Essential", "FSN_category": "Slow", "unit": "Tablet", "base_consumption": 8, "seasonality": 1.0, "weekend_effect": 1.0},
    
    # Desirable items (Low priority, various patterns)
    {"sku_id": "MED010", "sku_name": "Multivitamin Tablet", "VED_category": "Desirable", "FSN_category": "Slow", "unit": "Tablet", "base_consumption": 4, "seasonality": 1.0, "weekend_effect": 1.0},
    {"sku_id": "MED011", "sku_name": "Antiseptic Lotion", "VED_category": "Desirable", "FSN_category": "Slow", "unit": "Bottle", "base_consumption": 2, "seasonality": 1.0, "weekend_effect": 1.0},
    {"sku_id": "MED012", "sku_name": "Cotton Roll 100g", "VED_category": "Desirable", "FSN_category": "Fast", "unit": "Roll", "base_consumption": 6, "seasonality": 1.0, "weekend_effect": 1.0}
]

facilities = [
    {"facility_id": "FAC001", "facility_name": "Urban PHC Kalyan", "type": "Urban", "patient_volume": "High"},
    {"facility_id": "FAC002", "facility_name": "Rural PHC Dombivli", "type": "Rural", "patient_volume": "Medium"},
    {"facility_id": "FAC003", "facility_name": "Peri-urban PHC Ambivali", "type": "PeriUrban", "patient_volume": "Medium"}
]

def generate_realistic_consumption(base_consumption, date, fsn_category, seasonality_factor, weekend_effect):
    """Generate realistic consumption patterns with trends, seasonality, and noise"""
    day_of_week = date.weekday()
    month = date.month
    day_of_month = date.day
    
    # Base pattern
    consumption = base_consumption
    
    # Day of week effect (lower on weekends)
    if day_of_week >= 5:  # Weekend
        consumption *= weekend_effect
    
    # Monthly seasonality (higher in monsoon months)
    if month in [6, 7, 8, 9]:  # Jun-Sep (Monsoon)
        consumption *= seasonality_factor
    
    # Random outbreaks (spikes in consumption)
    outbreak_chance = random.random()
    if outbreak_chance > 0.95:  # 5% chance of outbreak
        consumption *= random.uniform(1.5, 3.0)
    
    # Day-of-month effect (higher at month beginning/end)
    if day_of_month <= 7 or day_of_month >= 25:
        consumption *= random.uniform(1.1, 1.3)
    
    # Add noise based on FSN category
    if fsn_category == "Fast":
        noise = random.uniform(0.7, 1.3)  # More variability for fast-moving
    else:
        noise = random.uniform(0.8, 1.2)  # Less variability for slow-moving
    
    consumption *= noise
    
    # Ensure minimum consumption
    return max(1, int(consumption))

def generate_enhanced_inventory_data(num_days=180, start_date="2024-01-01"):
    """Generate enhanced inventory data with realistic patterns"""
    dates = pd.date_range(start=start_date, periods=num_days, freq='D')
    records = []
    
    for date in dates:
        for facility in facilities:
            for medicine in medicine_master:
                # Generate realistic consumption
                daily_usage = generate_realistic_consumption(
                    medicine["base_consumption"],
                    date,
                    medicine["FSN_category"],
                    medicine["seasonality"],
                    medicine["weekend_effect"]
                )
                
                # Facility-specific adjustments
                if facility["patient_volume"] == "High":
                    daily_usage = int(daily_usage * 1.3)
                elif facility["patient_volume"] == "Low":
                    daily_usage = int(daily_usage * 0.7)
                
                # Generate stock levels with realistic patterns
                if medicine["FSN_category"] == "Fast":
                    on_hand = random.randint(100, 500)
                    stock_in_transit = random.randint(0, 200) if random.random() > 0.7 else 0
                else:
                    on_hand = random.randint(20, 100)
                    stock_in_transit = random.randint(0, 50) if random.random() > 0.8 else 0
                
                # Lead time varies by facility type and medicine category
                if facility["type"] == "Urban":
                    lead_time = random.randint(2, 5)
                elif facility["type"] == "Rural":
                    lead_time = random.randint(5, 10)
                else:  # PeriUrban
                    lead_time = random.randint(3, 7)
                
                # Expiry dates with realistic distribution
                days_to_expiry_options = [30, 60, 90, 180, 365, 730]
                weights = [0.05, 0.1, 0.15, 0.3, 0.3, 0.1]  # More near-expiry for realism
                days_to_expiry = random.choices(days_to_expiry_options, weights=weights)[0]
                batch_expiry = date + timedelta(days=days_to_expiry)
                
                # Patient admissions (correlated with medicine usage)
                base_admissions = 50
                if facility["patient_volume"] == "High":
                    base_admissions = 100
                elif facility["patient_volume"] == "Low":
                    base_admissions = 30
                
                patient_admissions = max(10, int(base_admissions * random.uniform(0.7, 1.3)))
                
                # Holiday pattern
                is_holiday = 1 if (date.weekday() in [5, 6] and random.random() > 0.3) else 0
                
                # Price with realistic variation
                if medicine["VED_category"] == "Vital":
                    price = round(random.uniform(5.0, 50.0), 2)
                elif medicine["VED_category"] == "Essential":
                    price = round(random.uniform(2.0, 20.0), 2)
                else:
                    price = round(random.uniform(1.0, 10.0), 2)
                
                records.append({
                    "date": date,
                    "facility_id": facility["facility_id"],
                    "facility_name": facility["facility_name"],
                    "ward_id": f"W{random.randint(1, 3):02d}",
                    "sku_id": medicine["sku_id"],
                    "sku_name": medicine["sku_name"],
                    "units_used": daily_usage,
                    "on_hand": on_hand,
                    "stock_in_transit": stock_in_transit,
                    "lead_time_days": lead_time,
                    "batch_id": f"B{medicine['sku_id'][3:]}{random.randint(1000, 9999)}",
                    "batch_expiry_date": batch_expiry,
                    "price_per_unit": price,
                    "is_holiday": is_holiday,
                    "patient_admissions": patient_admissions,
                    "VED_category": medicine["VED_category"],
                    "FSN_category": medicine["FSN_category"]
                })
    
    return pd.DataFrame(records)

# Generate enhanced data
print("üìä Generating enhanced PHC inventory data...")
df = generate_enhanced_inventory_data(num_days=180)  # 6 months of rich data
print(f"‚úÖ Generated {len(df):,} records with realistic patterns")

# Save data
df.to_csv('data/sample_inventory.csv', index=False)
print("‚úÖ Saved enhanced data to: data/sample_inventory.csv")

# Save medicine master
medicine_df = pd.DataFrame(medicine_master)
medicine_df.to_csv('data/medicine_master.csv', index=False)
print("‚úÖ Saved medicine master to: data/medicine_master.csv")

# Data quality summary
print(f"\nüìà ENHANCED DATA SUMMARY:")
print(f"‚Ä¢ Total records: {len(df):,}")
print(f"‚Ä¢ Date range: {df['date'].min().date()} to {df['date'].max().date()}")
print(f"‚Ä¢ Facilities: {df['facility_id'].nunique()}")
print(f"‚Ä¢ Medicines: {df['sku_id'].nunique()}")
print(f"‚Ä¢ Total consumption: {df['units_used'].sum():,} units")

# Consumption patterns by FSN category
print(f"\nüìä CONSUMPTION PATTERNS:")
fsn_consumption = df.groupby('FSN_category')['units_used'].mean()
for fsn, avg_usage in fsn_consumption.items():
    print(f"‚Ä¢ {fsn}-moving: {avg_usage:.1f} units/day")

print(f"\nüéØ DATA ENHANCEMENTS APPLIED:")
print("‚Ä¢ Realistic seasonality patterns (monsoon spikes)")
print("‚Ä¢ Weekend/weekday effects")
print("‚Ä¢ Random outbreak simulations")
print("‚Ä¢ Facility-specific patient volumes")
print("‚Ä¢ Proper FSN category consumption patterns")
print("‚Ä¢ Realistic lead time variations")
print("‚Ä¢ Meaningful expiry date distribution")

üè• ENHANCED PHC SUPPLY CHAIN DATA GENERATOR
üìä Generating enhanced PHC inventory data...
‚úÖ Generated 6,480 records with realistic patterns
‚úÖ Saved enhanced data to: data/sample_inventory.csv
‚úÖ Saved medicine master to: data/medicine_master.csv

üìà ENHANCED DATA SUMMARY:
‚Ä¢ Total records: 6,480
‚Ä¢ Date range: 2024-01-01 to 2024-06-28
‚Ä¢ Facilities: 3
‚Ä¢ Medicines: 12
‚Ä¢ Total consumption: 76,647 units

üìä CONSUMPTION PATTERNS:
‚Ä¢ Fast-moving: 19.5 units/day
‚Ä¢ Slow-moving: 4.2 units/day

üéØ DATA ENHANCEMENTS APPLIED:
‚Ä¢ Realistic seasonality patterns (monsoon spikes)
‚Ä¢ Weekend/weekday effects
‚Ä¢ Random outbreak simulations
‚Ä¢ Facility-specific patient volumes
‚Ä¢ Proper FSN category consumption patterns
‚Ä¢ Realistic lead time variations
‚Ä¢ Meaningful expiry date distribution


In [33]:
# Load WITHOUT parsing dates first
df_raw = pd.read_csv('data/cleaned_inventory.csv')

# Find problematic date strings
problematic_dates = []
for idx, date_str in enumerate(df_raw['date']):
    try:
        pd.to_datetime(date_str)
    except Exception as e:
        problematic_dates.append((idx, date_str, str(e)))

print(f"Found {len(problematic_dates)} problematic date strings")
for idx, date_str, error in problematic_dates:
    print(f"  Row {idx}: '{date_str}' -> {error}")

  pd.to_datetime(date_str)


Found 0 problematic date strings


In [7]:
import pandas as pd

# Step 1: Load the CLEANED data (no errors argument here)
df = pd.read_csv('data/cleaned_inventory.csv')

# Step 2: Explicitly convert date columns to datetime with coercion
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)
df['batch_expiry_date'] = pd.to_datetime(df['batch_expiry_date'], errors='coerce', dayfirst=True)

# Step 3: Find rows where 'date' failed to parse (NaT)
nan_dates = df[df['date'].isna()]
print(f"Rows with unparseable dates: {len(nan_dates)}")

if len(nan_dates) > 0:
    print("\nFirst few problematic rows:")
    # Show the parsed NaT plus other context columns
    problematic_sample = df.loc[df['date'].isna(), 
                                ['date', 'batch_expiry_date', 'sku_id', 'facility_id']].head()
    print(problematic_sample)

    # Step 4: Inspect the raw strings that failed
    print("\nSample of raw date strings that failed:")
    df_raw = pd.read_csv('data/cleaned_inventory.csv')  # re-read without parsing
    problematic_indices = nan_dates.index
    for idx in problematic_indices[:10]:  # show first 10 only
        print(f"Row {idx}: '{df_raw.loc[idx, 'date']}'")

Rows with unparseable dates: 0


In [9]:
import pandas as pd

# 1. Check what's ACTUALLY in the CSV file
print("üîç CHECKING ACTUAL CSV CONTENT")
print("=" * 50)

# Read WITHOUT any date parsing
df_raw = pd.read_csv('data/cleaned_inventory.csv')
print(f"Column 'date' dtype: {df_raw['date'].dtype}")
print(f"First 5 date values: {df_raw['date'].head().tolist()}")
print(f"Last 5 date values: {df_raw['date'].tail().tolist()}")

# 2. Try parsing WITH dayfirst=True
print("\n\nüîß PARSING WITH dayfirst=True")
print("=" * 50)

df_parsed = pd.read_csv('data/cleaned_inventory.csv', parse_dates=['date'], dayfirst=True)
print(f"After parsing - 'date' dtype: {df_parsed['date'].dtype}")
print(f"Missing dates after parsing: {df_parsed['date'].isna().sum()}")

# 3. Show date range if successful
if df_parsed['date'].isna().sum() == 0:
    print(f"Date range: {df_parsed['date'].min()} to {df_parsed['date'].max()}")
    print(f"Unique dates: {df_parsed['date'].nunique()}")

üîç CHECKING ACTUAL CSV CONTENT
Column 'date' dtype: object
First 5 date values: ['01-01-2024', '01-01-2024', '01-01-2024', '01-01-2024', '01-01-2024']
Last 5 date values: ['28-06-2024', '28-06-2024', '28-06-2024', '28-06-2024', '28-06-2024']


üîß PARSING WITH dayfirst=True
After parsing - 'date' dtype: datetime64[ns]
Missing dates after parsing: 0
Date range: 2024-01-01 00:00:00 to 2024-06-28 00:00:00
Unique dates: 180


In [11]:
import pandas as pd
import numpy as np

# Read the raw sample inventory (BEFORE your cleaning)
print("üîç INVESTIGATING RAW DATA FOR MALFORMED DATES")
print("=" * 60)

df_raw = pd.read_csv('data/sample_inventory.csv')
print(f"Total rows: {len(df_raw)}")

# Check for date strings that don't match DD-MM-YYYY pattern
def is_valid_date_string(s):
    try:
        # Basic pattern check: should be exactly 10 characters, with dashes
        if not isinstance(s, str) or len(s) != 10 or s[2] != '-' or s[5] != '-':
            return False
        # Try to parse it
        pd.to_datetime(s, dayfirst=True, format='%d-%m-%Y')
        return True
    except:
        return False

# Find invalid date strings
invalid_dates = []
for idx, date_str in enumerate(df_raw['date']):
    if not is_valid_date_string(str(date_str)):
        invalid_dates.append((idx, date_str))

print(f"\nFound {len(invalid_dates)} potentially malformed date strings")

if len(invalid_dates) > 0:
    print("\nüî¥ FIRST 10 MALFORMED DATES:")
    for idx, date_str in invalid_dates[:10]:
        print(f"  Row {idx}: '{date_str}' (length: {len(str(date_str))})")
    
    # Show the full rows for context
    print("\nüìã CONTEXT FOR FIRST 5 MALFORMED ROWS:")
    malformed_indices = [idx for idx, _ in invalid_dates[:5]]
    display(df_raw.loc[malformed_indices, ['date', 'sku_id', 'facility_id']])

üîç INVESTIGATING RAW DATA FOR MALFORMED DATES
Total rows: 6480

Found 6480 potentially malformed date strings

üî¥ FIRST 10 MALFORMED DATES:
  Row 0: '2024-01-01' (length: 10)
  Row 1: '2024-01-01' (length: 10)
  Row 2: '2024-01-01' (length: 10)
  Row 3: '2024-01-01' (length: 10)
  Row 4: '2024-01-01' (length: 10)
  Row 5: '2024-01-01' (length: 10)
  Row 6: '2024-01-01' (length: 10)
  Row 7: '2024-01-01' (length: 10)
  Row 8: '2024-01-01' (length: 10)
  Row 9: '2024-01-01' (length: 10)

üìã CONTEXT FOR FIRST 5 MALFORMED ROWS:


Unnamed: 0,date,sku_id,facility_id
0,2024-01-01,MED001,FAC001
1,2024-01-01,MED002,FAC001
2,2024-01-01,MED003,FAC001
3,2024-01-01,MED004,FAC001
4,2024-01-01,MED005,FAC001
