In [None]:
# For creating Dataset used initially
import pandas as pd
import numpy as np
from datetime import timedelta

def create_synthetic_drug_demand_data(
    start_date="2021-01-01",
    end_date="2023-12-31",
    output_filename="synthetic_drug_demand_data.csv",
    seed=None,
):
    """
    Generate a synthetic daily drug-demand dataset for Kerala warehouses.

    One row is produced per (date, warehouse, drug). Demand ("Stock_Out") is
    built from a per-drug base level weighted by each district's share of
    population and health institutions, then multiplied by a linear trend, a
    day-of-week factor, a seasonal factor for selected ATC categories and
    uniform random noise. An inventory pass then derives Opening_Stock,
    Stock_In and Closing_Stock and caps Stock_Out at the stock available.

    Args:
        start_date: First simulated day (anything ``pd.date_range`` accepts).
        end_date: Last simulated day, inclusive.
        output_filename: CSV path to write. Pass ``None`` to skip writing.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so repeated calls produce
            identical data. When ``None`` the global ``np.random`` state is
            used, exactly as before.

    Returns:
        pandas.DataFrame with the columns Date, Warehouse, Drug, ATC_Category,
        Opening_Stock, Stock_In, Stock_Out and Closing_Stock, sorted by
        warehouse, drug and date.
    """

    # ==============================================================================
    # --- CONFIGURATION & LEVERS ---
    # You can change all the parameters in this section.
    # ==============================================================================

    # 1. Drugs & ATC Categories
    # ATC (Anatomical Therapeutic Chemical) classification for grouping.
    drugs_config = {
        "A: Alimentary tract and metabolism": ["Metformin", "Omeprazole", "Domperidone", "Lactulose"],
        "C: Cardiovascular system": ["Amlodipine", "Atorvastatin", "Clopidogrel", "Metoprolol"],
        "J: Antiinfectives for systemic use": ["Amoxicillin", "Doxycycline", "Azithromycin", "Ciprofloxacin"],
        "N: Nervous system": ["Paracetamol", "Diclofenac", "Gabapentin", "Sertraline"],
        "R: Respiratory system": ["Salbutamol", "Montelukast", "Cetirizine", "Budesonide"]
    }

    # 2. Warehouses & District Data (Population and Health Institutions)
    # Population and institution counts act as weights for base demand.
    # Data is approximate for demonstration purposes.
    warehouses_config = {
        "Thiruvananthapuram": {"population": 3300000, "institutions": 120},
        "Kollam":             {"population": 2630000, "institutions": 90},
        "Pathanamthitta":     {"population": 1190000, "institutions": 60},
        "Alappuzha":          {"population": 2120000, "institutions": 85},
        "Kottayam":           {"population": 1970000, "institutions": 75},
        "Idukki":             {"population": 1100000, "institutions": 55},
        "Ernakulam":          {"population": 3280000, "institutions": 130},
        "Thrissur":           {"population": 3120000, "institutions": 110},
        "Palakkad":           {"population": 2810000, "institutions": 95},
        "Malappuram":         {"population": 4110000, "institutions": 140},
        "Kozhikode":          {"population": 3080000, "institutions": 115},
        "Wayanad":            {"population": 817000,  "institutions": 50},
        "Kannur":             {"population": 2520000, "institutions": 100},
        "Kasaragod":          {"population": 1300000, "institutions": 65},
    }

    # Random source: the legacy global state by default (so an outer
    # np.random.seed(...) keeps working), or a private seeded generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # 3. Demand Simulation Parameters
    base_demand_per_drug = {drug: rng.randint(50, 150) for category in drugs_config.values() for drug in category}
    overall_trend_multiplier = 0.0005  # Slight daily upward trend (0.05% per day)
    weekly_demand_factor = {0: 1.0, 1: 1.1, 2: 1.05, 3: 1.15, 4: 0.95, 5: 0.7, 6: 0.65}  # Mon=0, Sun=6
    random_noise_factor = 0.15  # 15% random fluctuation

    # 4. Seasonality Configuration
    monsoon_months = [6, 7, 8, 9]  # June to September
    summer_months = [3, 4, 5]      # March to May

    # Define which drug categories are affected by seasons
    monsoon_impact_categories = ["J: Antiinfectives for systemic use", "R: Respiratory system"]
    summer_impact_categories = ["A: Alimentary tract and metabolism"]  # e.g., for dehydration, digestion issues
    seasonality_impact_multiplier = 1.6  # Demand increases by 60% during peak season

    # 5. Inventory Management Parameters
    initial_stock_multiplier = 60      # Initial stock is 60 days of average base demand
    monthly_stock_in_days_target = 45  # Each monthly delivery covers 45 days of early demand

    # ==============================================================================
    # --- DATA GENERATION LOGIC ---
    # You generally don't need to change the code below.
    # ==============================================================================

    print("Starting synthetic data generation...")

    # Flatten drug list and create drug-to-category mapping
    all_drugs = [drug for category_drugs in drugs_config.values() for drug in category_drugs]
    drug_to_category_map = {drug: category for category, drugs in drugs_config.items() for drug in drugs}

    dates = pd.date_range(start=start_date, end=end_date, freq='D')

    # Cartesian product in warehouse -> drug -> date order, i.e. the same row
    # order the random noise is drawn against.
    df = pd.MultiIndex.from_product(
        [list(warehouses_config), all_drugs, dates],
        names=['Warehouse', 'Drug', 'Date'],
    ).to_frame(index=False)[['Date', 'Warehouse', 'Drug']]
    df['ATC_Category'] = df['Drug'].map(drug_to_category_map)

    # --- 1. Simulate Demand (Stock Out) ---
    print("Simulating demand (Stock Out)...")

    # a. Base demand weighted by the district's share of population and institutions
    total_population = sum(w['population'] for w in warehouses_config.values())
    total_institutions = sum(w['institutions'] for w in warehouses_config.values())
    pop_share = {name: w['population'] / total_population for name, w in warehouses_config.items()}
    inst_share = {name: w['institutions'] / total_institutions for name, w in warehouses_config.items()}

    pop_factor = df['Warehouse'].map(pop_share)
    inst_factor = df['Warehouse'].map(inst_share)
    df['base_demand'] = df['Drug'].map(base_demand_per_drug) * (pop_factor + inst_factor) / 2

    # b. Linear upward trend measured in days since the first simulated date
    trend_factor = (df['Date'] - df['Date'].min()).dt.days * overall_trend_multiplier

    # c. Weekly seasonality
    weekly_factor = df['Date'].dt.dayofweek.map(weekly_demand_factor)

    # d. Annual seasonality (Monsoon/Summer) for the affected categories only
    month = df['Date'].dt.month
    in_monsoon_peak = month.isin(monsoon_months) & df['ATC_Category'].isin(monsoon_impact_categories)
    in_summer_peak = month.isin(summer_months) & df['ATC_Category'].isin(summer_impact_categories)
    seasonal_factor = np.where(in_monsoon_peak | in_summer_peak, seasonality_impact_multiplier, 1.0)

    # e. Final (uncapped) demand with multiplicative noise
    expected_demand = df['base_demand'] * (1 + trend_factor) * weekly_factor * seasonal_factor
    random_noise = rng.uniform(1 - random_noise_factor, 1 + random_noise_factor, size=len(df))
    df['Stock_Out'] = np.round(expected_demand * random_noise).astype(int).clip(lower=0)

    # --- 2. Simulate Inventory Flow ---
    print("Simulating inventory flow (Stock In, Opening/Closing Stock)...")
    df = df.sort_values(['Warehouse', 'Drug', 'Date']).reset_index(drop=True)

    # Work on plain arrays: per-cell df.loc access is far slower than this
    # while producing the same numbers.
    demand_all = df['Stock_Out'].to_numpy()
    base_all = df['base_demand'].to_numpy()
    restock_day_all = (df['Date'].dt.day == 1).to_numpy()

    row_count = len(df)
    opening_all = np.zeros(row_count, dtype=np.int64)
    stock_in_all = np.zeros(row_count, dtype=np.int64)
    issued_all = np.zeros(row_count, dtype=np.int64)
    closing_all = np.zeros(row_count, dtype=np.int64)

    for _, item_rows in df.groupby(['Warehouse', 'Drug'], sort=False):
        # The index was reset above, so labels double as row positions, and
        # groupby keeps each item's rows in chronological order.
        positions = item_rows.index.to_numpy()
        item_demand = demand_all[positions]

        initial_stock = int(base_all[positions[0]] * initial_stock_multiplier)

        # Replenishment size is fixed per item from the FIRST 30 days of
        # uncapped demand and reused for every monthly delivery.
        avg_demand_first_30_days = item_demand[:30].mean()
        stock_in_qty = int(avg_demand_first_30_days * monthly_stock_in_days_target)

        opening, received, issued, closing = _simulate_item_inventory(
            item_demand.tolist(),
            restock_day_all[positions].tolist(),
            initial_stock,
            stock_in_qty,
        )
        opening_all[positions] = opening
        stock_in_all[positions] = received
        issued_all[positions] = issued
        closing_all[positions] = closing

    df['Opening_Stock'] = opening_all
    df['Stock_In'] = stock_in_all
    df['Stock_Out'] = issued_all
    df['Closing_Stock'] = closing_all

    # --- 3. Final Cleanup ---
    print("Cleaning up and saving the file...")
    final_cols = [
        'Date', 'Warehouse', 'Drug', 'ATC_Category', 'Opening_Stock',
        'Stock_In', 'Stock_Out', 'Closing_Stock'
    ]
    df = df[final_cols]

    if output_filename is not None:
        df.to_csv(output_filename, index=False)
        print(f"\nSuccessfully generated synthetic data and saved it to '{output_filename}'")
    else:
        print("\nSuccessfully generated synthetic data (not saved to disk)")
    print(f"Total rows generated: {len(df)}")
    print("\nData Head:")
    print(df.head())
    print("\nData Tail:")
    print(df.tail())

    return df


def _simulate_item_inventory(demand, restock_days, initial_stock, stock_in_qty):
    """Walk one warehouse/drug timeline and return its daily stock movements.

    Args:
        demand: Requested units per day, in chronological order.
        restock_days: Booleans aligned with ``demand``; True on days a
            delivery of ``stock_in_qty`` arrives.
        initial_stock: Units on hand before the first day.
        stock_in_qty: Units received on each restock day.

    Returns:
        Four equally long lists: opening stock, stock received, units
        actually issued (demand capped at available stock) and closing stock.
    """
    opening, received, issued, closing = [], [], [], []
    on_hand = initial_stock
    for wanted, is_restock_day in zip(demand, restock_days):
        delivery = stock_in_qty if is_restock_day else 0
        available = on_hand + delivery
        shipped = min(wanted, available)  # cannot issue more than is on the shelf

        opening.append(on_hand)
        received.append(delivery)
        issued.append(shipped)

        on_hand = available - shipped  # carried into the next day's opening stock
        closing.append(on_hand)
    return opening, received, issued, closing

# Generate the dataset only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    create_synthetic_drug_demand_data()


In [5]:
# For creating updated dataset with financial KPIs
import pandas as pd
import numpy as np
from datetime import timedelta

def create_enhanced_synthetic_data(
    start_date="2021-01-01",
    end_date="2023-12-31",
    output_filename="synthetic_drug_demand_data_enhanced.csv",
    seed=None,
):
    """
    Generates an enhanced synthetic dataset for drug demand forecasting.
    VERSION 3: Rounds all financial value columns to 2 decimal places at the source.

    One row is produced per (date, warehouse, drug). Stock quantities are
    placeholder values drawn independently per row (there is no day-to-day
    carry-over), and each row is priced with a fixed per-drug unit price.

    Args:
        start_date: First generated day (anything ``pd.date_range`` accepts).
        end_date: Last generated day, inclusive.
        output_filename: CSV path to write. Pass ``None`` to skip writing.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so repeated calls produce
            identical data. When ``None`` the global ``np.random`` state is
            used, exactly as before.

    Returns:
        pandas.DataFrame with the columns written to the CSV: Date,
        Warehouse, Drug, ATC_Category, the four stock columns, Unit_Price and
        the three Value_* columns.
    """

    # ==============================================================================
    # --- CONFIGURATION ---
    # ==============================================================================
    drugs_config = {
        "J: Antiinfectives for systemic use": ["Amoxicillin", "Doxycycline", "Ciprofloxacin", "Azithromycin"],
        "C: Cardiovascular system": ["Amlodipine", "Atorvastatin", "Metoprolol", "Lisinopril"],
        "N: Nervous system": ["Paracetamol", "Ibuprofen", "Sertraline", "Escitalopram"],
        "R: Respiratory system": ["Salbutamol", "Montelukast", "Cetirizine", "Loratadine"],
        "A: Alimentary tract and metabolism": ["Metformin", "Omeprazole", "Ranitidine", "Gliclazide"]
    }
    drug_prices = {
        'Amoxicillin': 2.50, 'Doxycycline': 4.00, 'Ciprofloxacin': 3.75, 'Azithromycin': 8.50,
        'Amlodipine': 1.50, 'Atorvastatin': 5.00, 'Metoprolol': 2.20, 'Lisinopril': 1.80,
        'Paracetamol': 0.75, 'Ibuprofen': 1.25, 'Sertraline': 10.00, 'Escitalopram': 12.00,
        'Salbutamol': 3.50, 'Montelukast': 7.00, 'Cetirizine': 0.50, 'Loratadine': 0.60,
        'Metformin': 1.00, 'Omeprazole': 2.80, 'Ranitidine': 1.50, 'Gliclazide': 3.00
    }
    # Only the warehouse names are used below; the population and institution
    # figures are not read by this version's placeholder logic.
    warehouses_data = {
        'Thiruvananthapuram': {'population': 3301427, 'institutions': 90}, 'Kollam': {'population': 2635375, 'institutions': 75},
        'Pathanamthitta': {'population': 1197412, 'institutions': 50}, 'Alappuzha': {'population': 2127789, 'institutions': 65},
        'Kottayam': {'population': 1974551, 'institutions': 60}, 'Idukki': {'population': 1108974, 'institutions': 45},
        'Ernakulam': {'population': 3282388, 'institutions': 88}, 'Thrissur': {'population': 3121200, 'institutions': 85},
        'Palakkad': {'population': 2810892, 'institutions': 80}, 'Malappuram': {'population': 4112920, 'institutions': 95},
        'Kozhikode': {'population': 3086293, 'institutions': 82}, 'Wayanad': {'population': 817420, 'institutions': 40},
        'Kannur': {'population': 2523003, 'institutions': 78}, 'Kasaragod': {'population': 1307375, 'institutions': 48}
    }

    # Random source: the legacy global state by default (so an outer
    # np.random.seed(...) keeps working), or a private seeded generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # ==============================================================================
    # --- DATA GENERATION (Placeholder Logic) ---
    # ==============================================================================
    print("Generating base data frame...")
    date_range = pd.date_range(start=start_date, end=end_date)
    drugs_df = pd.DataFrame(
        [{'Drug': drug, 'ATC_Category': cat} for cat, drugs in drugs_config.items() for drug in drugs]
    )
    warehouses_list = list(warehouses_data)
    df = pd.MultiIndex.from_product(
        [date_range, warehouses_list, drugs_df['Drug']],
        names=['Date', 'Warehouse', 'Drug'],
    ).to_frame(index=False)
    df = pd.merge(df, drugs_df, on='Drug')

    # Independent draws per row; Stock_Out is drawn before Opening_Stock so a
    # given random state yields the same numbers as earlier versions.
    df['Stock_Out'] = rng.randint(5, 50, size=len(df))
    df['Opening_Stock'] = rng.randint(500, 2000, size=len(df))
    df['Stock_In'] = 0
    df['Closing_Stock'] = df['Opening_Stock'] - df['Stock_Out']

    # ==============================================================================
    # --- ENRICHING DATA & FINALIZING ---
    # ==============================================================================
    print("Enriching data with financial information...")
    prices_df = pd.DataFrame(list(drug_prices.items()), columns=['Drug', 'Unit_Price'])
    df = pd.merge(df, prices_df, on='Drug', how='left')

    # Financial columns are rounded to 2 decimal places at the source.
    for units_col in ('Stock_Out', 'Opening_Stock', 'Closing_Stock'):
        df[f'Value_{units_col}'] = (df[units_col] * df['Unit_Price']).round(2)

    stock_cols = ['Opening_Stock', 'Stock_In', 'Stock_Out', 'Closing_Stock']
    df[stock_cols] = df[stock_cols].astype(int)

    print("Finalizing the dataset...")
    final_cols = [
        'Date', 'Warehouse', 'Drug', 'ATC_Category', 'Opening_Stock', 'Stock_In', 'Stock_Out', 'Closing_Stock',
        'Unit_Price', 'Value_Stock_Out', 'Value_Opening_Stock', 'Value_Closing_Stock'
    ]
    result = df[final_cols]

    if output_filename is not None:
        result.to_csv(output_filename, index=False)
        print(f"\nSuccessfully generated clean, formatted data and saved to '{output_filename}'")
    else:
        print("\nSuccessfully generated clean, formatted data (not saved to disk)")

    return result
    
# Generate the enhanced dataset only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    create_enhanced_synthetic_data()



Generating base data frame...
Enriching data with financial information...
Finalizing the dataset...

Successfully generated clean, formatted data and saved to 'synthetic_drug_demand_data_enhanced.csv'


In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

def create_enhanced_synthetic_data(
    start_date="2022-01-01",
    end_date="2025-09-20",
    output_filename="synthetic_drug_demand_data_enhanced.csv",
    seed=None,
):
    """
    Generates an enhanced synthetic dataset for drug demand forecasting.
    VERSION 4: Adds more realistic variance between warehouses based on population.

    One row is produced per (date, warehouse, drug). Demand and opening stock
    are random draws scaled by a per-warehouse multiplier (the warehouse's
    population share times the number of warehouses, so the multipliers
    average 1). Stock quantities are converted to whole units BEFORE the
    Value_* columns are computed, so every exported value equals the
    exported quantity times Unit_Price, rounded to 2 decimals.

    Args:
        start_date: First generated day (anything ``pd.date_range`` accepts).
        end_date: Last generated day, inclusive.
        output_filename: CSV path to write. Pass ``None`` to skip writing.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used so repeated calls produce
            identical data. When ``None`` the global ``np.random`` state is
            used, exactly as before.

    Returns:
        pandas.DataFrame holding exactly what is written to the CSV.
    """
    print("Starting enhanced data generation (v4)...")
    # ==============================================================================
    # --- CONFIGURATION ---
    # ==============================================================================
    drugs_config = {
        "J: Antiinfectives for systemic use": ["Amoxicillin", "Doxycycline", "Ciprofloxacin", "Azithromycin"],
        "C: Cardiovascular system": ["Amlodipine", "Atorvastatin", "Metoprolol", "Lisinopril"],
        "N: Nervous system": ["Paracetamol", "Ibuprofen", "Sertraline", "Escitalopram"],
        "R: Respiratory system": ["Salbutamol", "Montelukast", "Cetirizine", "Loratadine"],
        "A: Alimentary tract and metabolism": ["Metformin", "Omeprazole", "Ranitidine", "Gliclazide"]
    }
    drug_prices = {
        'Amoxicillin': 2.50, 'Doxycycline': 4.00, 'Ciprofloxacin': 3.75, 'Azithromycin': 8.50,
        'Amlodipine': 1.50, 'Atorvastatin': 5.00, 'Metoprolol': 2.20, 'Lisinopril': 1.80,
        'Paracetamol': 0.75, 'Ibuprofen': 1.25, 'Sertraline': 10.00, 'Escitalopram': 12.00,
        'Salbutamol': 3.50, 'Montelukast': 7.00, 'Cetirizine': 0.50, 'Loratadine': 0.60,
        'Metformin': 1.00, 'Omeprazole': 2.80, 'Ranitidine': 1.50, 'Gliclazide': 3.00
    }
    warehouses_data = {
        'Thiruvananthapuram': {'population': 3301427, 'institutions': 90}, 'Kollam': {'population': 2635375, 'institutions': 75},
        'Pathanamthitta': {'population': 1197412, 'institutions': 50}, 'Alappuzha': {'population': 2127789, 'institutions': 65},
        'Kottayam': {'population': 1974551, 'institutions': 60}, 'Idukki': {'population': 1108974, 'institutions': 45},
        'Ernakulam': {'population': 3282388, 'institutions': 88}, 'Thrissur': {'population': 3121200, 'institutions': 85},
        'Palakkad': {'population': 2810892, 'institutions': 80}, 'Malappuram': {'population': 4112920, 'institutions': 95},
        'Kozhikode': {'population': 3086293, 'institutions': 82}, 'Wayanad': {'population': 817420, 'institutions': 40},
        'Kannur': {'population': 2523003, 'institutions': 78}, 'Kasaragod': {'population': 1307375, 'institutions': 48}
    }

    # Random source: the legacy global state by default (so an outer
    # np.random.seed(...) keeps working), or a private seeded generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # ==============================================================================
    # --- DATA GENERATION with MORE REALISM ---
    # ==============================================================================
    print("Generating base data frame with realistic variance...")
    date_range = pd.date_range(start=start_date, end=end_date)
    drugs_df = pd.DataFrame(
        [{'Drug': drug, 'ATC_Category': cat} for cat, drugs in drugs_config.items() for drug in drugs]
    )

    # Create the base structure
    warehouses_list = list(warehouses_data)
    df = pd.MultiIndex.from_product(
        [date_range, warehouses_list, drugs_df['Drug']],
        names=['Date', 'Warehouse', 'Drug'],
    ).to_frame(index=False)
    df = pd.merge(df, drugs_df, on='Drug')

    # Warehouse multiplier: population share scaled by the warehouse count.
    total_population = sum(w['population'] for w in warehouses_data.values())
    warehouse_multipliers = {
        name: (data['population'] / total_population) * len(warehouses_data)
        for name, data in warehouses_data.items()
    }
    wh_multiplier = df['Warehouse'].map(warehouse_multipliers)

    # Demand is drawn before opening stock so a given random state yields the
    # same numbers as earlier versions.
    base_demand = rng.randint(20, 80, size=len(df))
    df['Stock_Out'] = (base_demand * wh_multiplier).astype(int)

    # (Simplified inventory logic for this script)
    # Truncate to whole units here, not after pricing: otherwise the Value_*
    # columns are computed from fractional stock and no longer reconcile with
    # the integer quantities that end up in the file.
    df['Opening_Stock'] = (rng.randint(500, 2000, size=len(df)) * wh_multiplier).astype(int)
    df['Stock_In'] = 0
    df['Closing_Stock'] = df['Opening_Stock'] - df['Stock_Out']

    # ==============================================================================
    # --- ENRICHING DATA & FINALIZING ---
    # ==============================================================================
    print("Enriching data and finalizing...")
    prices_df = pd.DataFrame(list(drug_prices.items()), columns=['Drug', 'Unit_Price'])
    df = pd.merge(df, prices_df, on='Drug', how='left')

    for units_col in ('Stock_Out', 'Opening_Stock', 'Closing_Stock'):
        df[f'Value_{units_col}'] = (df[units_col] * df['Unit_Price']).round(2)

    stock_cols = ['Opening_Stock', 'Stock_In', 'Stock_Out', 'Closing_Stock']
    df[stock_cols] = df[stock_cols].astype(int)

    if output_filename is not None:
        df.to_csv(output_filename, index=False)
        print(f"\nSuccessfully generated new enhanced data and saved to '{output_filename}'")
    else:
        print("\nSuccessfully generated new enhanced data (not saved to disk)")

    return df
    
# Generate the v4 enhanced dataset only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    create_enhanced_synthetic_data()



Starting enhanced data generation (v4)...
Generating base data frame with realistic variance...
Enriching data and finalizing...

Successfully generated new enhanced data and saved to 'synthetic_drug_demand_data_enhanced.csv'
