In [1]:
"""
HYFUN FOODS - COMPLETE DATA GENERATION SCRIPT
Generates 3 years of realistic business data (2022-2024)

Author: Data Analytics Project
Date: December 2024
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from faker import Faker
import random
import warnings
warnings.filterwarnings('ignore')

# Initialize
fake = Faker('en_IN')
random.seed(42)
np.random.seed(42)

# Date range
START_DATE = datetime(2022, 1, 1)
END_DATE = datetime(2024, 12, 31)
date_range = pd.date_range(START_DATE, END_DATE, freq='D')

print("=" * 70)
print("HYFUN FOODS - DATA GENERATION STARTED")
print("=" * 70)

# ============================================================================
# 1. FARMERS MASTER DATA
# ============================================================================
print("\n[1/12] Generating Farmers Master Data...")

regions = ['North Gujarat', 'South Gujarat', 'Central Gujarat', 'Saurashtra', 'Kutch']
num_farmers = 500

farmers_data = {
    'farmer_id': [f'F{str(i).zfill(4)}' for i in range(1, num_farmers + 1)],
    'farmer_name': [fake.name() for _ in range(num_farmers)],
    'region': [random.choice(regions) for _ in range(num_farmers)],
    'farm_size_acres': np.random.randint(2, 50, num_farmers),
    'contract_start_date': [fake.date_between(start_date='-5y', end_date='-1y') for _ in range(num_farmers)],
    'experience_years': np.random.randint(3, 25, num_farmers),
    'contact_number': [fake.phone_number() for _ in range(num_farmers)],
    'bank_account': [fake.iban() for _ in range(num_farmers)]
}

df_farmers = pd.DataFrame(farmers_data)
print(f"‚úì Generated {len(df_farmers)} farmers across {len(regions)} regions")

# ============================================================================
# 2. POTATO PROCUREMENT DATA
# ============================================================================
print("\n[2/12] Generating Potato Procurement Data...")

potato_varieties = ['FL-2027', 'FL-1867', '3797', 'Kufri Chipsona']
quality_grades = ['Premium', 'Grade A', 'Grade B', 'Grade C']
procurement_records = []

for farmer_id in df_farmers['farmer_id']:
    num_deliveries = random.randint(4, 12)  # 4-12 deliveries per year per farmer
    
    for _ in range(num_deliveries):
        # Seasonal pattern - more procurement in Oct-Mar (harvest season)
        date = random.choice(date_range)
        month = date.month
        
        # Higher quantity during harvest months
        if month in [10, 11, 12, 1, 2, 3]:
            quantity = np.random.uniform(5, 30)  # MT
            price = np.random.uniform(15000, 22000)  # INR per MT
        else:
            quantity = np.random.uniform(2, 10)
            price = np.random.uniform(20000, 28000)  # Higher prices off-season
        
        quality = random.choices(quality_grades, weights=[0.25, 0.40, 0.25, 0.10])[0]
        
        procurement_records.append({
            'batch_id': f'PB{datetime.now().strftime("%Y%m%d")}{random.randint(1000, 9999)}',
            'farmer_id': farmer_id,
            'procurement_date': date,
            'quantity_mt': round(quantity, 2),
            'variety': random.choice(potato_varieties),
            'quality_grade': quality,
            'price_per_mt': round(price, 2),
            'moisture_content': round(np.random.uniform(70, 82), 1),
            'defect_percentage': round(np.random.uniform(1, 8), 1)
        })

df_procurement = pd.DataFrame(procurement_records)
print(f"‚úì Generated {len(df_procurement)} procurement records")

# ============================================================================
# 3. PRODUCTION BATCHES DATA
# ============================================================================
print("\n[3/12] Generating Production Data...")

plants = ['Ahmedabad Plant 1', 'Ahmedabad Plant 2', 'Rajkot Plant']
product_skus = [
    'HF-FF-SHOESTRING-2.5KG', 'HF-FF-CRINKLE-2.5KG', 'HF-FF-WEDGES-2.5KG',
    'HF-PATTY-ALOO-1KG', 'HF-PATTY-BURGER-1KG', 'HF-PATTY-CUTLET-1KG',
    'HF-HASH-BROWN-1KG', 'HF-POTATO-SMILES-500G', 'HF-POTATO-CHEESE-BALLS-500G',
    'HF-ALOO-TIKKI-1KG'
]

shifts = ['Morning', 'Evening', 'Night']
production_records = []

for date in date_range:
    num_batches = random.randint(8, 15)  # Daily production batches
    
    for _ in range(num_batches):
        raw_material = round(np.random.uniform(2, 8), 2)  # MT
        conversion_rate = np.random.uniform(0.75, 0.88)  # 75-88% yield
        finished_goods = round(raw_material * conversion_rate, 2)
        
        production_records.append({
            'batch_id': f'PR{date.strftime("%Y%m%d")}{random.randint(100, 999)}',
            'production_date': date,
            'product_sku': random.choice(product_skus),
            'raw_material_used_mt': raw_material,
            'finished_goods_mt': finished_goods,
            'plant_location': random.choice(plants),
            'shift': random.choice(shifts),
            'operator_id': f'OP{random.randint(100, 250)}',
            'temperature_celsius': round(np.random.uniform(-18, -20), 1),
            'processing_time_hours': round(np.random.uniform(2, 6), 1)
        })

df_production = pd.DataFrame(production_records)
print(f"‚úì Generated {len(df_production)} production batches")

# ============================================================================
# 4. QUALITY CONTROL DATA
# ============================================================================
print("\n[4/12] Generating Quality Control Data...")

qc_records = []

for idx, batch in df_production.iterrows():
    # Not all batches get QC (random sampling)
    if random.random() < 0.3:  # 30% QC rate
        qc_records.append({
            'qc_id': f'QC{idx + 1:06d}',
            'batch_id': batch['batch_id'],
            'inspection_date': batch['production_date'] + timedelta(hours=random.randint(1, 8)),
            'moisture_level': round(np.random.uniform(72, 80), 1),
            'oil_content': round(np.random.uniform(3, 8), 1),
            'defect_rate': round(np.random.uniform(0.5, 5), 2),
            'brc_compliance_score': random.randint(85, 100),
            'inspector_name': fake.name(),
            'status': random.choices(['Approved', 'Rejected'], weights=[0.95, 0.05])[0]
        })

df_qc = pd.DataFrame(qc_records)
print(f"‚úì Generated {len(df_qc)} quality control records")

# ============================================================================
# 5. MACHINE DOWNTIME DATA
# ============================================================================
print("\n[5/12] Generating Machine Downtime Data...")

machines = [f'Machine-{i}' for i in range(1, 26)]  # 25 machines
downtime_reasons = ['Scheduled Maintenance', 'Breakdown', 'Power Failure', 
                    'Raw Material Shortage', 'Operator Error', 'Cleaning']

downtime_records = []

for date in date_range:
    # Random downtime events (1-3 per day)
    num_events = random.randint(0, 3)
    
    for _ in range(num_events):
        start_time = datetime.combine(date, datetime.min.time()) + timedelta(hours=random.randint(0, 23))
        duration_hours = np.random.uniform(0.5, 8)
        end_time = start_time + timedelta(hours=duration_hours)
        
        production_loss = round(np.random.uniform(0.5, 3), 2)  # MT lost
        
        downtime_records.append({
            'downtime_id': f'DT{len(downtime_records) + 1:06d}',
            'machine_id': random.choice(machines),
            'plant_location': random.choice(plants),
            'start_time': start_time,
            'end_time': end_time,
            'duration_hours': round(duration_hours, 2),
            'reason': random.choice(downtime_reasons),
            'production_loss_mt': production_loss,
            'repair_cost_inr': round(np.random.uniform(5000, 50000), 2)
        })

df_downtime = pd.DataFrame(downtime_records)
print(f"‚úì Generated {len(df_downtime)} downtime events")

# ============================================================================
# 6. WASTAGE TRACKING DATA
# ============================================================================
print("\n[6/12] Generating Wastage Data...")

wastage_types = ['Processing Waste', 'Quality Rejection', 'Spillage', 'Expired Stock']
wastage_records = []

for idx, batch in df_production.iterrows():
    if random.random() < 0.15:  # 15% batches have wastage
        wastage_qty = round(batch['finished_goods_mt'] * np.random.uniform(0.02, 0.10) * 1000, 2)  # kg
        
        wastage_records.append({
            'wastage_id': f'WS{idx + 1:06d}',
            'batch_id': batch['batch_id'],
            'wastage_date': batch['production_date'],
            'wastage_type': random.choice(wastage_types),
            'quantity_kg': wastage_qty,
            'recovery_possible': random.choice(['Yes', 'No']),
            'cost_impact_inr': round(wastage_qty * np.random.uniform(80, 150), 2)
        })

df_wastage = pd.DataFrame(wastage_records)
print(f"‚úì Generated {len(df_wastage)} wastage records")

# ============================================================================
# 7. B2B CUSTOMERS DATA
# ============================================================================
print("\n[7/12] Generating B2B Customers Data...")

customer_types = ['QSR Chain', 'Food Service', 'Distributor', 'Retail Chain', 'Export Client']
countries = ['India', 'UAE', 'Saudi Arabia', 'Oman', 'Kuwait', 'Qatar', 'Bahrain',
             'Singapore', 'Malaysia', 'Bangladesh', 'Nepal', 'Sri Lanka',
             'Kenya', 'South Africa', 'UK']

b2b_customers = []

for i in range(1, 201):  # 200 B2B customers
    customer_country = random.choice(countries)
    
    b2b_customers.append({
        'customer_id': f'C{str(i).zfill(5)}',
        'company_name': fake.company(),
        'customer_type': random.choice(customer_types),
        'country': customer_country,
        'city': fake.city(),
        'onboarding_date': fake.date_between(start_date='-5y', end_date='-1y'),
        'credit_limit_inr': random.randint(500000, 50000000),
        'credit_period_days': random.choice([30, 45, 60, 90]),
        'primary_contact': fake.name(),
        'email': fake.company_email(),
        'phone': fake.phone_number()
    })

df_b2b_customers = pd.DataFrame(b2b_customers)
print(f"‚úì Generated {len(df_b2b_customers)} B2B customers")

# ============================================================================
# 8. B2B ORDERS DATA
# ============================================================================
print("\n[8/12] Generating B2B Orders Data...")

payment_statuses = ['Paid', 'Pending', 'Delayed', 'Partial']
b2b_orders = []

for customer in df_b2b_customers.itertuples():
    # Each customer places 5-30 orders over 3 years
    num_orders = random.randint(5, 30)
    
    for _ in range(num_orders):
        order_date = random.choice(date_range)
        delivery_days = random.randint(3, 15)
        delivery_date = order_date + timedelta(days=delivery_days)
        
        product_sku = random.choice(product_skus)
        quantity_kg = round(np.random.uniform(500, 5000), 2)
        
        # Export customers pay more
        if customer.country != 'India':
            unit_price = round(np.random.uniform(120, 180), 2)
        else:
            unit_price = round(np.random.uniform(90, 130), 2)
        
        b2b_orders.append({
            'order_id': f'ORD{len(b2b_orders) + 1:08d}',
            'customer_id': customer.customer_id,
            'order_date': order_date,
            'product_sku': product_sku,
            'quantity_kg': quantity_kg,
            'unit_price_inr': unit_price,
            'total_value_inr': round(quantity_kg * unit_price, 2),
            'delivery_date': delivery_date,
            'payment_status': random.choices(payment_statuses, weights=[0.70, 0.15, 0.10, 0.05])[0],
            'invoice_number': f'INV{random.randint(100000, 999999)}',
            'payment_terms': f'Net {customer.credit_period_days}'
        })

df_b2b_orders = pd.DataFrame(b2b_orders)
print(f"‚úì Generated {len(df_b2b_orders)} B2B orders")

# ============================================================================
# 9. EXPORT SHIPMENTS DATA
# ============================================================================
print("\n[9/12] Generating Export Shipments Data...")

shipping_methods = ['Air Cargo', 'Sea Freight', 'Road Transport']
export_shipments = []

export_orders = df_b2b_orders.merge(df_b2b_customers[['customer_id', 'country']], on='customer_id')
export_orders = export_orders[export_orders['country'] != 'India']

for order in export_orders.itertuples():
    departure_date = order.delivery_date - timedelta(days=random.randint(5, 20))
    
    if order.country in ['UAE', 'Saudi Arabia', 'Oman', 'Kuwait', 'Qatar', 'Bahrain']:
        transit_days = random.randint(3, 8)
    elif order.country in ['Singapore', 'Malaysia', 'Bangladesh']:
        transit_days = random.randint(5, 12)
    else:
        transit_days = random.randint(10, 25)
    
    arrival_date = departure_date + timedelta(days=transit_days)
    
    export_shipments.append({
        'shipment_id': f'SHIP{len(export_shipments) + 1:07d}',
        'order_id': order.order_id,
        'destination_country': order.country,
        'shipping_method': random.choice(shipping_methods),
        'departure_date': departure_date,
        'arrival_date': arrival_date,
        'transit_days': transit_days,
        'customs_cleared': random.choices(['Yes', 'No', 'Pending'], weights=[0.85, 0.05, 0.10])[0],
        'shipping_cost_inr': round(np.random.uniform(50000, 300000), 2),
        'container_number': f'CONT{random.randint(100000, 999999)}'
    })

df_export_shipments = pd.DataFrame(export_shipments)
print(f"‚úì Generated {len(df_export_shipments)} export shipments")

# ============================================================================
# 10. B2C SALES DATA
# ============================================================================
print("\n[10/12] Generating B2C Sales Data...")

b2c_cities = ['Ahmedabad', 'Surat', 'Vadodara', 'Rajkot', 'Gandhinagar', 'Bhavnagar', 'Anand']
retail_channels = ['Modern Trade', 'Kirana Stores', 'Online Platform', 'Own Stores']
b2c_products = product_skus[:8]  # First 8 products for retail

b2c_sales = []

for date in date_range:
    # Daily transactions across cities
    num_transactions = random.randint(50, 200)
    
    for _ in range(num_transactions):
        product = random.choice(b2c_products)
        city = random.choice(b2c_cities)
        
        # City-wise demand variation
        if city in ['Ahmedabad', 'Surat']:
            quantity = random.randint(1, 5)
        else:
            quantity = random.randint(1, 3)
        
        mrp = round(np.random.uniform(150, 450), 2)
        discount = round(np.random.uniform(0, 25), 2)
        
        b2c_sales.append({
            'transaction_id': f'TXN{len(b2c_sales) + 1:09d}',
            'sale_date': date,
            'city': city,
            'product_sku': product,
            'quantity_units': quantity,
            'mrp': mrp,
            'discount_percent': discount,
            'final_price': round(mrp * (1 - discount/100), 2),
            'channel': random.choice(retail_channels),
            'customer_type': random.choice(['Regular', 'New', 'Loyalty Member'])
        })

df_b2c_sales = pd.DataFrame(b2c_sales)
print(f"‚úì Generated {len(df_b2c_sales)} B2C transactions")

# ============================================================================
# 11. PRODUCT MASTER DATA
# ============================================================================
print("\n[11/12] Generating Product Master Data...")

product_master = {
    'product_sku': product_skus,
    'product_name': [
        'French Fries Shoestring 2.5kg', 'French Fries Crinkle Cut 2.5kg', 'French Fries Wedges 2.5kg',
        'Aloo Patty 1kg', 'Burger Patty 1kg', 'Veg Cutlet 1kg',
        'Hash Browns 1kg', 'Potato Smiles 500g', 'Cheese Potato Balls 500g', 'Aloo Tikki 1kg'
    ],
    'category': ['French Fries', 'French Fries', 'French Fries',
                 'Patties', 'Patties', 'Patties',
                 'Specialty', 'Specialty', 'Specialty', 'Specialty'],
    'weight_kg': [2.5, 2.5, 2.5, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5, 1.0],
    'cost_price_inr': [180, 190, 195, 95, 110, 100, 105, 60, 75, 85],
    'b2b_price_inr': [240, 255, 260, 130, 150, 135, 145, 85, 105, 115],
    'b2c_mrp_inr': [349, 379, 399, 199, 229, 209, 219, 129, 159, 179],
    'launch_date': [
        datetime(2019, 3, 15), datetime(2019, 3, 15), datetime(2020, 1, 10),
        datetime(2018, 6, 20), datetime(2019, 8, 5), datetime(2020, 2, 14),
        datetime(2021, 4, 1), datetime(2020, 10, 15), datetime(2021, 7, 20), datetime(2022, 1, 10)
    ]
}

df_products = pd.DataFrame(product_master)
print(f"‚úì Generated {len(df_products)} product records")

# ============================================================================
# 12. REVENUE SUMMARY DATA
# ============================================================================
print("\n[12/12] Generating Revenue Summary Data...")

revenue_data = []

# Aggregate from B2B and B2C
for date in pd.date_range(START_DATE, END_DATE, freq='D'):
    # B2B revenue
    b2b_day = df_b2b_orders[df_b2b_orders['order_date'] == date]
    if len(b2b_day) > 0:
        for category in df_products['category'].unique():
            cat_products = df_products[df_products['category'] == category]['product_sku'].tolist()
            cat_orders = b2b_day[b2b_day['product_sku'].isin(cat_products)]
            
            if len(cat_orders) > 0:
                revenue = cat_orders['total_value_inr'].sum()
                cogs = revenue * 0.62  # 38% gross margin
                
                revenue_data.append({
                    'date': date,
                    'revenue_source': 'B2B',
                    'product_category': category,
                    'revenue_inr': round(revenue, 2),
                    'cogs_inr': round(cogs, 2),
                    'gross_margin_inr': round(revenue - cogs, 2)
                })
    
    # B2C revenue
    b2c_day = df_b2c_sales[df_b2c_sales['sale_date'] == date]
    if len(b2c_day) > 0:
        for category in df_products['category'].unique():
            cat_products = df_products[df_products['category'] == category]['product_sku'].tolist()
            cat_sales = b2c_day[b2c_day['product_sku'].isin(cat_products)]
            
            if len(cat_sales) > 0:
                revenue = (cat_sales['final_price'] * cat_sales['quantity_units']).sum()
                cogs = revenue * 0.55  # 45% gross margin (higher due to retail pricing)
                
                revenue_data.append({
                    'date': date,
                    'revenue_source': 'B2C',
                    'product_category': category,
                    'revenue_inr': round(revenue, 2),
                    'cogs_inr': round(cogs, 2),
                    'gross_margin_inr': round(revenue - cogs, 2)
                })

df_revenue = pd.DataFrame(revenue_data)
print(f"‚úì Generated {len(df_revenue)} revenue records")

# ============================================================================
# SAVE ALL DATA TO CSV
# ============================================================================
print("\n" + "=" * 70)
print("SAVING DATA TO CSV FILES...")
print("=" * 70)

output_folder = 'hyfun_data/'
import os
os.makedirs(output_folder, exist_ok=True)

datasets = {
    'farmers_master': df_farmers,
    'potato_procurement': df_procurement,
    'production_batches': df_production,
    'quality_control': df_qc,
    'machine_downtime': df_downtime,
    'wastage_tracking': df_wastage,
    'b2b_customers': df_b2b_customers,
    'b2b_orders': df_b2b_orders,
    'export_shipments': df_export_shipments,
    'b2c_sales': df_b2c_sales,
    'product_master': df_products,
    'revenue_summary': df_revenue
}

for name, df in datasets.items():
    filepath = f'{output_folder}{name}.csv'
    df.to_csv(filepath, index=False)
    print(f"‚úì Saved: {filepath} ({len(df)} records)")

# ============================================================================
# SUMMARY STATISTICS
# ============================================================================
print("\n" + "=" * 70)
print("DATA GENERATION COMPLETE - SUMMARY")
print("=" * 70)

print(f"\nüìä Total Records Generated: {sum(len(df) for df in datasets.values()):,}")
print(f"\nüí∞ Total B2B Revenue: ‚Çπ{df_b2b_orders['total_value_inr'].sum():,.2f}")
print(f"üí∞ Total B2C Revenue: ‚Çπ{(df_b2c_sales['final_price'] * df_b2c_sales['quantity_units']).sum():,.2f}")
print(f"üí∞ Combined Revenue: ‚Çπ{df_revenue['revenue_inr'].sum():,.2f}")

print(f"\nüåæ Total Potato Procured: {df_procurement['quantity_mt'].sum():,.2f} MT")
print(f"üè≠ Total Production: {df_production['finished_goods_mt'].sum():,.2f} MT")
print(f"üì¶ Total B2B Orders: {len(df_b2b_orders):,}")
print(f"üõí Total B2C Transactions: {len(df_b2c_sales):,}")

print(f"\nüåç B2B Customers: {len(df_b2b_customers)} across {df_b2b_customers['country'].nunique()} countries")
print(f"üèôÔ∏è B2C Cities: {df_b2c_sales['city'].nunique()} cities")

print("\n" + "=" * 70)
print("‚úÖ ALL DATA FILES READY FOR ANALYSIS!")
print("=" * 70)
print("\nüìÅ Next Steps:")
print("1. Import data into MySQL/PostgreSQL")
print("2. Run SQL analysis queries")
print("3. Perform Python analytics")
print("4. Build Power BI dashboards")
print("\nHappy Analyzing! üöÄ")

HYFUN FOODS - DATA GENERATION STARTED

[1/12] Generating Farmers Master Data...
‚úì Generated 500 farmers across 5 regions

[2/12] Generating Potato Procurement Data...
‚úì Generated 4050 procurement records

[3/12] Generating Production Data...
‚úì Generated 12592 production batches

[4/12] Generating Quality Control Data...
‚úì Generated 3727 quality control records

[5/12] Generating Machine Downtime Data...
‚úì Generated 1701 downtime events

[6/12] Generating Wastage Data...
‚úì Generated 1927 wastage records

[7/12] Generating B2B Customers Data...
‚úì Generated 200 B2B customers

[8/12] Generating B2B Orders Data...
‚úì Generated 3605 B2B orders

[9/12] Generating Export Shipments Data...
‚úì Generated 3338 export shipments

[10/12] Generating B2C Sales Data...
‚úì Generated 136887 B2C transactions

[11/12] Generating Product Master Data...
‚úì Generated 10 product records

[12/12] Generating Revenue Summary Data...
‚úì Generated 5463 revenue records

SAVING DATA TO CSV FILES...