In [1]:
import random
import csv

# Damage categories; generate_dataset draws one uniformly at random per row.
# NOTE(review): damage types and parts are sampled independently, so pairs
# such as "tire flat" on "hood" can occur (there is no tire entry in `parts`)
# -- confirm this is intended.
damage_types = ["dent", "scratch", "crack", "glass shatter", "lamp broken", "tire flat"]

# Car parts that can appear in a row. generate_cost looks each one up in
# part_base_cost, so every name here needs an entry there.
parts = [
    "back_bumper", "back_door", "back_glass", "back_light",
    "front_bumper", "front_door", "front_glass", "front_light",
    "hood"
]

# Severity labels; the order must line up with the sampling weights passed to
# random.choices in generate_dataset.
severity_levels = ["minor", "moderate", "severe"]

# Base part costs (Indian market realistic 2024-2025)
# Presumably expressed in INR -- TODO confirm the currency/unit.
part_base_cost = {
    "back_bumper": 4500,
    "back_door": 6500,
    "back_glass": 9000,
    "back_light": 2500,
    "front_bumper": 5000,
    "front_door": 7000,
    "front_glass": 12000,
    "front_light": 3000,
    "hood": 8500
}

# Severity multipliers applied to the base cost ("moderate" is the neutral 1.0)
severity_multiplier = {
    "minor": 0.6,
    "moderate": 1.0,
    "severe": 1.6
}

# Damage type multipliers: one factor per entry in damage_types
damage_type_multiplier = {
    "scratch": 0.4,
    "dent": 0.9,
    "crack": 1.1,
    "lamp broken": 1.6,
    "glass shatter": 2.3,
    "tire flat": 0.7
}

def generate_cost(part, severity, damage_type, coverage_percent):
    """Return a randomized repair-cost estimate as an integer.

    The estimate is the part's base cost scaled by the severity multiplier,
    the damage-type multiplier and a coverage factor, then perturbed by a
    random labour-noise factor.

    Args:
        part: Key into ``part_base_cost``.
        severity: Key into ``severity_multiplier``.
        damage_type: Key into ``damage_type_multiplier``.
        coverage_percent: Damaged-area percentage; every 1% adds 0.5% to
            the cost (factor ``1 + coverage_percent / 100 * 0.5``).

    Returns:
        The noisy cost, truncated to ``int``.

    Raises:
        KeyError: If ``part``, ``severity`` or ``damage_type`` is not a key
            of its lookup table.
    """
    # Look the three factors up left to right, then fold in the coverage
    # factor; the multiplication order is kept fixed so float rounding is
    # stable.
    table_product = (
        part_base_cost[part]
        * severity_multiplier[severity]
        * damage_type_multiplier[damage_type]
    )
    deterministic_cost = table_product * (1 + (coverage_percent / 100 * 0.5))

    # India labour randomization noise: a single uniform draw in [0.9, 1.15].
    labour_noise = random.uniform(0.9, 1.15)

    return int(deterministic_cost * labour_noise)

def generate_dataset(n_rows=5000, output_file="stage5_cost_dataset.csv"):
    """Write a synthetic repair-cost dataset to a CSV file.

    Each row holds a randomly chosen part, severity, damage type and
    coverage percentage, plus the cost computed by ``generate_cost``.
    A header row is written first, and a confirmation line is printed once
    the file has been closed.

    Args:
        n_rows: Number of data rows to generate (header not included).
        output_file: Path of the CSV file to create or overwrite.
    """
    header = ["part", "severity", "damage_type", "coverage_percent", "cost"]

    # Severity probability based on real-world distribution
    # (most indian claims are minor); order matches severity_levels.
    severity_weights = [0.55, 0.35, 0.10]

    def _random_rows():
        # Lazily yield one record per iteration. The random draws happen in a
        # fixed order: part, damage type, severity, coverage, then cost noise.
        for _ in range(n_rows):
            chosen_part = random.choice(parts)
            chosen_damage = random.choice(damage_types)
            (chosen_severity,) = random.choices(
                severity_levels, weights=severity_weights, k=1
            )
            covered = round(random.uniform(1, 25), 2)
            estimate = generate_cost(
                chosen_part, chosen_severity, chosen_damage, covered
            )
            yield [chosen_part, chosen_severity, chosen_damage, covered, estimate]

    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(_random_rows())

    print(f"Dataset generated: {output_file}")

# Run generator: writes a header plus 6000 data rows to the default output
# file ("stage5_cost_dataset.csv") and prints a confirmation message.
generate_dataset(6000)


Dataset generated: stage5_cost_dataset.csv
