# Automotive Warranty Analytics - Synthetic Data Generation Script
# Generates realistic datasets (vehicles, service records, warranty claims, sensor data)
# Includes intentional noise (missing values, typos, outliers) for data cleaning practice


# In [3]:  (notebook cell prompt, kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
import os
from random import choice, randint, uniform
from datetime import datetime, timedelta

# --- 1. Folder setup ---
# Directory that receives every raw CSV written by this script.
# NOTE(review): this is an absolute, machine-specific location; adjust it
# before running the script anywhere else.
save_path = r"C:/Users/ronv1/Desktop/Automative_Warranty_Analytics/data/raw"
# Create the directory (and any missing parents); exist_ok=True makes a
# re-run harmless when the directory is already there.
os.makedirs(name=save_path, exist_ok=True)

# --- 2. Vehicles Table ---
# Builds the `vehicles` table: one row per vehicle with a unique ID, a model
# name, a year, a mileage value and a region label.
# Noise injected on purpose for data-cleaning practice:
#   * mileage is NaN for roughly 1 row in 21 (randint(0, 20) == 0),
#   * region is drawn from the four clean labels plus two inconsistent
#     spellings ("south zone", "SOUTH"),
#   * exactly 2% of rows get a corrupted model name (see the loop below).
num_vehicles = 10000
vehicle_ids = [f"V{10000+i}" for i in range(num_vehicles)]
models = ["EcoDrive", "SpeedX", "TurboPro", "AutoMax", "UrbanGo"]
regions = ["North", "South", "East", "West"]

vehicles = pd.DataFrame({
    "vehicle_id": vehicle_ids,
    "model": [choice(models) for _ in range(num_vehicles)],
    "year": [randint(2015, 2023) for _ in range(num_vehicles)],
    "mileage": [randint(5000, 120000) if randint(0,20)!=0 else np.nan for _ in range(num_vehicles)],
    "region": [choice(regions + ["south zone", "SOUTH"]) for _ in range(num_vehicles)]
})

# Introduce minor typos in model names (2% of rows, sampled without replacement).
# A typo drops the last character and appends either "X" or nothing.
# The suffix is restricted to values that differ from the dropped character;
# otherwise a name already ending in "X" (e.g. "SpeedX") could be rebuilt
# unchanged and the row would silently stay clean.
for i in np.random.choice(vehicles.index, size=int(0.02*num_vehicles), replace=False):
    original_model = vehicles.at[i, "model"]
    typo_suffixes = [suffix for suffix in ("X", "") if suffix != original_model[-1]]
    vehicles.at[i, "model"] = original_model[:-1] + choice(typo_suffixes)

# Write the vehicles table to <save_path>/vehicles_raw.csv; index=False keeps
# the DataFrame's row index out of the file.
vehicles.to_csv(path_or_buf=os.path.join(save_path, "vehicles_raw.csv"), index=False)

# --- 3. Service Records Table ---
# Builds the `service_records` table (one row per service entry) and writes it
# to <save_path>/service_records_raw.csv.
# Noise injected on purpose:
#   * component_replaced is NaN for roughly 1 row in 51,
#   * cost is a negative number for roughly 1 row in 51.
num_services = 60000
service_ids = ["S{}".format(20000 + n) for n in range(num_services)]
components = ["Engine", "Brake Pad", "Battery", "Suspension", "Transmission", "AC"]
issue_types = ["Wear & Tear", "Overheating", "Discharge", "Leakage", "Malfunction"]

# Columns are drawn one after another, in the same order they appear in the
# table, so the random draws are consumed column by column.
service_vehicle_col = [choice(vehicle_ids) for _ in service_ids]

# Service dates fall 0..1000 days after 2022-01-01.
service_date_col = [
    datetime(2022, 1, 1) + timedelta(days=randint(0, 1000))
    for _ in service_ids
]

# Missing-value noise: no component recorded when the gate draw is 0.
service_component_col = [
    np.nan if randint(0, 50) == 0 else choice(components)
    for _ in service_ids
]

# Invalid-value noise: a negative cost when the gate draw is 0.
service_cost_col = [
    -randint(1, 5000) if randint(0, 50) == 0 else randint(1000, 15000)
    for _ in service_ids
]

service_issue_col = [choice(issue_types) for _ in service_ids]

service_records = pd.DataFrame({
    "service_id": service_ids,
    "vehicle_id": service_vehicle_col,
    "service_date": service_date_col,
    "component_replaced": service_component_col,
    "cost": service_cost_col,
    "issue_type": service_issue_col,
})

# index=False keeps the DataFrame's row index out of the CSV.
service_records.to_csv(
    os.path.join(save_path, "service_records_raw.csv"),
    index=False,
)

# --- 4. Warranty Claims Table ---
# Builds the `warranty_claims` table (one row per claim) and writes it to
# <save_path>/warranty_claims_raw.csv.
# The status list contains "Apprvd" next to "Approved" -- presumably a
# deliberate misspelling to be normalised during cleaning.
num_claims = 20000
claim_ids = [f"W{claim_no}" for claim_no in range(30000, 30000 + num_claims)]
statuses = ["Approved", "Rejected", "Apprvd"]

# The dict preserves column order; each list is generated in turn, so the
# random draws are consumed column by column.
claim_columns = {
    "claim_id": claim_ids,
    "vehicle_id": [choice(vehicle_ids) for _ in claim_ids],
    # Claim dates fall 0..1000 days after 2022-01-01.
    "claim_date": [
        datetime(2022, 1, 1) + timedelta(days=randint(0, 1000))
        for _ in claim_ids
    ],
    "claim_amount": [randint(1000, 15000) for _ in claim_ids],
    "claim_status": [choice(statuses) for _ in claim_ids],
    # Reuses the component list defined for the service-records table.
    "component": [choice(components) for _ in claim_ids],
}
warranty_claims = pd.DataFrame(claim_columns)

# index=False keeps the DataFrame's row index out of the CSV.
warranty_claims.to_csv(
    os.path.join(save_path, "warranty_claims_raw.csv"),
    index=False,
)

# --- 5. Sensor Data Table ---
# Builds the `sensor_data` table (one row per reading), writes it to
# <save_path>/sensor_data_raw.csv and prints a completion message.
# Noise injected on purpose (each with a gate draw of randint(0, 50) == 0,
# i.e. roughly 1 row in 51):
#   * engine_temp jumps to 200..300 instead of 70..120,
#   * rpm holds the string "error" instead of a number,
#   * fuel_efficiency drops to 2..5 instead of 10..20.
# error_code uses the literal string "None" (not Python's None) as one option.
num_sensor_rows = 100000
sensor_vehicle_ids = [choice(vehicle_ids) for _ in range(num_sensor_rows)]


def _random_reading_time():
    """Return a timestamp at a random day/hour/minute offset from 2023-01-01."""
    day_offset = randint(0, 300)
    hour_offset = randint(0, 23)
    minute_offset = randint(0, 59)
    return datetime(2023, 1, 1) + timedelta(
        days=day_offset, hours=hour_offset, minutes=minute_offset
    )


sensor_error_codes = ["E201", "E404", "E303", "None"]

# Columns are drawn one after another, in table order.
sensor_timestamp_col = [_random_reading_time() for _ in sensor_vehicle_ids]
sensor_temp_col = [
    randint(200, 300) if randint(0, 50) == 0 else randint(70, 120)
    for _ in sensor_vehicle_ids
]
sensor_rpm_col = [
    "error" if randint(0, 50) == 0 else randint(1500, 3000)
    for _ in sensor_vehicle_ids
]
sensor_fuel_col = [
    round(uniform(2, 5), 1) if randint(0, 50) == 0 else round(uniform(10, 20), 1)
    for _ in sensor_vehicle_ids
]
sensor_code_col = [choice(sensor_error_codes) for _ in sensor_vehicle_ids]

sensor_data = pd.DataFrame({
    "vehicle_id": sensor_vehicle_ids,
    "timestamp": sensor_timestamp_col,
    "engine_temp": sensor_temp_col,
    "rpm": sensor_rpm_col,
    "fuel_efficiency": sensor_fuel_col,
    "error_code": sensor_code_col,
})

# index=False keeps the DataFrame's row index out of the CSV.
sensor_data.to_csv(
    os.path.join(save_path, "sensor_data_raw.csv"),
    index=False,
)

print("Data generation complete! CSVs saved in data/raw/")


# Output: Data generation complete! CSVs saved in data/raw/
