In [1]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker to generate fake data
# NOTE(review): `fake` is not referenced anywhere else in the visible code;
# confirm whether other cells rely on it before removing it.
fake = Faker()

# --- Configuration ---
NUM_RECORDS = 5000
# Baseline complaint probability for a job with no risk factors; also the
# overall complaint rate the final dataset is steered towards (8%).
COMPLAINT_RATE = 0.08
SLA_DAYS = 28  # Service Level Agreement threshold, in days
# Accepted absolute deviation of the final complaint rate from COMPLAINT_RATE.
RATE_TOLERANCE = 0.01


def simulate_job(base_rate, sla_days):
    """Draw one synthetic job record together with its complaint outcome.

    Args:
        base_rate: Complaint probability for a job with no risk factors.
        sla_days: Resolution times strictly greater than this many days are
            flagged as an SLA breach.

    Returns:
        A dict with the keys ``time_to_resolution``, ``chaser_count``,
        ``property_complaint_history``, ``sla_breach_flag`` and
        ``is_complaint``; the last two are 0/1 integers.
    """
    # Base features. np.random.randint excludes the upper bound, so the
    # value ranges are 1-89, 0-5 and 0-3 respectively.
    time_to_resolution = np.random.randint(1, 90)
    chaser_count = np.random.randint(0, 6)
    property_complaint_history = np.random.randint(0, 4)

    # Engineered feature derived from the resolution time.
    sla_breach_flag = int(time_to_resolution > sla_days)

    # Risk score: each risk factor adds to the complaint probability.
    risk_score = 0
    if chaser_count >= 2:
        risk_score += 0.5  # Heavy penalty for 2 or more chaser calls
    if time_to_resolution > 45:
        risk_score += 0.3  # Penalty for very long resolution times
    if property_complaint_history > 1:
        risk_score += 0.2  # Penalty for properties with a history of issues

    # A complaint occurs with probability base_rate + risk_score. The sum can
    # exceed 1, in which case the job always results in a complaint.
    is_complaint = int(np.random.rand() < base_rate + risk_score)

    return {
        'time_to_resolution': time_to_resolution,
        'chaser_count': chaser_count,
        'property_complaint_history': property_complaint_history,
        'sla_breach_flag': sla_breach_flag,
        'is_complaint': is_complaint,
    }


def flips_needed(n_positive, n_total, target_rate, tolerance):
    """Return the signed number of label flips that brings the rate in range.

    Labels are flipped one at a time towards ``target_rate`` until
    ``abs(rate - target_rate) <= tolerance``. If no achievable rate lies
    inside the tolerance band (too few rows), the search stops at the
    closest achievable rate instead of looping forever.

    Args:
        n_positive: Current number of rows labelled 1.
        n_total: Total number of rows.
        target_rate: Desired share of rows labelled 1.
        tolerance: Accepted absolute deviation from ``target_rate``.

    Returns:
        Negative to flip that many 1s to 0, positive to flip that many 0s
        to 1, and 0 when nothing has to change (including ``n_total == 0``).
    """
    if n_total == 0:
        return 0

    count = n_positive
    gap = abs(count / n_total - target_rate)
    while gap > tolerance:
        candidate = count - 1 if count / n_total > target_rate else count + 1
        if not 0 <= candidate <= n_total:
            break
        candidate_gap = abs(candidate / n_total - target_rate)
        if candidate_gap >= gap:
            # One more flip would not get closer: the tolerance band is
            # unreachable for this number of rows.
            break
        count, gap = candidate, candidate_gap
    return count - n_positive


def rebalance_complaints(labels, target_rate, tolerance):
    """Return a copy of ``labels`` with its share of 1s moved to the target.

    The rows to flip are drawn uniformly at random (without replacement)
    from the rows currently holding the over-represented value.

    Args:
        labels: 1-D array-like of 0/1 integers; it is not modified.
        target_rate: Desired share of 1s.
        tolerance: Accepted absolute deviation from ``target_rate``.

    Returns:
        A new NumPy array with the same length as ``labels``.
    """
    balanced = np.array(labels, copy=True)
    delta = flips_needed(int(balanced.sum()), balanced.size,
                         target_rate, tolerance)
    if delta == 0:
        return balanced

    source_value = 1 if delta < 0 else 0
    candidates = np.flatnonzero(balanced == source_value)
    chosen = np.random.choice(candidates, size=abs(delta), replace=False)
    balanced[chosen] = 1 - source_value
    return balanced


# --- Data Generation ---
data = [simulate_job(COMPLAINT_RATE, SLA_DAYS) for _ in range(NUM_RECORDS)]

# --- Create DataFrame and Save ---
df = pd.DataFrame(data)

# The risk penalties are added on top of COMPLAINT_RATE, so the raw share of
# complaints generally does not match the target. Flip just enough randomly
# chosen labels to bring the rate within RATE_TOLERANCE of COMPLAINT_RATE.
df['is_complaint'] = rebalance_complaints(
    df['is_complaint'].to_numpy(), COMPLAINT_RATE, RATE_TOLERANCE
)

# Persist the synthetic dataset as CSV (without the index column)
output_path = 'complaints_data.csv'
df.to_csv(output_path, index=False)

# Report how many records were written and the resulting complaint rate
record_count = len(df)
final_rate = df['is_complaint'].mean()
print(f"Successfully generated 'complaints_data.csv' with {record_count} records.")
print(f"Final complaint rate: {final_rate:.2%}")

Successfully generated 'complaints_data.csv' with 5000 records.
Final complaint rate: 9.00%
