In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
import re

# Set a seed for reproducibility
np.random.seed(42)
random.seed(42)

# Initialize Faker with Kenyan locale
fake = Faker('en_KE')

# --- 1. Define Kenyan Contextual Data ---

KENYAN_COUNTIES = {
    'Nairobi': 0.35,  # Higher income/taxpayers
    'Kiambu': 0.15,
    'Mombasa': 0.10,
    'Nakuru': 0.10,
    'Kisumu': 0.05,
    'Machakos': 0.05,
    'Rural_Uplands': 0.10, # Lower income/taxpayers
    'Rural_SemiArid': 0.10
}

INCOME_SOURCES = [
    'Employment (PAYE)', 
    'Business (Sole Proprietor)', 
    'Rental', 
    'Dividends/Interest', 
    'Farming', 
    'Informal_Sector'
]

BUSINESS_SECTORS = [
    'Retail/Wholesale', 
    'Service (Consultancy)', 
    'Tech/IT', 
    'Manufacturing', 
    'Finance', 
    'Agriculture/Farming', 
    'Unknown'
]

N_SAMPLES = 5000

# --- 2. Data Generation Functions ---

def generate_kra_pin():
    """Generates a synthetic KRA PIN format A123456789X"""
    return f"A{random.randint(100000000, 999999999)}X"

def generate_income():
    """Generates skewed income based on a log-normal distribution for realism (KES)"""
    # Majority of Kenyans earn below 1M KES/year. Use log-normal distribution.
    return np.exp(np.random.normal(loc=12.5, scale=1.5, size=N_SAMPLES))

def calculate_synthetic_tax(income, compliance_score):
    """Calculates a tax value that reflects the income and compliance score (simulated)"""
    # Base rate logic based on recent KRA bands (simplified for simulation)
    # A base tax rate that increases with income.
    base_tax = np.where(income < 500000, income * 0.15, 
                        np.where(income < 2000000, income * 0.25, income * 0.30))
    
    # Non-compliant taxpayers pay less tax than expected
    # Score 5 (High Compliance) means they pay close to the base rate.
    # Score 1 (Low Compliance) means they pay significantly less.
    tax_factor = compliance_score / 5.0 
    
    # Introduce small noise
    tax = base_tax * tax_factor * np.random.uniform(0.9, 1.1)
    
    # Ensure tax paid doesn't exceed 35% of income and is not negative
    return np.clip(tax, 0, income * 0.35)


# --- 3. Generate DataFrame Columns ---

data = pd.DataFrame({
    # Identifiers
    'Taxpayer_ID': [generate_kra_pin() for _ in range(N_SAMPLES)],
    
    # Demographic Features
    'Age': np.random.randint(18, 85, N_SAMPLES),
    'Gender': np.random.choice(['Male', 'Female', 'Other'], size=N_SAMPLES, p=[0.52, 0.47, 0.01]),
    'Region': np.random.choice(list(KENYAN_COUNTIES.keys()), size=N_SAMPLES, p=list(KENYAN_COUNTIES.values())),
    'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced/Widowed'], size=N_SAMPLES, p=[0.40, 0.55, 0.05]),
    'Number_of_Dependants': np.random.randint(0, 8, N_SAMPLES),
    
    # Financial and Behavioral Features
    'Annual_Gross_Income (KES)': generate_income(),
    'Source_of_Income_Raw': np.random.choice(INCOME_SOURCES, size=N_SAMPLES, p=[0.40, 0.25, 0.10, 0.05, 0.05, 0.15]),
    'Business_Sector': np.random.choice(BUSINESS_SECTORS, size=N_SAMPLES, p=[0.30, 0.20, 0.10, 0.10, 0.05, 0.05, 0.20]),
    'Tax_Compliance_Score': np.random.randint(1, 6, N_SAMPLES), # 1 (Low) to 5 (High)
    'Compliance_Window_Deviation': np.random.lognormal(mean=1.0, sigma=1.0, size=N_SAMPLES).round().astype(int) # Skewed lateness
})

# --- 4. Feature Engineering and Data Corruption (Preprocessing Practice) ---

# 4.1 Create Core Financial Feature: Total Tax Paid
data['Total_Tax_Paid (KES)'] = calculate_synthetic_tax(data['Annual_Gross_Income (KES)'].values, data['Tax_Compliance_Score'].values)

# 4.2 Introduce Synthetic Outliers for Annual_Gross_Income
# 5 very high earners (High Net Worth Individuals)
data.loc[random.sample(range(N_SAMPLES), 5), 'Annual_Gross_Income (KES)'] = np.random.uniform(50_000_000, 500_000_000, 5) 

# 4.3 Introduce Missing Values (NaN)
data.loc[data.sample(frac=0.08).index, 'Age'] = np.nan            # 8% missing in Age (numerical imputation)
data.loc[data.sample(frac=0.05).index, 'Gender'] = np.nan         # 5% missing in Gender (categorical imputation)
data.loc[data.sample(frac=0.10).index, 'Marital_Status'] = np.nan # 10% missing in Marital_Status

# 4.4 Introduce 'Dirty' Data (Custom cleaning challenge)
# Age Outliers/Errors
data.loc[data.sample(frac=0.01).index, 'Age'] = np.random.choice([120, 150, -5], size=int(N_SAMPLES * 0.01)) 

# 4.5 Feature Engineering: Effective Tax Rate (Requires handling division by zero)
# The calculation should happen AFTER the main financial columns are set
data['Effective_Tax_Rate'] = (data['Total_Tax_Paid (KES)'] / data['Annual_Gross_Income (KES)']) * 100

# Cap compliance deviation outliers
data['Compliance_Window_Deviation'] = np.clip(data['Compliance_Window_Deviation'], 0, 365)


# --- 5. Final Structure and Save ---

# Select and re-order columns
dataset = data[[
    'Taxpayer_ID', 'Age', 'Gender', 'Region', 'Marital_Status', 'Number_of_Dependants',
    'Annual_Gross_Income (KES)', 'Total_Tax_Paid (KES)', 'Source_of_Income_Raw', 'Business_Sector', 
    'Tax_Compliance_Score', 'Compliance_Window_Deviation', 'Effective_Tax_Rate'
]]

# Reset index and display first few rows
print(f"ðŸŽ‰ Generated a synthetic Kenyan Taxpayer dataset with {N_SAMPLES} records.")
print("\n--- First 5 Rows ---")
print(dataset.head())

# Save the dataset to a CSV file
dataset.to_csv('kenyan_taxpayer_segmentation_data.csv', index=False)
print("\nDataset saved as 'kenyan_taxpayer_segmentation_data.csv'")

ðŸŽ‰ Generated a synthetic Kenyan Taxpayer dataset with 5000 records.

--- First 5 Rows ---
   Taxpayer_ID   Age  Gender          Region Marital_Status  \
0  A786579303X  69.0    Male          Kisumu         Single   
1  A219540831X  32.0    Male          Kisumu         Single   
2  A126855092X  78.0    Male         Nairobi        Married   
3  A896233790X  38.0    Male  Rural_SemiArid        Married   
4  A395310485X  41.0  Female   Rural_Uplands        Married   

   Number_of_Dependants  Annual_Gross_Income (KES)  Total_Tax_Paid (KES)  \
0                     5               3.379900e+05          10654.836532   
1                     7               1.226257e+05           7731.336589   
2                     2               1.671357e+06         351253.801195   
3                     5               9.729017e+04           6133.974197   
4                     6               1.582908e+05           9979.958373   

         Source_of_Income_Raw        Business_Sector  Tax_Compliance_Sco