In [56]:
# Installing faker library
!pip install faker



In [57]:
# Imports
import pandas as pd
import numpy as np
from faker import Faker
import random
import string

In [58]:
# Initialising Faker
fake = Faker()

# Seeding both random sources used by this notebook -- Faker's shared generator
# and the stdlib `random` module -- so a fresh "Restart & Run All" regenerates
# the same values.
# NOTE: dates drawn relative to 'today' still depend on the day the notebook runs.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

In [59]:
# Constants
# Number of synthetic rows generated for every column of the dataset.
n_records = 100_000

In [60]:
# Helper functions
def generate_customer_id(n_letters=3, n_digits=6):
    """Return a random customer identifier: uppercase letters followed by digits.

    Args:
        n_letters: Number of leading ASCII uppercase letters (default 3).
        n_digits: Number of trailing decimal digits (default 6).

    Returns:
        A string of length ``n_letters + n_digits``, e.g. ``'FGM266208'`` with
        the defaults.

    Note:
        Characters are drawn independently with replacement, so identifiers are
        not guaranteed to be unique across calls.
    """
    # Letters are drawn before digits; keeping this order keeps the values
    # produced under a given `random.seed` unchanged.
    letters = ''.join(random.choices(string.ascii_uppercase, k=n_letters))
    numbers = ''.join(random.choices(string.digits, k=n_digits))
    return letters + numbers

def generate_customer_category():
    """Pick one customer category uniformly at random."""
    categories = (
        'Corporation',
        'Individual',
        'Small Business',
        'Sole Proprietorship',
        'LLC',
    )
    return random.choice(categories)

def generate_contract_status():
    """Pick one contract status uniformly at random."""
    statuses = ('Closed', 'Expired', 'Active')
    picked = random.choice(statuses)
    return picked

def generate_payment_status():
    """Pick one payment status uniformly at random."""
    statuses = ('Delinquent', 'Current', 'Recovered')
    picked = random.choice(statuses)
    return picked

def generate_payment_interval():
    """Pick one payment interval uniformly at random."""
    intervals = ('Monthly', 'Quarterly', 'Annually')
    picked = random.choice(intervals)
    return picked

In [61]:
# Generating data
def _make_column(draw):
    """Call the zero-argument sampler `draw` once per record and collect the results."""
    return [draw() for _ in range(n_records)]


# Each column is sampled in full before the next one starts, in the order listed.
data = {
    'Customer_ID': _make_column(generate_customer_id),
    # Contract duration in months, 12 to 60 (randint includes both bounds)
    'Contract_Term': _make_column(lambda: random.randint(12, 60)),
    # Contract start dates within the last 5 years
    'Contract_Start_Date': _make_column(lambda: fake.date_between(start_date='-5y', end_date='today')),
    # Cost between 5,000 and 50,000 GBP, rounded to pence
    'Cost_Amount_GBP': _make_column(lambda: round(random.uniform(5000, 50000), 2)),
    # Regulatory compliance flag
    'Regulatory_Compliance': _make_column(lambda: random.choice(['Yes', 'No'])),
    'Customer_Category': _make_column(generate_customer_category),
    # Exposure amount between 500 and 30,000 GBP
    'Exposure_Amount_GBP': _make_column(lambda: round(random.uniform(500, 30000), 2)),
    'Contract_Status': _make_column(generate_contract_status),
    # Whether financial assistance was provided
    'Assistance_Flag': _make_column(lambda: random.choice(['Yes', 'No'])),
    # Whether the customer is considered high risk
    'Risk_Flag': _make_column(lambda: random.choice(['Yes', 'No'])),
    'Payment_Status': _make_column(generate_payment_status),
    # Forbearance amount between 0 and 5,000 GBP
    'Forbearance_Amount_GBP': _make_column(lambda: round(random.uniform(0, 5000), 2)),
    # Payment interval (e.g., monthly)
    'Payment_Interval': _make_column(generate_payment_interval),
    # Late payment fees between 0 and 1,000 GBP
    'Late_Payment_Fees_GBP': _make_column(lambda: round(random.uniform(0, 1000), 2)),
    # Total arrears between 0 and 20,000 GBP
    'Total_Arrears_GBP': _make_column(lambda: round(random.uniform(0, 20000), 2)),
}

In [62]:
# Creating DataFrame
# Every key of `data` becomes a column; the lists supply the row values.
df = pd.DataFrame(data=data)

In [63]:
# Calculating Contract End Date based on Contract Start Date and Contract Term.
# The two columns are walked in parallel with zip instead of df.apply(axis=1),
# which would build a full row Series for every record just to read two fields.
# DateOffset(months=...) clamps to the last valid day of the target month
# (e.g. 31 Jan + 1 month -> end of Feb); the results are pandas Timestamps.
df['Contract_End_Date'] = [
    start_date + pd.DateOffset(months=int(term_months))
    for start_date, term_months in zip(df['Contract_Start_Date'], df['Contract_Term'])
]

# Inserting Contract End Date after Contract Start Date.
# pop() removes the column before insert() adds it back at the new position,
# which also keeps this cell safe to re-run.
contract_start_index = df.columns.get_loc('Contract_Start_Date')
df.insert(contract_start_index + 1, 'Contract_End_Date', df.pop('Contract_End_Date'))

In [64]:
# Displaying summary and df
# df.info() prints its report itself, whereas df.head() is only rendered when
# it is the last expression of the cell -- so the preview goes last.
df.info()
df.head()

  Customer_ID  Contract_Term Contract_Start_Date Contract_End_Date  \
0   FGM266208             24          2021-10-24        2023-10-24   
1   SMG503835             16          2022-04-08        2023-08-08   
2   LLO450747             12          2021-11-06        2022-11-06   
3   ZZQ704582             16          2020-11-11        2022-03-11   
4   ZIS532090             30          2023-04-17        2025-10-17   

   Cost_Amount_GBP Regulatory_Compliance    Customer_Category  \
0          5996.88                   Yes                  LLC   
1         33732.95                    No  Sole Proprietorship   
2         28786.90                    No  Sole Proprietorship   
3         46503.75                    No  Sole Proprietorship   
4         32622.67                   Yes          Corporation   

   Exposure_Amount_GBP Contract_Status Assistance_Flag Risk_Flag  \
0              2804.97          Active             Yes        No   
1             13383.33          Closed             Y

In [65]:
# Saving df to pkl
# The path is relative, so the file is written to the current working directory.
output_path = 'arrears_data.pkl'
df.to_pickle(output_path)