In [12]:
# Installing faker library
!pip install faker



In [13]:
# Imports
import pandas as pd
import numpy as np
from faker import Faker
import random
import string
from IPython.display import display

In [14]:
# Initialising Faker
fake = Faker()

In [15]:
# Constants
n_records = 100000

In [16]:
# Helper functions
def generate_customer_id():
    letters = ''.join(random.choices(string.ascii_uppercase, k=3))
    numbers = ''.join(random.choices(string.digits, k=6))
    return letters + numbers

def generate_customer_category():
    return random.choice(['Corporation', 'Individual', 'Small Business',
                          'Sole Proprietorship', 'LLC'])

def generate_contract_status():
    return random.choice(['Closed', 'Expired', 'Active'])

def generate_payment_status():
    return random.choice(['Delinquent', 'Current', 'Recovered'])

def generate_payment_interval():
    return random.choice(['Monthly', 'Quarterly', 'Annually'])

In [17]:
# Generating data
data = {
    'Customer_ID': [generate_customer_id() for _ in range(n_records)],
    'Contract_Term': [random.randint(12, 60) for _ in range(n_records)],
    # Contract duration between 12 and 60 months
    'Contract_Start_Date': [fake.date_between(start_date='-5y',
                                              end_date='today') for _ in range(
                                                  n_records)],
    # Contracts within the last 5 years
    'Cost_Amount_GBP': [round(random.uniform(5000, 50000),
                              2) for _ in range(n_records)],
    # Random cost between 5,000 and 50,000 GBP
    'Regulatory_Compliance': [random.choice(['Yes',
                                             'No']) for _ in range(n_records)],
    # Regulatory compliance
    'Customer_Category': [generate_customer_category() for _ in range(
        n_records)],
    'Exposure_Amount_GBP': [round(random.uniform(500, 30000), 2) for _ in range(
        n_records)], # Exposure amount between 500 and 30000 GBP
    'Contract_Status': [generate_contract_status() for _ in range(n_records)],
    'Assistance_Flag': [random.choice(['Yes', 'No']) for _ in range(n_records)],
    # Whether financial assistance was provided
    'Risk_Flag': [random.choice(['Yes', 'No']) for _ in range(n_records)],
    # Whether the customer is considered high risk
    'Payment_Status': [generate_payment_status() for _ in range(n_records)],
    'Forbearance_Amount_GBP': [round(random.uniform(0, 5000), 2) for _ in range(
        n_records)],
    'Payment_Interval': [generate_payment_interval() for _ in range(n_records)],
    # Payment interval (e.g., monthly)
    'Late_Payment_Fees_GBP': [round(random.uniform(0, 1000), 2) for _ in range(
        n_records)],
    'Total_Arrears_GBP': [round(random.uniform(0, 20000), 2) for _ in range(
        n_records)]
}

In [18]:
# Creating DataFrame
df = pd.DataFrame(data)

In [19]:
# Calculating Contract End Date based on Contract Start Date and Contract Term
df['Contract_End_Date'] = df.apply(lambda row: row[
    'Contract_Start_Date'] + pd.DateOffset(months=row['Contract_Term']), axis=1)

# Inserting Contract End Date after Contract Start Date
contract_start_index = df.columns.get_loc('Contract_Start_Date')
df.insert(
    contract_start_index + 1, 'Contract_End_Date', df.pop('Contract_End_Date'))

In [20]:
# Displaying df and summary
display(df.head())
display(df.info())

Unnamed: 0,Customer_ID,Contract_Term,Contract_Start_Date,Contract_End_Date,Cost_Amount_GBP,Regulatory_Compliance,Customer_Category,Exposure_Amount_GBP,Contract_Status,Assistance_Flag,Risk_Flag,Payment_Status,Forbearance_Amount_GBP,Payment_Interval,Late_Payment_Fees_GBP,Total_Arrears_GBP
0,IRD028644,52,2020-08-10,2024-12-10,36490.82,No,Corporation,23280.86,Expired,No,No,Current,1479.19,Annually,915.66,1517.72
1,JQW659845,47,2024-05-05,2028-04-05,14841.72,Yes,Individual,13655.73,Closed,Yes,No,Current,3297.25,Annually,885.44,232.85
2,PGA833046,35,2021-01-16,2023-12-16,18721.1,Yes,LLC,9959.53,Active,No,Yes,Current,810.77,Annually,497.17,7139.78
3,TOK138101,41,2024-08-08,2028-01-08,25319.07,No,Individual,28506.77,Closed,No,No,Recovered,2556.06,Annually,941.63,15546.67
4,RFS817524,45,2024-01-28,2027-10-28,10644.33,No,LLC,12381.26,Expired,No,No,Delinquent,3446.34,Monthly,907.7,10218.63


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   Customer_ID             100000 non-null  object        
 1   Contract_Term           100000 non-null  int64         
 2   Contract_Start_Date     100000 non-null  object        
 3   Contract_End_Date       100000 non-null  datetime64[ns]
 4   Cost_Amount_GBP         100000 non-null  float64       
 5   Regulatory_Compliance   100000 non-null  object        
 6   Customer_Category       100000 non-null  object        
 7   Exposure_Amount_GBP     100000 non-null  float64       
 8   Contract_Status         100000 non-null  object        
 9   Assistance_Flag         100000 non-null  object        
 10  Risk_Flag               100000 non-null  object        
 11  Payment_Status          100000 non-null  object        
 12  Forbearance_Amount_GBP  100000 

None

In [21]:
# Saving df to pkl
df.to_pickle('arrears_data.pkl')