In [51]:
# Installing faker library
!pip install faker



In [52]:
# Imports
import pandas as pd
import numpy as np
from faker import Faker
import random
import string

In [53]:
# Initialising Faker
fake = Faker()

# Seed every random source used in this notebook (stdlib `random`, NumPy's global
# generator and Faker's shared generator) so that a fresh "Restart & Run All"
# regenerates the same synthetic data.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
Faker.seed(RANDOM_SEED)

In [54]:
# Constants
# Number of rows generated for the first synthetic dataset.
n1_records = 700_000

In [55]:
# Helper functions
def generate_customer_id():
    """Return a random 9-character customer ID: 3 uppercase letters, then 6 digits."""
    letter_part = random.choices(string.ascii_uppercase, k=3)
    digit_part = random.choices(string.digits, k=6)
    return ''.join(letter_part + digit_part)

def generate_customer_category():
    """Return one customer category, chosen uniformly at random."""
    categories = ['Corporation', 'Individual', 'Small Business', 'Sole Proprietorship', 'LLC']
    return random.choice(categories)

def generate_contract_status():
    """Return one contract status, chosen uniformly at random."""
    statuses = ['Closed', 'Expired', 'Active']
    return random.choice(statuses)

def generate_payment_status():
    """Return one payment status, chosen uniformly at random."""
    statuses = ['Delinquent', 'Current', 'Recovered']
    return random.choice(statuses)

def generate_payment_interval():
    """Return one payment interval, chosen uniformly at random."""
    intervals = ['Monthly', 'Quarterly', 'Annually']
    return random.choice(intervals)

In [56]:
# Generating 1st data
# Builds the first synthetic dataset as a dict mapping column name -> list of
# n1_records values. Every column is drawn independently of the others.
# NOTE(review): Contract_End_Date is sampled independently of Contract_Start_Date
# and Contract_Term, so an end date can fall before its start date and the term
# need not match the date span -- confirm this is acceptable downstream.
data_1 = {
    'Customer_ID': [generate_customer_id() for _ in range(n1_records)],
    'Contract_Term': [random.randint(12, 96) for _ in range(n1_records)],
    'Contract_Start_Date': [fake.date_between(start_date='-16y', end_date='-1y') for _ in range(n1_records)],
    'Contract_End_Date': [fake.date_between(start_date='-3y', end_date='-1y') for _ in range(n1_records)],
    'Cost_Amount_GBP': [round(random.uniform(1000, 900900), 2) for _ in range(n1_records)],
    'Regulatory_Compliance': [random.choice(['Yes', 'No']) for _ in range(n1_records)],
    'Customer_Category': [generate_customer_category() for _ in range(n1_records)],
    'Exposure_Amount_GBP': [round(random.uniform(1000, 69000), 2) for _ in range(n1_records)],
    'Contract_Status': [generate_contract_status() for _ in range(n1_records)],
    'Assistance_Flag': [random.choice(['Yes', 'No']) for _ in range(n1_records)],
    'Risk_Flag': [random.choice(['Yes', 'No']) for _ in range(n1_records)],
    'Payment_Status': [generate_payment_status() for _ in range(n1_records)],
    'Forbearance_Amount_GBP': [round(random.uniform(0, 2000), 2) for _ in range(n1_records)],
    'Payment_Interval': [generate_payment_interval() for _ in range(n1_records)],
    'Late_Payment_Fees_GBP': [round(random.uniform(0, 990), 2) for _ in range(n1_records)],
    'Total_Arrears_GBP': [round(random.uniform(0, 19700), 2) for _ in range(n1_records)]
}

In [57]:
# Creating DataFrame 1: each key of data_1 becomes a column, each list its values
df_1 = pd.DataFrame(data_1)

In [58]:
# Displaying sample df_1 and summary
print(df_1.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line afterwards.
df_1.info()

  Customer_ID  Contract_Term Contract_Start_Date Contract_End_Date  \
0   EJR872837             54          2015-07-21        2021-12-20   
1   IMX979868             91          2020-06-06        2021-09-05   
2   NQZ020781             90          2016-06-02        2023-05-23   
3   ZJE848049             43          2008-08-20        2023-01-12   
4   HYT626213             37          2017-06-28        2022-01-18   

   Cost_Amount_GBP Regulatory_Compliance Customer_Category  \
0        422562.25                    No       Corporation   
1        215224.06                    No       Corporation   
2        294362.39                    No       Corporation   
3         91318.00                   Yes        Individual   
4        809735.65                    No        Individual   

   Exposure_Amount_GBP Contract_Status Assistance_Flag Risk_Flag  \
0             17152.49          Closed             Yes        No   
1             25252.94         Expired             Yes       Yes   
2 

In [59]:
# Saving df_1 to pkl (relative path, so the file is written to the current working directory)
df_1.to_pickle('historical_arrears_data_1.pkl')

In [60]:
# Constants
# Number of rows generated for the second synthetic dataset.
n2_records = 700_000

In [61]:
# Helper functions
def generate_customer_id():
    """Return a random customer ID made of 3 uppercase letters followed by 6 digits.

    Note: this redefines the identical helper from the earlier helper cell.
    """
    alpha = random.choices(string.ascii_uppercase, k=3)
    numeric = random.choices(string.digits, k=6)
    return ''.join([*alpha, *numeric])

def generate_customer_category():
    """Pick a customer category uniformly at random.

    Note: this redefines the identical helper from the earlier helper cell.
    """
    choices = ('Corporation', 'Individual', 'Small Business', 'Sole Proprietorship', 'LLC')
    return random.choice(choices)

def generate_contract_status():
    """Pick a contract status uniformly at random.

    Note: this redefines the identical helper from the earlier helper cell.
    """
    choices = ('Closed', 'Expired', 'Active')
    return random.choice(choices)

def generate_payment_status():
    """Pick a payment status uniformly at random.

    Note: this redefines the identical helper from the earlier helper cell.
    """
    choices = ('Delinquent', 'Current', 'Recovered')
    return random.choice(choices)

def generate_payment_interval():
    """Pick a payment interval uniformly at random.

    Note: this redefines the identical helper from the earlier helper cell.
    """
    choices = ('Monthly', 'Quarterly', 'Annually')
    return random.choice(choices)

In [62]:
# Function to introduce missing values
def introduce_missing_values(data, percentage):
    """Set a random share of the cells of ``data`` to NaN, in place.

    Exactly ``int(data.size * percentage)`` distinct cells (clamped to the range
    ``0..data.size``) are chosen across all rows and columns. Cells are sampled
    without replacement, so the requested count is not reduced by the same cell
    being hit twice.

    Args:
        data: DataFrame to modify. Column labels are assumed to be unique.
        percentage: Fraction of all cells to blank out (e.g. 0.05 for 5%).

    Returns:
        The same DataFrame object, with the chosen cells replaced by NaN.
        Integer columns that receive a NaN are upcast to float by ``Series.mask``.
    """
    total_values = data.size
    n_missing = min(max(int(total_values * percentage), 0), total_values)
    if n_missing == 0:
        return data

    # Pick distinct flat cell positions, then view them as a (rows, cols) mask.
    flat_mask = np.zeros(total_values, dtype=bool)
    flat_mask[random.sample(range(total_values), n_missing)] = True
    cell_mask = flat_mask.reshape(data.shape)

    # Replace whole columns instead of writing cell by cell: one vectorised
    # operation per column, and no NaN is assigned into an existing integer
    # column (the masked copy carries the upcast dtype).
    for col_pos, col_name in enumerate(data.columns):
        col_mask = cell_mask[:, col_pos]
        if col_mask.any():
            data[col_name] = data[col_name].mask(col_mask)
    return data

In [63]:
# Function to introduce bias
def introduce_bias(data, column, bias_value, bias_percentage):
    """Overwrite a random share of the rows in ``column`` with ``bias_value``, in place.

    ``int(len(data) * bias_percentage)`` distinct rows are chosen by position, so
    the function works for any index, not only the default 0..n-1 RangeIndex.

    Args:
        data: DataFrame to modify.
        column: Label of an existing column to overwrite.
        bias_value: Value written into the selected rows.
        bias_percentage: Fraction of rows to overwrite (e.g. 0.20 for 20%).

    Returns:
        The same DataFrame object.

    Raises:
        ValueError: If the computed row count is negative or exceeds the number
            of rows (raised by ``random.sample``).
        KeyError: If rows are to be written and ``column`` is not in ``data``.
    """
    n_rows = data.shape[0]
    n_bias = int(n_rows * bias_percentage)
    row_positions = random.sample(range(n_rows), n_bias)
    if row_positions:
        # The sampled integers are positions, so write through iloc; using them
        # as labels would add new rows on a non-integer index.
        data.iloc[row_positions, data.columns.get_loc(column)] = bias_value
    return data

In [64]:
# Function to remove or decrease bias
def remove_bias(data, column, bias_value, remove_percentage):
    """Blank out (set to NaN) some occurrences of ``bias_value`` in ``column``, in place.

    The number of entries removed is ``int(len(data) * remove_percentage)`` --
    a fraction of the total row count, not of the matching rows -- capped at
    the number of rows that currently hold ``bias_value``.

    Returns the same DataFrame object.
    """
    quota = int(data.shape[0] * remove_percentage)
    is_match = (data[column] == bias_value).to_numpy()
    matching_labels = data.index[is_match].tolist()
    sample_size = min(quota, len(matching_labels))
    chosen_labels = random.sample(matching_labels, sample_size)
    for label in chosen_labels:
        data.at[label, column] = np.nan
    return data

In [65]:
# Function to introduce skewness
def introduce_skew(data, column, skew_type='positive', scale=5000):
    """Skew a numeric column by adding or subtracting exponential noise, in place.

    Args:
        data: DataFrame to modify.
        column: Label of the numeric column to skew.
        skew_type: 'positive' adds the exponential draws to the column,
            'negative' subtracts them.
        scale: Scale parameter passed to ``np.random.exponential``
            (default 5000).

    Returns:
        The same DataFrame object.

    Raises:
        ValueError: If ``skew_type`` is neither 'positive' nor 'negative'. The
            check happens before anything is modified.
    """
    if skew_type not in ('positive', 'negative'):
        raise ValueError(
            f"skew_type must be 'positive' or 'negative', got {skew_type!r}"
        )
    shift = np.random.exponential(scale=scale, size=data.shape[0])
    if skew_type == 'negative':
        shift = -shift
    data[column] = data[column] + shift
    return data

In [66]:
# Function to introduce outliers
def introduce_outliers(data, column, outlier_percentage=0.01):
    """Replace a random share of the rows in ``column`` with an extreme value, in place.

    The outlier value is ten times the column's maximum, computed before any
    row is overwritten. ``int(len(data) * outlier_percentage)`` distinct rows are
    chosen by position, so the function works for any index, not only the
    default 0..n-1 RangeIndex.

    Args:
        data: DataFrame to modify.
        column: Label of an existing numeric column.
        outlier_percentage: Fraction of rows to overwrite (default 0.01).

    Returns:
        The same DataFrame object.

    Raises:
        ValueError: If the computed row count is negative or exceeds the number
            of rows (raised by ``random.sample``).
    """
    n_rows = data.shape[0]
    n_outliers = int(n_rows * outlier_percentage)
    row_positions = random.sample(range(n_rows), n_outliers)
    outlier_value = data[column].max() * 10
    # The sampled integers are positions, hence iloc rather than label-based loc.
    data.iloc[row_positions, data.columns.get_loc(column)] = outlier_value
    return data

In [67]:
# Function to add random noise
def introduce_random_noise(data, column, noise_level=0.1):
    """Add zero-mean Gaussian noise to ``column``, in place.

    The noise standard deviation is ``noise_level`` times the column's own
    standard deviation. Returns the same DataFrame object.
    """
    n_rows = data.shape[0]
    sigma = data[column].std() * noise_level
    jitter = np.random.normal(0, sigma, size=n_rows)
    data[column] = data[column].add(jitter)
    return data

In [68]:
# Generating 2nd data
# Same schema and generators as data_1, with n2_records rows. Nothing is
# distorted at this point: the missing values, bias, noise, skew and outliers
# are applied to the resulting DataFrame in the cells that follow.
# NOTE(review): as in data_1, Contract_End_Date is sampled independently of
# Contract_Start_Date and Contract_Term, so the three need not be consistent.
data_2 = {
    'Customer_ID': [generate_customer_id() for _ in range(n2_records)],
    'Contract_Term': [random.randint(12, 96) for _ in range(n2_records)],
    'Contract_Start_Date': [fake.date_between(start_date='-16y', end_date='-1y') for _ in range(n2_records)],
    'Contract_End_Date': [fake.date_between(start_date='-3y', end_date='-1y') for _ in range(n2_records)],
    'Cost_Amount_GBP': [round(random.uniform(1000, 900900), 2) for _ in range(n2_records)],
    'Regulatory_Compliance': [random.choice(['Yes', 'No']) for _ in range(n2_records)],
    'Customer_Category': [generate_customer_category() for _ in range(n2_records)],
    'Exposure_Amount_GBP': [round(random.uniform(1000, 69000), 2) for _ in range(n2_records)],
    'Contract_Status': [generate_contract_status() for _ in range(n2_records)],
    'Assistance_Flag': [random.choice(['Yes', 'No']) for _ in range(n2_records)],
    'Risk_Flag': [random.choice(['Yes', 'No']) for _ in range(n2_records)],
    'Payment_Status': [generate_payment_status() for _ in range(n2_records)],
    'Forbearance_Amount_GBP': [round(random.uniform(0, 2000), 2) for _ in range(n2_records)],
    'Payment_Interval': [generate_payment_interval() for _ in range(n2_records)],
    'Late_Payment_Fees_GBP': [round(random.uniform(0, 990), 2) for _ in range(n2_records)],
    'Total_Arrears_GBP': [round(random.uniform(0, 19700), 2) for _ in range(n2_records)]
}

In [69]:
# Creating DataFrame 2: each key of data_2 becomes a column, each list its values
df_2 = pd.DataFrame(data_2)

In [70]:
# Introducing missing values: target 5% of all cells in df_2 (any row, any column) set to NaN
df_2 = introduce_missing_values(df_2, 0.05)

In [71]:
# Introducing bias in 'Customer_Category' column: a random 20% of rows are overwritten
# with 'Individual', making that category appear more frequently
df_2 = introduce_bias(df_2, 'Customer_Category', 'Individual', 0.20)

In [72]:
# Replacing some 'Quarterly' values in 'Payment_Interval' with NaN: the number replaced is
# 10% of the total row count (not 10% of the 'Quarterly' rows), capped at the number of
# 'Quarterly' rows present
df_2 = remove_bias(df_2, 'Payment_Interval', 'Quarterly', 0.10)

In [73]:
# Introducing some distortion in 'Forbearance_Amount_GBP': add zero-mean Gaussian noise
# whose standard deviation is 10% of the column's own standard deviation
df_2 = introduce_random_noise(df_2, 'Forbearance_Amount_GBP', noise_level=0.1)

In [74]:
# Introduce positive skewness to the 'Cost_Amount_GBP' column by adding an
# exponentially distributed amount to every row
df_2 = introduce_skew(df_2, 'Cost_Amount_GBP', skew_type='positive')

In [75]:
# Introduce outliers to 'Total_Arrears_GBP' column: a random 7% of rows are set to
# ten times the column's current maximum
df_2 = introduce_outliers(df_2, 'Total_Arrears_GBP', outlier_percentage=0.07)

In [76]:
# Displaying sample df_2 and summary
print(df_2.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line afterwards.
df_2.info()

  Customer_ID  Contract_Term Contract_Start_Date Contract_End_Date  \
0   JZM603287           64.0          2010-08-05        2021-08-22   
1   JWK625710           67.0          2018-01-04        2022-10-16   
2   BQD652277           43.0          2020-04-25        2023-08-02   
3   RQJ989396           51.0          2013-08-17        2023-02-27   
4   XUJ883199           66.0          2019-11-15        2023-01-23   

   Cost_Amount_GBP Regulatory_Compliance    Customer_Category  \
0    339782.683478                    No           Individual   
1    270733.814649                    No          Corporation   
2    347307.503340                   Yes                  LLC   
3    648842.497319                   Yes       Small Business   
4    453215.110767                    No  Sole Proprietorship   

   Exposure_Amount_GBP Contract_Status Assistance_Flag Risk_Flag  \
0             61302.26          Active              No       Yes   
1             26155.05          Active              

In [77]:
# Saving df_2 to pkl (relative path, so the file is written to the current working directory)
df_2.to_pickle('historical_arrears_data_2.pkl')