In [23]:
# Install Faker with the %pip magic so the package lands in the environment of
# the running kernel (a plain `!pip` shell call may target a different Python).
%pip install -q faker



In [136]:
# Imports
import pandas as pd
import numpy as np
from faker import Faker
import random
import string

In [137]:
# Initialising Faker
# Seed every random source this notebook draws from (the stdlib `random`
# module, NumPy's global generator and Faker's shared generator) so that a
# fresh Restart & Run All regenerates exactly the same datasets.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)
fake = Faker()

In [138]:
# Constants
# Number of rows generated for the first dataset (data_1 / df_1).
n1_records = 700_000

In [139]:
# Helper functions
def generate_customer_id():
    """Return a random customer ID: three uppercase ASCII letters followed by six digits."""
    # Letters are drawn before digits, so a seeded `random` produces the same
    # sequence of IDs as drawing the two parts separately in that order.
    prefix = random.choices(string.ascii_uppercase, k=3)
    suffix = random.choices(string.digits, k=6)
    return ''.join(prefix + suffix)

def generate_customer_category():
    """Pick one customer category at random from the five supported labels."""
    categories = ['Corporation', 'Individual', 'Small Business', 'Sole Proprietorship', 'LLC']
    return random.choice(categories)

def generate_contract_status():
    """Pick one contract status at random: 'Closed', 'Expired' or 'Active'."""
    statuses = ['Closed', 'Expired', 'Active']
    return random.choice(statuses)

def generate_payment_status():
    """Pick one payment status at random: 'Delinquent', 'Current' or 'Recovered'."""
    statuses = ['Delinquent', 'Current', 'Recovered']
    return random.choice(statuses)

def generate_payment_interval():
    """Pick one payment interval at random: 'Monthly', 'Quarterly' or 'Annual'."""
    intervals = ['Monthly', 'Quarterly', 'Annual']
    return random.choice(intervals)

In [140]:
# Generating data
# Each column is an independent list of n1_records random draws. Columns are
# filled one after another in the order below, which is also the column order
# of the resulting dict.
# NOTE(review): Contract_End_Date is drawn independently of Contract_Start_Date
# and Contract_Term, so a row's end date can precede its start date — confirm
# that this is intended for the synthetic data.
data_1 = {}
data_1['Customer_ID'] = [generate_customer_id() for _ in range(n1_records)]
data_1['Contract_Term'] = [random.randint(12, 96) for _ in range(n1_records)]
data_1['Contract_Start_Date'] = [fake.date_between(start_date='-16y', end_date='-1y') for _ in range(n1_records)]
data_1['Contract_End_Date'] = [fake.date_between(start_date='-3y', end_date='-1y') for _ in range(n1_records)]
data_1['Cost_Amount_GBP'] = [round(random.uniform(1000, 900900), 2) for _ in range(n1_records)]
data_1['Regulatory_Compliance'] = [random.choice(['Yes', 'No']) for _ in range(n1_records)]
data_1['Customer_Category'] = [generate_customer_category() for _ in range(n1_records)]
data_1['Exposure_Amount_GBP'] = [round(random.uniform(1000, 69000), 2) for _ in range(n1_records)]
data_1['Contract_Status'] = [generate_contract_status() for _ in range(n1_records)]
data_1['Assistance_Flag'] = [random.choice(['Yes', 'No']) for _ in range(n1_records)]
data_1['Risk_Flag'] = [random.choice(['Yes', 'No']) for _ in range(n1_records)]
data_1['Payment_Status'] = [generate_payment_status() for _ in range(n1_records)]
data_1['Forbearance_Amount_GBP'] = [round(random.uniform(0, 2000), 2) for _ in range(n1_records)]
data_1['Payment_Interval'] = [generate_payment_interval() for _ in range(n1_records)]
data_1['Late_Payment_Fees_GBP'] = [round(random.uniform(0, 990), 2) for _ in range(n1_records)]
data_1['Total_Arrears_GBP'] = [round(random.uniform(0, 19700), 2) for _ in range(n1_records)]

In [141]:
# Creating DataFrame
# One column per key of data_1; the lists all have n1_records entries, so the
# frame has that many rows.
df_1 = pd.DataFrame(data=data_1)

In [142]:
# Displaying sample data_1 and summary
print(df_1.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_1.info()

  Customer_ID  Contract_Term Contract_Start_Date Contract_End_Date  \
0   CCO906427             49          2009-07-17        2023-04-10   
1   VDS982769             21          2010-06-06        2021-08-21   
2   KEI061466             89          2021-05-10        2022-12-21   
3   WZR191205             31          2016-11-19        2023-06-03   
4   AVI396485             22          2022-07-22        2023-06-22   

   Cost_Amount_GBP Regulatory_Compliance    Customer_Category  \
0        768588.66                    No       Small Business   
1        783761.81                    No  Sole Proprietorship   
2        347810.22                   Yes       Small Business   
3        364426.31                    No          Corporation   
4        821136.75                    No  Sole Proprietorship   

   Exposure_Amount_GBP Contract_Status Assistance_Flag Risk_Flag  \
0             36938.00         Expired             Yes        No   
1             23580.01          Active             Y

In [143]:
# Saving to CSV
# index=False keeps the row index out of the written file.
df_1.to_csv(path_or_buf='historical_arrears_data_1.csv', index=False)

In [144]:
# Constants
# Number of rows generated for the second dataset (data_2 / df_2).
n2_records = 700_000

In [145]:
# Helper functions
def generate_customer_id():
    """Build a 9-character customer ID: 3 random uppercase letters, then 6 random digits.

    NOTE(review): an earlier cell of this notebook already defines a helper
    with this name and the same behaviour; this later definition replaces it
    when the cell runs, so the duplicate could be dropped.
    """
    letter_part = ''.join(random.choices(string.ascii_uppercase, k=3))
    digit_part = ''.join(random.choices(string.digits, k=6))
    return f'{letter_part}{digit_part}'

def generate_customer_category():
    """Return a randomly chosen customer category label.

    NOTE(review): duplicates the helper of the same name defined in an earlier
    cell of this notebook (same behaviour); the duplicate could be dropped.
    """
    options = ('Corporation', 'Individual', 'Small Business', 'Sole Proprietorship', 'LLC')
    picked = random.choice(options)
    return picked

def generate_contract_status():
    """Return a randomly chosen contract status label.

    NOTE(review): duplicates the helper of the same name defined in an earlier
    cell of this notebook (same behaviour); the duplicate could be dropped.
    """
    options = ('Closed', 'Expired', 'Active')
    picked = random.choice(options)
    return picked

def generate_payment_status():
    """Return a randomly chosen payment status label.

    NOTE(review): duplicates the helper of the same name defined in an earlier
    cell of this notebook (same behaviour); the duplicate could be dropped.
    """
    options = ('Delinquent', 'Current', 'Recovered')
    picked = random.choice(options)
    return picked

def generate_payment_interval():
    """Return a randomly chosen payment interval label.

    NOTE(review): duplicates the helper of the same name defined in an earlier
    cell of this notebook (same behaviour); the duplicate could be dropped.
    """
    options = ('Monthly', 'Quarterly', 'Annual')
    picked = random.choice(options)
    return picked

In [146]:
# Function to introduce missing values
def introduce_missing_values(data, percentage):
    """Blank out a random fraction of the cells of ``data``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated in place and also returned.
    percentage : float
        Fraction of all cells (rows x columns) to blank, e.g. ``0.05`` for 5%.
        Values that would select fewer than zero or more than all cells are
        clamped.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``int(data.size * percentage)`` distinct
        cells set to a missing value.
    """
    n_rows, n_cols = data.shape
    total_values = n_rows * n_cols
    n_missing = min(max(int(total_values * percentage), 0), total_values)
    if n_missing == 0:
        return data

    # Sample flat cell positions WITHOUT replacement. Drawing (row, col) pairs
    # independently can hit the same cell more than once, which leaves fewer
    # missing cells than requested.
    flat_positions = random.sample(range(total_values), n_missing)
    blank = np.zeros(total_values, dtype=bool)
    blank[flat_positions] = True
    blank = blank.reshape(n_rows, n_cols)

    # Replace each affected column as a whole instead of writing NaN cell by
    # cell: Series.mask returns a new column with a dtype that can hold the
    # missing values (e.g. int -> float), whereas recent pandas versions warn
    # about scalar NaN writes into integer columns. It also avoids one Python
    # level write per blanked cell.
    for col_pos in np.flatnonzero(blank.any(axis=0)):
        col_pos = int(col_pos)
        masked_column = data.iloc[:, col_pos].mask(blank[:, col_pos])
        data.isetitem(col_pos, masked_column)
    return data

In [147]:
# Function to introduce bias
def introduce_bias(data, column, bias_value, bias_percentage):
    """Overwrite a random share of the rows of ``column`` with ``bias_value``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated in place and also returned.
    column : label
        Name of an existing column of ``data``.
    bias_value : object
        Value written into the selected rows.
    bias_percentage : float
        Fraction of rows to overwrite; ``int(n_rows * bias_percentage)`` rows
        are chosen without replacement.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.

    Raises
    ------
    ValueError
        From ``random.sample`` when the requested row count is negative or
        larger than the number of rows.
    KeyError
        When ``column`` is not a column of ``data``.
    """
    n_rows = data.shape[0]
    n_bias = int(n_rows * bias_percentage)
    row_positions = random.sample(range(n_rows), n_bias)
    col_pos = data.columns.get_loc(column)
    if row_positions:
        # The sampled numbers are row POSITIONS, so they are written through
        # .iloc. Using them as index labels is only correct for a default
        # 0..n-1 index and can otherwise hit the wrong rows or append new ones.
        # A single vectorised assignment also replaces the per-row loop.
        data.iloc[row_positions, col_pos] = bias_value
    return data

In [148]:
# Generating data with bias
# Same column layout as the first dataset, sized by n2_records. Keyword
# arguments are evaluated left to right, so the columns are generated (and
# stored) in the order listed.
# NOTE(review): Contract_End_Date is drawn independently of Contract_Start_Date
# and Contract_Term, so a row's end date can precede its start date — confirm
# that this is intended for the synthetic data.
data_2 = dict(
    Customer_ID=[generate_customer_id() for _ in range(n2_records)],
    Contract_Term=[random.randint(12, 96) for _ in range(n2_records)],
    Contract_Start_Date=[fake.date_between(start_date='-16y', end_date='-1y') for _ in range(n2_records)],
    Contract_End_Date=[fake.date_between(start_date='-3y', end_date='-1y') for _ in range(n2_records)],
    Cost_Amount_GBP=[round(random.uniform(1000, 900900), 2) for _ in range(n2_records)],
    Regulatory_Compliance=[random.choice(['Yes', 'No']) for _ in range(n2_records)],
    Customer_Category=[generate_customer_category() for _ in range(n2_records)],
    Exposure_Amount_GBP=[round(random.uniform(1000, 69000), 2) for _ in range(n2_records)],
    Contract_Status=[generate_contract_status() for _ in range(n2_records)],
    Assistance_Flag=[random.choice(['Yes', 'No']) for _ in range(n2_records)],
    Risk_Flag=[random.choice(['Yes', 'No']) for _ in range(n2_records)],
    Payment_Status=[generate_payment_status() for _ in range(n2_records)],
    Forbearance_Amount_GBP=[round(random.uniform(0, 2000), 2) for _ in range(n2_records)],
    Payment_Interval=[generate_payment_interval() for _ in range(n2_records)],
    Late_Payment_Fees_GBP=[round(random.uniform(0, 990), 2) for _ in range(n2_records)],
    Total_Arrears_GBP=[round(random.uniform(0, 19700), 2) for _ in range(n2_records)],
)

In [149]:
# Creating DataFrame
# One column per key of data_2; the lists all have n2_records entries, so the
# frame has that many rows.
df_2 = pd.DataFrame(data=data_2)

In [150]:
# Introducing missing values in 5% of the data_2
# The helper mutates the frame it is given and returns that same object, so
# the rebinding keeps df_2 pointing at the modified frame.
df_2 = introduce_missing_values(data=df_2, percentage=0.05)

In [151]:
# Introducing bias in 'Customer_Category' column, making 'Individual' appear more frequently
# 20% of the rows, chosen at random, have their category overwritten.
df_2 = introduce_bias(
    data=df_2,
    column='Customer_Category',
    bias_value='Individual',
    bias_percentage=0.20,
)

In [152]:
# Introducing some distortion in 'Cost_Amount_GBP' by adding random noise
# One Gaussian draw per row: mean 0, standard deviation 1000.
# NOTE(review): the column is overwritten with its noised version, so running
# this cell again adds a further layer of noise on top.
noise = np.random.normal(loc=0, scale=1000, size=len(df_2))
df_2['Cost_Amount_GBP'] = df_2['Cost_Amount_GBP'].add(noise)

In [153]:
# Displaying sample data_2 and summary
print(df_2.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_2.info()

  Customer_ID  Contract_Term Contract_Start_Date Contract_End_Date  \
0   DAO868056           57.0          2013-06-04        2022-09-14   
1   DFH727953           23.0          2010-07-26               NaN   
2   MZS642561           45.0          2013-09-28        2022-04-07   
3         NaN           47.0          2011-05-23        2022-07-28   
4   EOR960450           91.0          2012-04-15        2022-05-16   

   Cost_Amount_GBP Regulatory_Compliance    Customer_Category  \
0    646506.929448                   Yes           Individual   
1    685775.023289                    No  Sole Proprietorship   
2     16855.773178                   Yes       Small Business   
3    183961.557988                   Yes           Individual   
4    523991.596824                    No           Individual   

   Exposure_Amount_GBP Contract_Status Assistance_Flag Risk_Flag  \
0                  NaN          Active             Yes        No   
1             55376.14          Active              

In [154]:
# Saving to CSV
# index=False keeps the row index out of the written file.
df_2.to_csv(path_or_buf='historical_arrears_data_2.csv', index=False)

In [157]:
# Hand the first CSV to the browser for download (Colab-only helper).
from google.colab import files
# Plain string literal: the path has no placeholders, so no f-string is needed.
files.download("./historical_arrears_data_1.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [158]:
# Hand the second CSV to the browser for download (Colab-only helper).
from google.colab import files
# Plain string literal: the path has no placeholders, so no f-string is needed.
files.download("./historical_arrears_data_2.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [164]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [165]:
# Clone the Arrears repository into the current working directory.
# NOTE(review): the working directory reported after the following %cd is
# /content/Arrears/Arrears, which suggests this ran from inside an earlier
# clone — confirm the intended location before re-running.
!git clone https://github.com/Tengey/Arrears.git

Cloning into 'Arrears'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects:  14% (1/7)[Kremote: Counting objects:  28% (2/7)[Kremote: Counting objects:  42% (3/7)[Kremote: Counting objects:  57% (4/7)[Kremote: Counting objects:  71% (5/7)[Kremote: Counting objects:  85% (6/7)[Kremote: Counting objects: 100% (7/7)[Kremote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects:  20% (1/5)[Kremote: Compressing objects:  40% (2/5)[Kremote: Compressing objects:  60% (3/5)[Kremote: Compressing objects:  80% (4/5)[Kremote: Compressing objects: 100% (5/5)[Kremote: Compressing objects: 100% (5/5), done.[K
remote: Total 7 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects:  14% (1/7)Receiving objects:  28% (2/7)Receiving objects:  42% (3/7)Receiving objects:  57% (4/7)Receiving objects:  71% (5/7)Receiving objects:  85% (6/7)Receiving objects: 100% (7/7)Receiving objects: 100% (7/7), done.


In [166]:
# Change the kernel's working directory into the cloned repository. The path
# is relative, so the result depends on where the kernel currently is.
%cd Arrears

/content/Arrears/Arrears


In [169]:
# Show the kernel's current working directory.
%pwd

'/content/Arrears/Arrears'

In [170]:
# List everything in the current directory, including hidden entries such as .git.
!ls -al

total 16
drwxr-xr-x 3 root root 4096 Jul 17 00:10 .
drwxr-xr-x 4 root root 4096 Jul 17 00:10 ..
drwxr-xr-x 8 root root 4096 Jul 17 00:12 .git
-rw-r--r-- 1 root root 2162 Jul 17 00:10 README.md
