In [None]:
# Imports
import pandas as pd
import numpy as np

In [None]:
# Read the arrears dataset from its pickle file into `df`.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from a trusted source - confirm its provenance.
df = pd.read_pickle(filepath_or_buffer='arrears_data.pkl')

In [None]:
# Displaying & checking df: column dtypes / non-null counts, then a preview
# of the first rows.
# df.info() prints its report itself, whereas df.head() is a bare expression
# that IPython only renders by default when it is the last statement of the
# cell - so the preview has to come last for both results to be shown.
df.info()
df.head()

  Customer_ID  Contract_Term Contract_Start_Date Contract_End_Date  \
0   FGM266208             24          2021-10-24        2023-10-24   
1   SMG503835             16          2022-04-08        2023-08-08   
2   LLO450747             12          2021-11-06        2022-11-06   
3   ZZQ704582             16          2020-11-11        2022-03-11   
4   ZIS532090             30          2023-04-17        2025-10-17   

   Cost_Amount_GBP Regulatory_Compliance    Customer_Category  \
0          5996.88                   Yes                  LLC   
1         33732.95                    No  Sole Proprietorship   
2         28786.90                    No  Sole Proprietorship   
3         46503.75                    No  Sole Proprietorship   
4         32622.67                   Yes          Corporation   

   Exposure_Amount_GBP Contract_Status Assistance_Flag Risk_Flag  \
0              2804.97          Active             Yes        No   
1             13383.33          Closed             Y

In [None]:
# Identifying missing values and duplicated rows.
# Both results are printed explicitly: a bare expression that is not the last
# statement of a cell is not rendered under IPython's default settings, so
# the per-column null counts would otherwise be silently discarded.
print(df.isnull().sum())      # number of null entries in each column
print(df.duplicated().sum())  # number of rows that repeat an earlier row

Customer_ID               0
Contract_Term             0
Contract_Start_Date       0
Contract_End_Date         0
Cost_Amount_GBP           0
Regulatory_Compliance     0
Customer_Category         0
Exposure_Amount_GBP       0
Contract_Status           0
Assistance_Flag           0
Risk_Flag                 0
Payment_Status            0
Forbearance_Amount_GBP    0
Payment_Interval          0
Late_Payment_Fees_GBP     0
Total_Arrears_GBP         0
dtype: int64

0


In [None]:
# Coerce the term / amount columns to numeric dtypes.
numeric_columns = [
    'Contract_Term',
    'Cost_Amount_GBP',
    'Exposure_Amount_GBP',
    'Forbearance_Amount_GBP',
    'Late_Payment_Fees_GBP',
    'Total_Arrears_GBP',
]

# pd.to_numeric is applied to each listed column in turn; with its default
# error handling it raises if a value cannot be parsed as a number.
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

In [None]:
# Cast Contract_Term to a 64-bit integer dtype
df['Contract_Term'] = df['Contract_Term'].astype(np.int64)

In [None]:
# Store the label-like columns using pandas' categorical dtype
categorical_columns = [
    'Regulatory_Compliance',
    'Customer_Category',
    'Contract_Status',
    'Assistance_Flag',
    'Risk_Flag',
    'Payment_Status',
    'Payment_Interval',
]

# Each listed column is converted separately and written back under its
# original name.
df[categorical_columns] = df[categorical_columns].astype('category')

In [None]:
# Dropping unwanted columns (the contract start / end dates).
# The result is re-assigned instead of using inplace=True, and
# errors='ignore' skips labels that are already absent, so re-running this
# cell after the columns have been removed no longer raises a KeyError.
df = df.drop(columns=['Contract_Start_Date', 'Contract_End_Date'], errors='ignore')

In [None]:
# Final check on data types and missing values, followed by a preview of the
# first rows.
# df.info() prints the dtype / non-null report itself; df.head() is a bare
# expression that IPython only renders by default when it is the last
# statement of the cell, so it is placed last to make sure both are shown.
df.info()
df.head()

  Customer_ID  Contract_Term  Cost_Amount_GBP Regulatory_Compliance  \
0   FGM266208             24          5996.88                   Yes   
1   SMG503835             16         33732.95                    No   
2   LLO450747             12         28786.90                    No   
3   ZZQ704582             16         46503.75                    No   
4   ZIS532090             30         32622.67                   Yes   

     Customer_Category  Exposure_Amount_GBP Contract_Status Assistance_Flag  \
0                  LLC              2804.97          Active             Yes   
1  Sole Proprietorship             13383.33          Closed             Yes   
2  Sole Proprietorship             20726.77         Expired             Yes   
3  Sole Proprietorship             14817.61          Closed              No   
4          Corporation             14472.43          Active             Yes   

  Risk_Flag Payment_Status  Forbearance_Amount_GBP Payment_Interval  \
0        No      Recovered 

In [None]:
# Save the cleaned dataset as a pickle file so that the DataFrame's structure
# and column dtypes are kept when it is loaded again
df.to_pickle(path='cleaned_arrears_data.pkl')