In [39]:
# Imports
import pandas as pd
import numpy as np

In [40]:
# Read the two historical arrears CSV extracts into df_1 and df_2 (in that order)
df_1, df_2 = map(
    pd.read_csv,
    ('historical_arrears_data_1.csv', 'historical_arrears_data_2.csv'),
)

In [41]:
# Report, for each dataset in turn, how many missing values every column has
for frame in (df_1, df_2):
    print(frame.isna().sum())

Customer_ID               0
Contract_Term             0
Contract_Start_Date       0
Contract_End_Date         0
Cost_Amount_GBP           0
Regulatory_Compliance     0
Customer_Category         0
Exposure_Amount_GBP       0
Contract_Status           0
Assistance_Flag           0
Risk_Flag                 0
Payment_Status            0
Forbearance_Amount_GBP    0
Payment_Interval          0
Late_Payment_Fees_GBP     0
Total_Arrears_GBP         0
dtype: int64
Customer_ID               34265
Contract_Term             34208
Contract_Start_Date       34129
Contract_End_Date         34239
Cost_Amount_GBP           34070
Regulatory_Compliance     34213
Customer_Category         27306
Exposure_Amount_GBP       34044
Contract_Status           34099
Assistance_Flag           34194
Risk_Flag                 34254
Payment_Status            33671
Forbearance_Amount_GBP    34261
Payment_Interval          34402
Late_Payment_Fees_GBP     34013
Total_Arrears_GBP         34145
dtype: int64


In [42]:
# Rows without a Customer_ID are discarded from both datasets
id_subset = ['Customer_ID']
df_1, df_2 = (frame.dropna(subset=id_subset) for frame in (df_1, df_2))

In [43]:
# Impute missing values: numeric columns get the column median, every other
# column gets its most frequent value (mode).
# df_1 reported zero missing values in the check above, so in practice only
# df_2 changes; df_1 goes through the same step so both frames are treated alike.

def fill_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Numeric columns are filled with their median; all other columns are
    filled with their first mode. A column is left unchanged when it has no
    missing values, or when no fill value can be derived from it (for
    example, every entry is missing).

    The fill is applied with a single ``DataFrame.fillna`` call on the frame
    rather than ``df[col].fillna(..., inplace=True)``: the latter is chained
    in-place modification, which pandas warns about and which does not update
    the parent frame when Copy-on-Write is enabled.
    """
    fill_values = {}
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.median()
            if pd.isna(fill_value):  # nothing but missing values in this column
                continue
        else:
            modes = column.mode()
            if modes.empty:  # nothing but missing values in this column
                continue
            fill_value = modes.iloc[0]
        fill_values[col] = fill_value

    if not fill_values:
        return df.copy()
    return df.fillna(value=fill_values)


df_1 = fill_missing_values(df_1)
df_2 = fill_missing_values(df_2)

In [44]:
# Converting numeric columns to the correct data type.
# errors='coerce' turns entries that cannot be parsed into NaN. Those NaNs are
# created after the median/mode imputation has already run, so they are
# re-filled here with the column median (and reported); otherwise they would
# break the int64 cast of Contract_Term or end up in the saved output.
numeric_columns = ['Contract_Term', 'Cost_Amount_GBP', 'Exposure_Amount_GBP', 'Forbearance_Amount_GBP', 'Late_Payment_Fees_GBP', 'Total_Arrears_GBP']

for frame_name, frame in (('df_1', df_1), ('df_2', df_2)):
    for col in numeric_columns:
        already_missing = frame[col].isna()
        converted = pd.to_numeric(frame[col], errors='coerce')
        # Only values that became NaN because of the coercion are touched.
        newly_missing = converted.isna() & ~already_missing
        if newly_missing.any():
            print(f"{frame_name}.{col}: {int(newly_missing.sum())} unparseable value(s) replaced with the median")
            converted = converted.mask(newly_missing, converted.median())
        frame[col] = converted

In [45]:
# Cast Contract_Term to 64-bit integers in both frames
for frame in (df_1, df_2):
    frame['Contract_Term'] = frame['Contract_Term'].astype('int64')

In [46]:
# Store the label-valued columns of both frames with the pandas 'category' dtype
categorical_columns = [
    'Regulatory_Compliance',
    'Customer_Category',
    'Contract_Status',
    'Assistance_Flag',
    'Risk_Flag',
    'Payment_Status',
    'Payment_Interval',
]

for frame in (df_1, df_2):
    for label_col in categorical_columns:
        frame[label_col] = frame[label_col].astype('category')

In [47]:
# Remove the contract start/end date columns from both frames (in place)
for frame in (df_1, df_2):
    frame.drop(columns=['Contract_Start_Date', 'Contract_End_Date'], inplace=True)

In [48]:
# Final check of data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df_1.info()
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   Customer_ID             700000 non-null  object  
 1   Contract_Term           700000 non-null  int64   
 2   Cost_Amount_GBP         700000 non-null  float64 
 3   Regulatory_Compliance   700000 non-null  category
 4   Customer_Category       700000 non-null  category
 5   Exposure_Amount_GBP     700000 non-null  float64 
 6   Contract_Status         700000 non-null  category
 7   Assistance_Flag         700000 non-null  category
 8   Risk_Flag               700000 non-null  category
 9   Payment_Status          700000 non-null  category
 10  Forbearance_Amount_GBP  700000 non-null  float64 
 11  Payment_Interval        700000 non-null  category
 12  Late_Payment_Fees_GBP   700000 non-null  float64 
 13  Total_Arrears_GBP       700000 non-null  float64 
dtypes: c

In [49]:
# Write each cleaned frame to its own CSV file, without the index column
for frame, out_name in (
    (df_1, 'cleaned_historical_arrears_data_1.csv'),
    (df_2, 'cleaned_historical_arrears_data_2.csv'),
):
    frame.to_csv(out_name, index=False)