In [9]:
# imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [10]:
# Loading the training and testing sets for df_1 with error handling and squeezing single-column DataFrames
try:
    X_train_1 = pd.read_csv('X_train_1.csv', on_bad_lines='warn')
    X_test_1 = pd.read_csv('X_test_1.csv', on_bad_lines='warn')
    y_train_1 = pd.read_csv('y_train_1.csv', on_bad_lines='warn').squeeze()
    y_test_1 = pd.read_csv('y_test_1.csv', on_bad_lines='warn').squeeze()

    X_train_2 = pd.read_csv('X_train_2.csv', on_bad_lines='warn')
    X_test_2 = pd.read_csv('X_test_2.csv', on_bad_lines='warn')
    y_train_2 = pd.read_csv('y_train_2.csv', on_bad_lines='warn').squeeze()
    y_test_2 = pd.read_csv('y_test_2.csv', on_bad_lines='warn').squeeze()

    print("Data loaded successfully")
except pd.errors.ParserError as e:
    print("Error reading CSV files:", e)

Data loaded successfully


In [12]:
print(y_train_1.head())
print(y_test_1.head())

0    10777.99
1     5147.89
2     2877.48
3    12727.60
4    18628.53
Name: Total_Arrears_GBP, dtype: float64
0    11572.93
1    17878.57
2     2714.68
3      191.61
4     8127.22
Name: Total_Arrears_GBP, dtype: float64


In [13]:
# Threshold for binarization
threshold_1 = y_train_1.median()
threshold_2 = y_train_2.median()

# Convert to binary categories
y_train_1 = (y_train_1 > threshold_1).astype(int)
y_test_1 = (y_test_1 > threshold_1).astype(int)
y_train_2 = (y_train_2 > threshold_2).astype(int)
y_test_2 = (y_test_2 > threshold_2).astype(int)

In [14]:
# Initialising models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

In [15]:
# Function to train and evaluate models
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"{name} Test Results")
        print(classification_report(y_test, y_pred))
        print("ROC-AUC:", roc_auc_score(y_test, y_pred))
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("="*60)


In [16]:
# Training and evaluating models on df_1
print("Training and evaluating models on df_1")
train_and_evaluate(models, X_train_1, y_train_1, X_test_1, y_test_1)

Training and evaluating models on df_1
Logistic Regression Test Results
              precision    recall  f1-score   support

           0       0.50      0.50      0.50     70032
           1       0.50      0.50      0.50     69968

    accuracy                           0.50    140000
   macro avg       0.50      0.50      0.50    140000
weighted avg       0.50      0.50      0.50    140000

ROC-AUC: 0.49988729467032444
Confusion Matrix:
 [[34766 35266]
 [34750 35218]]
Random Forest Test Results
              precision    recall  f1-score   support

           0       0.50      0.53      0.51     70032
           1       0.50      0.47      0.48     69968

    accuracy                           0.50    140000
   macro avg       0.50      0.50      0.50    140000
weighted avg       0.50      0.50      0.50    140000

ROC-AUC: 0.4984633580462201
Confusion Matrix:
 [[37239 32793]
 [37420 32548]]
Gradient Boosting Test Results
              precision    recall  f1-score   support

    

In [17]:
# Training and evaluating models on df_2
print("Training and evaluating models on df_2")
train_and_evaluate(models, X_train_2, y_train_2, X_test_2, y_test_2)

Training and evaluating models on df_2
Logistic Regression Test Results


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.53      1.00      0.69     69983
           1       0.00      0.00      0.00     63164

    accuracy                           0.53    133147
   macro avg       0.26      0.50      0.34    133147
weighted avg       0.28      0.53      0.36    133147

ROC-AUC: 0.5
Confusion Matrix:
 [[69983     0]
 [63164     0]]
Random Forest Test Results
              precision    recall  f1-score   support

           0       0.53      0.67      0.59     69983
           1       0.47      0.33      0.39     63164

    accuracy                           0.51    133147
   macro avg       0.50      0.50      0.49    133147
weighted avg       0.50      0.51      0.49    133147

ROC-AUC: 0.49983909340773497
Confusion Matrix:
 [[46905 23078]
 [42355 20809]]
Gradient Boosting Test Results
              precision    recall  f1-score   support

           0       0.53      0.99      0.69     69983
           1       0.45      0.00    