In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [27]:
df=pd.read_csv('shared/complaints_25Nov21.csv')
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2016-10-26,Money transfers,International money transfer,Other transaction issues,,"To whom it concerns, I would like to file a fo...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",,,,Consent provided,Web,2016-10-29,Closed with explanation,Yes,No,2180490
1,2015-03-27,Bank account or service,Other bank product/service,"Account opening, closing, or management",,My name is XXXX XXXX XXXX and huband name is X...,Company chooses not to provide a public response,"CITIBANK, N.A.",PA,151XX,Older American,Consent provided,Web,2015-03-27,Closed with explanation,Yes,No,1305453
2,2015-04-20,Bank account or service,Other bank product/service,"Making/receiving payments, sending money",,XXXX 2015 : I called to make a payment on XXXX...,Company chooses not to provide a public response,U.S. BANCORP,PA,152XX,,Consent provided,Web,2015-04-22,Closed with monetary relief,Yes,No,1337613
3,2013-04-29,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,,,JPMORGAN CHASE & CO.,VA,22406,Servicemember,,Phone,2013-04-30,Closed with explanation,Yes,Yes,393900
4,2013-05-29,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",GA,30044,,,Referral,2013-05-31,Closed with explanation,Yes,No,418647


In [28]:
df.describe()

Unnamed: 0,Complaint ID
count,207260.0
mean,1028619.0
std,753334.8
min,22.0
25%,345621.8
50%,920972.0
75%,1710704.0
max,2412707.0


In [29]:
# Assuming your DataFrame is named complaints and it's already loaded
# Select only the specified columns for X
X = df[['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via', 'Company response to consumer', 'Timely response?']]

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Use 'Consumer disputed?' as your y-variable and convert it to 0s and 1s
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['Consumer disputed?'])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 123)

In [36]:
# Assuming y_train is already defined
disputed_proportion = np.mean(y_train)
print(f"Proportion of disputed complaints in the training dataset: {disputed_proportion * 100:.2f}%")

if disputed_proportion < 0.30:
    print("Applying random undersampling to balance the dataset.")
    undersampler = RandomUnderSampler(random_state=123)
    X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
    print(f"After undersampling, new proportion of disputed complaints: {np.mean(y_train_resampled) * 100:.2f}%")
else:
    print("No need for undersampling; proportion of disputed complaints is sufficient.")
    X_train_resampled, y_train_resampled = X_train, y_train


Proportion of disputed complaints in the training dataset: 21.68%
Applying random undersampling to balance the dataset.
After undersampling, new proportion of disputed complaints: 50.00%


In [77]:
# Assuming X_train_resampled, y_train_resampled are your training data after preprocessing and optional resampling
model_xgb = XGBClassifier(random_state=123)
model_xgb = XGBClassifier(use_label_encoder=False, objective= 'binary:logistic')
model_xgb.fit(X_train_resampled, y_train_resampled)

# Predict probabilities for the positive class (Disputed complaints)
y_prob = model_xgb.predict_proba(X_test)[:, 1]

# Convert probabilities to binary predictions using the default threshold
y_pred_thresholded = (y_prob > 0.5).astype(int)

# And then evaluate the predictions
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Calculate the total cost based on the model predictions
total_cost_with_model = (np.sum(y_pred_thresholded) * (cost_disputed + cost_extra_diligence)) + ((len(y_test) - np.sum(y_pred_thresholded)) * cost_nondisputed)

print("Total cost with model predictions:", total_cost_with_model)


Accuracy: 0.5491411753353276
              precision    recall  f1-score   support

           0       0.84      0.53      0.65     32504
           1       0.27      0.63      0.38      8948

    accuracy                           0.55     41452
   macro avg       0.55      0.58      0.51     41452
weighted avg       0.72      0.55      0.59     41452

Total cost with model predictions: 16565290


In [59]:
# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

# Optionally, to make the confusion matrix more readable:
labels = ['Not Disputed', 'Disputed']  # Assuming 0: 'Not Disputed', 1: 'Disputed'
pd.DataFrame(cm, index=labels, columns=labels)


Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.53      0.65     32504
           1       0.27      0.63      0.38      8948

    accuracy                           0.55     41452
   macro avg       0.55      0.58      0.51     41452
weighted avg       0.72      0.55      0.59     41452


Confusion Matrix:
 [[17108 15396]
 [ 3293  5655]]


Unnamed: 0,Not Disputed,Disputed
Not Disputed,17108,15396
Disputed,3293,5655


In [75]:
# Assuming the cost structure as defined
cost_disputed = 600
cost_extra_diligence = 90
cost_nondisputed = 100

# Predict probabilities for the positive class (Disputed complaints)
y_prob = model_xgb.predict_proba(X_test)[:, 1]

# Convert probabilities to binary predictions using the default threshold
y_pred_thresholded = (y_prob > 0.5).astype(int)

# Calculate the total cost based on the model predictions
total_cost_with_model = (np.sum(y_pred_thresholded) * (cost_disputed + cost_extra_diligence)) + ((len(y_test) - np.sum(y_pred_thresholded)) * cost_nondisputed)

print("Total cost with model predictions:", total_cost_with_model)

Total cost with model predictions: 16565290


In [73]:
# Assuming costs as per the structure
cost_disputed = 600
cost_nondisputed = 100
cost_due_diligence = 90

# Model-based costs
cost_TP = cm[1, 1] * cost_disputed
cost_FP = cm[0, 1] * (cost_nondisputed + cost_due_diligence)
cost_TN = cm[0, 0] * cost_nondisputed
cost_FN = cm[1, 0] * cost_disputed

total_cost_model = cost_TP + cost_FP + cost_TN + cost_FN

# Base-case costs without using a model
total_disputed = cm[1, 0] + cm[1, 1]  # Actual disputed complaints
total_nondisputed = cm[0, 0] + cm[0, 1]  # Actual non-disputed complaints

total_cost_base_case = (total_disputed * cost_disputed) + (total_nondisputed * cost_nondisputed)

print(f"Total Cost with Model: ${total_cost_model}")
print(f"Base-Case Total Cost without Model: ${total_cost_base_case}")

Total Cost with Model: $10004840
Base-Case Total Cost without Model: $8619200


In [62]:
# Count the number of disputed complaints in the test set
num_disputed = np.sum(y_test)

# Calculate the total number of complaints in the test set
total_complaints = len(y_test)

# Calculate the proportion of disputes
proportion_disputes = num_disputed / total_complaints

print("Proportion of consumers who raised a dispute in the test set:", proportion_disputes)


Proportion of consumers who raised a dispute in the test set: 0.21586413200810575


In [63]:
# Count the number of disputed complaints in the training set after random undersampling
num_disputed_train = np.sum(y_train_resampled)

# Calculate the total number of complaints in the training set after random undersampling
total_complaints_train = len(y_train_resampled)

# Calculate the proportion of disputes in the training set
proportion_disputes_train = num_disputed_train / total_complaints_train

print("Proportion of consumers who raised a dispute in the training set after random undersampling:", proportion_disputes_train)


Proportion of consumers who raised a dispute in the training set after random undersampling: 0.5


In [80]:
from sklearn.metrics import classification_report

# Assuming you have trained and fitted the XGBClassifier model and made predictions
y_pred = model_xgb.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Extract the recall for the category 'Consumer disputed?' = 'Yes'
recall_yes = report['1']['recall']

print("Recall for the category 'Consumer disputed?' = 'Yes' on the test set:", recall_yes)


Recall for the category 'Consumer disputed?' = 'Yes' on the test set: 0.6319848010728655


In [84]:
# Assuming the cost structure as defined earlier
cost_disputed = 600
cost_nondisputed = 100

# Count the number of disputed complaints in the test set
num_disputed_test = np.sum(y_test)

# Calculate the total number of complaints in the test set
total_complaints_test = len(y_test)

# Calculate the total cost without using a model
total_cost_no_model = (num_disputed_test * cost_disputed) + ((total_complaints_test - num_disputed_test) * cost_nondisputed)

print("Total cost without using a model:", total_cost_no_model)

Total cost without using a model: 8619200


In [81]:
# Get the predicted probabilities of the positive class
y_pred_proba = model_xgb.predict_proba(X_test)[:, 1]

# Initialize the minimum cost and optimal threshold
min_cost = np.inf
optimal_threshold = 0

# Iterate over a range of possible thresholds
for threshold in np.linspace(0, 1, 100):
    # Convert predicted probabilities to class labels based on the current threshold
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    # Calculate the confusion matrix
    matrix = confusion_matrix(y_test, y_pred)
    
    # Calculate the total cost
    tn, fp, fn, tp = matrix.ravel()
    total_cost = tn*100 + fp*190 + fn*600 + tp*190
    
    # If the current cost is less than the minimum cost, update the minimum cost and optimal threshold
    if total_cost < min_cost:
        min_cost = total_cost
        optimal_threshold = threshold

# Print the optimal threshold and minimum cost
print(f'Optimal threshold: {optimal_threshold}')
print(f'Minimum cost: ${min_cost}')

Optimal threshold: 0.43434343434343436
Minimum cost: $7599970
