In [3]:
# import
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [4]:
# Load the consumer-complaints extract. The file carries a saved index column,
# which pandas surfaces as 'Unnamed: 0' in the preview below.
complaints_csv = 'complaints_25Nov21.csv'
df = pd.read_csv(complaints_csv)
df

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2016-10-26,Money transfers,International money transfer,Other transaction issues,,"To whom it concerns, I would like to file a fo...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",,,,Consent provided,Web,2016-10-29,Closed with explanation,Yes,No,2180490
1,2015-03-27,Bank account or service,Other bank product/service,"Account opening, closing, or management",,My name is XXXX XXXX XXXX and huband name is X...,Company chooses not to provide a public response,"CITIBANK, N.A.",PA,151XX,Older American,Consent provided,Web,2015-03-27,Closed with explanation,Yes,No,1305453
2,2015-04-20,Bank account or service,Other bank product/service,"Making/receiving payments, sending money",,XXXX 2015 : I called to make a payment on XXXX...,Company chooses not to provide a public response,U.S. BANCORP,PA,152XX,,Consent provided,Web,2015-04-22,Closed with monetary relief,Yes,No,1337613
3,2013-04-29,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,,,JPMORGAN CHASE & CO.,VA,22406,Servicemember,,Phone,2013-04-30,Closed with explanation,Yes,Yes,393900
4,2013-05-29,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",GA,30044,,,Referral,2013-05-31,Closed with explanation,Yes,No,418647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207255,2015-05-24,Debt collection,Credit card,Taking/threatening an illegal action,Sued w/o proper notification of suit,,,JPMORGAN CHASE & CO.,FL,33133,,Consent not provided,Web,2015-05-24,Closed with explanation,Yes,No,1390395
207256,2012-01-10,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,,,JPMORGAN CHASE & CO.,NY,10312,,,Referral,2012-01-11,Closed without relief,Yes,Yes,12192
207257,2012-07-17,Student loan,Non-federal student loan,Repaying your loan,,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",NH,032XX,,,Web,2012-07-18,Closed with explanation,Yes,No,118351
207258,2016-09-29,Bank account or service,Checking account,"Account opening, closing, or management",,Near the end of XXXX 2016 I opened a Citigold ...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",CA,900XX,,Consent provided,Web,2016-09-29,Closed with non-monetary relief,Yes,No,2138969


In [11]:
# Train the model
# NOTE(review): this cell has execution count 11 but sits above the cell that
# creates X_train / y_train (the train/test split), so on a fresh kernel run
# top to bottom it raises NameError. It also fits on the original, imbalanced
# training split, and `model_xgb` is reassigned by the later cell that fits on
# the undersampled data before any visible cell uses this fit. Consider moving
# this cell below the split or removing it.
model_xgb = XGBClassifier(random_state=123)
model_xgb.fit(X_train, y_train)

In [5]:
# Categorical complaint attributes used as predictors
predictor_columns = [
    'Product',
    'Sub-product',
    'Issue',
    'State',
    'Tags',
    'Submitted via',
    'Company response to consumer',
    'Timely response?',
]

# One-hot encode the selected predictors in a single step
X = pd.get_dummies(df[predictor_columns])

# Encode the dispute flag as integer class labels for the classifier
le = LabelEncoder()
y = le.fit_transform(df['Consumer disputed?'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)


In [6]:
# Share of test-set complaints that were disputed: with 0/1 labels the mean
# of the label vector is exactly the positive-class proportion.
dispute_proportion = y_test.mean()
print(dispute_proportion)

0.21586413200810575


In [8]:
# Balance the training classes by random undersampling (seeded for reproducibility).
# Only the training split is resampled; the test split is left as-is.
undersampler = RandomUnderSampler(random_state=123)
X_train_resampled, y_train_resampled = undersampler.fit_resample(
    X_train,
    y_train,
)

# Sanity check: share of disputed complaints in the resampled training labels
dispute_proportion_resampled = y_train_resampled.mean()
print(dispute_proportion_resampled)

0.5


In [17]:
# Train XGBoost on the balanced (undersampled) training data
model_xgb = XGBClassifier(random_state=123).fit(X_train_resampled, y_train_resampled)

# Score the untouched test split at the classifier's default decision threshold
y_pred = model_xgb.predict(X_test)

# Summarise performance: per-class metrics plus the raw confusion matrix
# (rows are actual classes, columns are predicted classes)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

# Classification report first, confusion matrix second
print(report, matrix, sep='\n')

              precision    recall  f1-score   support

           0       0.84      0.53      0.65     32504
           1       0.27      0.63      0.38      8948

    accuracy                           0.55     41452
   macro avg       0.55      0.58      0.51     41452
weighted avg       0.71      0.55      0.59     41452

[[17157 15347]
 [ 3323  5625]]


In [18]:
# Baseline cost without any model: count test complaints by actual outcome
num_disputed = y_test.sum()
num_non_disputed = len(y_test) - num_disputed

# Each non-disputed complaint is costed at 100 and each disputed one at 600
total_cost = 100 * num_non_disputed + 600 * num_disputed
print(total_cost)

8619200


In [20]:
# Cost when acting on the model's predictions.
# Unpack the 2x2 confusion matrix row by row: first row holds the actual
# negatives (tn, fp), second row the actual positives (fn, tp).
(tn, fp), (fn, tp) = matrix

# Every complaint predicted as disputed (fp or tp) is costed at 190; a correctly
# ignored complaint costs 100 and a missed dispute costs 600.
total_cost = 100 * tn + 190 * (fp + tp) + 600 * fn
print(total_cost)

7694180


In [22]:
# Search for the probability cut-off that minimises total cost.
#
# Requires `numpy` imported as `np` alongside the notebook's other imports.
#
# NOTE(review): the threshold is selected on the same test split that is used
# to report the resulting cost, so `min_cost` is an optimistic estimate. A
# separate validation split would give an unbiased figure.

# Get the predicted probabilities of the positive class
y_pred_proba = model_xgb.predict_proba(X_test)[:, 1]

min_cost = np.inf
optimal_threshold = 0

# Iterate over a range of possible thresholds
for threshold in np.linspace(0, 1, 100):
    # Loop-local names are used on purpose: reusing `y_pred`, `matrix`,
    # `tn/fp/fn/tp` or `total_cost` here would overwrite the default-threshold
    # results computed in earlier cells and make those cells non-idempotent.
    threshold_pred = (y_pred_proba >= threshold).astype(int)

    # Pin the label order so the matrix is always 2x2, even at extreme
    # thresholds where every prediction falls into a single class.
    threshold_matrix = confusion_matrix(y_test, threshold_pred, labels=[0, 1])
    thr_tn, thr_fp, thr_fn, thr_tp = threshold_matrix.ravel()

    # Same cost scheme as the default-threshold cost cell: 100 per true
    # negative, 190 per predicted positive, 600 per false negative.
    threshold_cost = thr_tn * 100 + thr_fp * 190 + thr_fn * 600 + thr_tp * 190

    # Strict '<' keeps the lowest threshold when several tie on cost
    if threshold_cost < min_cost:
        min_cost = threshold_cost
        optimal_threshold = threshold

# Print the optimal threshold and minimum cost
print(optimal_threshold)
print(min_cost)


0.4040404040404041
7610240
