In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv('creditcardmarketing-bbm.csv')

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
duplicate_rows = df[df.duplicated()]
len(duplicate_rows)

In [None]:
df['Offer Accepted'].value_counts()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
# Print the distinct values of every column to spot unexpected codes or categories.
for col_name, col_values in df.items():
    print(f"Unique values in column '{col_name}':")
    print(col_values.unique())
    print()

In [None]:
df.dtypes

In [None]:
# Frequency table of every column in the raw frame.
for col_name, col_values in df.items():
    print(f"Value counts in column '{col_name}':")
    print(col_values.value_counts())
    print()

In [None]:
def categorize_size(size):
    """Return "Small" when `size` equals 1, 2, 3 or 4; return "Big" for any other value."""
    return "Small" if size in (1, 2, 3, 4) else "Big"
# Overwrite the household-size column with the "Small"/"Big" label of each row.
# NOTE(review): re-running this cell on the already converted column labels every row
# "Big", because the labels themselves are not in [1, 2, 3, 4].
df['Household Size'] = df['Household Size'].apply(categorize_size)

In [None]:
df = df.drop('index',axis=1)

In [None]:
# Declare every categorical column explicitly as column -> (levels, ordered?).
# Ordered columns list their levels from lowest to highest.
category_levels = {
    'Offer Accepted': (['No', 'Yes'], False),
    'Reward': (['Air Miles', 'Cash Back', 'Points'], False),
    'Mailer Type': (['Letter', 'Postcard'], False),
    'Overdraft Protection': (['No', 'Yes'], False),
    'Credit Rating': (['Low', 'Medium', 'High'], True),
    'Household Size': (['Small', 'Big'], True),
    'Own Your Home': (['No', 'Yes'], False),
    'Income Level': (['Low', 'Medium', 'High'], True),
}

for col_name, (levels, is_ordered) in category_levels.items():
    df[col_name] = pd.Categorical(df[col_name], categories=levels, ordered=is_ordered)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
trainset, testset = train_test_split(df, random_state=100, test_size=0.2)

for split_label, split_frame in (("Training set shape:", trainset), ("Test set shape:", testset)):
    print(split_label, split_frame.shape)

In [None]:
trainset['Offer Accepted'].value_counts()

In [None]:
testset['Offer Accepted'].value_counts()

In [None]:
trainset.dtypes

In [None]:
# Frequency table of every column in the training split.
for col_name, col_values in trainset.items():
    print(f"Value counts in column '{col_name}':")
    print(col_values.value_counts())
    print()

In [None]:
# Frequency table of every column in the test split.
for col_name, col_values in testset.items():
    print(f"Value counts in column '{col_name}':")
    print(col_values.value_counts())
    print()

In [None]:
trainset.isna().sum()

In [None]:
testset.isna().sum()

In [None]:
# Impute missing balances with the TRAINING-set mean of each column. The same training
# means are reused for the test set so no test-set statistics are used.
balance_columns = ['Average Balance', 'Q1 Balance', 'Q2 Balance', 'Q3 Balance', 'Q4 Balance']

# Work on explicit copies: both frames come out of train_test_split, and filling a column
# selection with `inplace=True` is chained assignment, which pandas does not guarantee to
# write back to the parent frame (and no longer does under copy-on-write).
trainset = trainset.copy()
testset = testset.copy()

balance_means = trainset[balance_columns].mean()

# Keep the individual mean variables available under their original names.
(Average_Balance_mean, Q1_Balance_mean, Q2_Balance_mean,
 Q3_Balance_mean, Q4_Balance_mean) = balance_means[balance_columns]

# DataFrame.fillna with a Series fills each column with the value stored under its label.
trainset[balance_columns] = trainset[balance_columns].fillna(balance_means)
testset[balance_columns] = testset[balance_columns].fillna(balance_means)

In [None]:
trainset.isna().sum()

## Descriptive Analysis using the training set

In [None]:
# Share of accepted vs. declined offers in the training set.
fig1, ax1 = plt.subplots()
target = trainset['Offer Accepted'].value_counts()
ax1.pie(
    target,
    labels=target.index,
    explode=[0.1, 0],      # pull the first (largest) slice out slightly
    autopct='%1.1f%%',
    shadow=False,
)
ax1.axis('equal')          # equal aspect ratio so the pie is drawn as a circle
plt.show()

print('Total number of customers in the training set:', trainset['Offer Accepted'].count())
print(target)

In [None]:
trainset.columns

In [None]:
# Apply the theme and figure size BEFORE plotting: rcParams only affect figures created
# afterwards, so setting them after countplot() left this chart at the previous size.
sns.set(rc={'figure.figsize': (15, 8)})

ax = sns.countplot(x='Reward', hue='Offer Accepted', data=trainset)

# Each container holds the bars of one 'Offer Accepted' level, so a label is the share of
# that response class falling into each reward type (labels of one colour sum to 100%).
for container in ax.containers:
    class_total = sum(bar.get_height() for bar in container)
    for bar in container:
        # Guard against an empty response class instead of dividing by zero.
        share = bar.get_height() / class_total * 100 if class_total else 0.0
        ax.annotate(f'{share:.2f}%',
                    xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=10, color='black')

plt.title('Reward vs Offer Acceptance')
plt.show()

In [None]:
categorical_col=['# Bank Accounts Open','# Credit Cards Held', '# Homes Owned','Reward', 'Mailer Type','Income Level','Overdraft Protection','Credit Rating',
                 'Household Size', 'Own Your Home']

# Set the figure size once, before any figure is created; inside the loop and after
# countplot() it only took effect from the second chart onwards.
sns.set(rc={'figure.figsize': (15, 8)})

for column in categorical_col:
    ax = sns.countplot(x=column, hue='Offer Accepted', data=trainset)

    # Each container holds the bars of one 'Offer Accepted' level, so a label is the share
    # of that response class falling into each category of `column`.
    for container in ax.containers:
        class_total = sum(bar.get_height() for bar in container)
        for bar in container:
            # Guard against an empty response class instead of dividing by zero.
            share = bar.get_height() / class_total * 100 if class_total else 0.0
            ax.annotate(f'{share:.2f}%',
                        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom', fontsize=10, color='black')

    # Title each chart so it can be read on its own, as the other plotting cells do.
    plt.title(f'{column} vs Offer Acceptance')
    plt.show()

In [None]:
col = ['Reward', 'Mailer Type', 'Income Level', 'Overdraft Protection', 'Credit Rating', 'Household Size', 'Own Your Home']

sns.set(rc={'figure.figsize': (15, 8)})

for column in col:
    ax = sns.countplot(x=column, hue='Offer Accepted', data=trainset)

    # Label every bar with its OWN count. The previous logic alternated between the heights
    # of the first and second bar of the container, so most bars showed another bar's count.
    for container in ax.containers:
        for bar in container:
            height = bar.get_height()
            if np.isnan(height):
                # Nothing to label for a bar without a height.
                continue
            ax.annotate(f'{int(height)}',
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom', fontsize=10, color='black')

    plt.title(f'{column} vs Offer Acceptance')
    plt.show()

In [None]:
trainset.columns

In [None]:
# One box plot per balance column, split by the response.
numerical_col = ['Average Balance', 'Q1 Balance', 'Q2 Balance', 'Q3 Balance', 'Q4 Balance']

for balance_column in numerical_col:
    ax = sns.boxplot(
        data=trainset,
        x='Offer Accepted',
        y=balance_column,
        width=0.4,
        palette=["red", "blue"],
    )
    plt.show()

In [None]:
numerical_cols = ['# Bank Accounts Open', '# Credit Cards Held', '# Homes Owned']
response_categories = ['No', 'Yes']

sns.set(rc={'figure.figsize': (15, 8)})

for column in numerical_cols:
    ax = sns.countplot(data=trainset, x=column, hue='Offer Accepted', hue_order=response_categories)

    # Write each bar's height just above the bar.
    every_bar = (bar for container in ax.containers for bar in container)
    for bar in every_bar:
        bar_top = bar.get_height()
        ax.annotate(bar_top,
                    xy=(bar.get_x() + bar.get_width() / 2, bar_top),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=10, color='black')

    plt.title(f'{column} vs Offer Acceptance')
    plt.show()

In [None]:
columns_to_plot = ['# Bank Accounts Open', '# Credit Cards Held', '# Homes Owned', 'Average Balance', 'Q1 Balance', 'Q2 Balance', 'Q3 Balance', 'Q4 Balance']

sns.pairplot(trainset[columns_to_plot])
plt.show()

### MCA

In [None]:
from prince import MCA

In [None]:
mca_cols = trainset.select_dtypes(['category']).columns
print(len(mca_cols), 'features used for MCA are', mca_cols.tolist())

In [None]:
# Pass the column list by keyword: the second positional parameter of pd.get_dummies is
# `prefix`, not `columns`, so the previous call only worked when the list happened to match
# the automatically selected columns one-to-one.
df_encoded = pd.get_dummies(df, columns=list(mca_cols))


In [None]:
df_encoded = df_encoded.drop(columns=['Customer Number'])

In [None]:

# Fit a multiple correspondence analysis on the categorical columns of the training set.
mca_data = trainset[mca_cols]

mca = MCA()
mca1 = mca.fit(mca_data)

In [None]:
mca.eigenvalues_summary

In [None]:
row_coordinates=mca.row_coordinates(mca_data)
row_coordinates

In [None]:
column_coordinates=mca.column_coordinates(mca_data)
column_coordinates

In [None]:

# Plot every category level on the first two MCA components and highlight the levels of
# the response variable.
plt.figure(figsize=(10, 6))
plt.scatter(column_coordinates[0], column_coordinates[1], marker='o', s=10, color='red')

# Every level (including the response levels) is labelled here exactly once.
for label, x, y in zip(column_coordinates.index, column_coordinates[0], column_coordinates[1]):
    plt.text(x, y, label, fontsize=5, ha='right', va='bottom')

# Select the response levels by prefix rather than by exact label.
# NOTE(review): the separator between column name and level in the coordinate index
# appears to differ across prince versions ('_' vs '__') - confirm for the pinned version.
is_response_level = column_coordinates.index.astype(str).str.startswith('Offer Accepted')
class_coordinates_to_highlight = column_coordinates.loc[is_response_level]
classes_to_highlight = class_coordinates_to_highlight.index.tolist()
plt.scatter(class_coordinates_to_highlight[0], class_coordinates_to_highlight[1],
            marker='o', s=10, color='blue', label='Offer Accepted')

plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.legend()
plt.show()

## PLS-DA

In [None]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
X_train = trainset.drop('Offer Accepted', axis=1)
y_train = trainset['Offer Accepted']
X_test = testset.drop('Offer Accepted', axis=1)
y_test = testset['Offer Accepted']

In [None]:
from sklearn.preprocessing import LabelEncoder

# One-hot encode the predictors and integer-encode the response for PLS-DA.
# NOTE(review): X_train still contains 'Customer Number' at this point, so the identifier
# enters the PLS-DA model as a predictor - confirm that this is intended.
label_encoder = LabelEncoder()
X_train_encoded = pd.get_dummies(X_train)
y_train_encoded = label_encoder.fit_transform(y_train)

# Encode the test set with the TRAINING layout: same dummy columns in the same order
# (levels absent from the test set become all-zero columns), and the label mapping learned
# on the training labels instead of a mapping refit on the test labels.
X_test_encoded = pd.get_dummies(X_test).reindex(columns=X_train_encoded.columns, fill_value=0)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
n_components = 2  
plsda = PLSRegression(n_components=n_components)

In [None]:

plsda.fit(X_train_encoded, y_train_encoded)

X_train_plsda = plsda.transform(X_train_encoded)
X_test_plsda = plsda.transform(X_test_encoded)

In [None]:
# Share of the predictor variance captured by the first two PLS components.
fitted_plsda = plsda.fit(X_train_encoded, y_train_encoded)
scores = fitted_plsda.x_scores_[:, :2]
loadings = fitted_plsda.x_loadings_[:, :2]

# PLSRegression centres and scales X (scale=True by default) before extracting components,
# so the total variance must be measured on the standardised predictors. Dividing by the
# variance of the raw columns mixes two different scales.
X_values = X_train_encoded.to_numpy(dtype=float)
X_centered = X_values - X_values.mean(axis=0)
column_std = X_centered.std(axis=0, ddof=1)
column_std[column_std == 0.0] = 1.0          # constant columns are left unscaled
X_standardized = X_centered / column_std

n_samples = X_standardized.shape[0]
total_variance = (X_standardized ** 2).sum() / (n_samples - 1)

# X is reconstructed as sum_a t_a p_a^T with mutually orthogonal score vectors t_a, so the
# sum of squares captured by component a is ||t_a||^2 * ||p_a||^2.
scores_variance = ((scores ** 2).sum(axis=0) * (loadings ** 2).sum(axis=0)).sum() / (n_samples - 1)

variance_explained = scores_variance / total_variance

print(f"Variance explained by the first two components: {variance_explained * 100}%")

In [None]:
import matplotlib.pyplot as plt

# Scatter the training observations on the first two PLS-DA components, one colour per class.
plt.figure(figsize=(8, 6))
for cls in np.unique(y_train_encoded):
    class_scores = X_train_plsda[y_train_encoded == cls]
    plt.scatter(class_scores[:, 0], class_scores[:, 1], label=f'Class {cls}')

plt.title('PLS-DA Score Plot')
plt.xlabel('PLS-DA Component 1')
plt.ylabel('PLS-DA Component 2')
plt.legend()
plt.grid(True)
plt.show()

## Correlation

In [None]:
# Restrict the correlation to numeric columns: the frame also holds categorical columns,
# which DataFrame.corr cannot convert when numeric_only is left at False.
corr_matrix = trainset.corr(method='pearson', numeric_only=True)
print(corr_matrix)

In [None]:
from scipy.stats import spearmanr

In [None]:
trainset.corr(numeric_only=True, method='spearman')

In [None]:
plt.figure(figsize=(25,10))
sns.heatmap(trainset.corr(numeric_only=True), annot=True, cmap='Blues')

# Advanced Analysis

In [None]:
# Encode the response as plain integers (No -> 0, Yes -> 1).
# The column is categorical, and Series.replace on a categorical column is deprecated and
# leaves a categorical dtype; converting to object first and finishing with astype(int)
# yields a real integer column. Values that are already 0/1 pass through unchanged, so the
# cell can be re-run, and a missing response fails loudly in astype(int).
target_codes = {'No': 0, 'Yes': 1}

for split_frame in (trainset, testset):
    split_frame['Offer Accepted'] = (
        split_frame['Offer Accepted']
        .astype(object)
        .map(lambda answer: target_codes.get(answer, answer))
        .astype(int)
    )

In [None]:
trainset.head()

In [None]:
X_train = trainset.drop('Offer Accepted', axis=1)
y_train = trainset['Offer Accepted']
X_test = testset.drop('Offer Accepted', axis=1)
y_test = testset['Offer Accepted']

In [None]:
# Levels of each ordinal column, lowest first; a value is replaced by the position of its
# level in this list.
ordinal_mapping = {
    'Credit Rating': ['Low', 'Medium', 'High'],
    'Household Size': ['Small', 'Big'],
    'Income Level': ['Low', 'Medium', 'High'],
}

for feature_frame in (X_train, X_test):
    for ordinal_column, ordered_levels in ordinal_mapping.items():
        as_ordered = pd.Categorical(feature_frame[ordinal_column], categories=ordered_levels, ordered=True)
        feature_frame[ordinal_column] = as_ordered.codes

In [None]:
# One-hot encode the listed columns, dropping the first level of each as the baseline.
# NOTE(review): 'Income Level', 'Credit Rating' and 'Household Size' were converted to
# integer codes in the cell above, so they are expanded here into `<column>_<code>` dummies.
nominal_columns = ['Reward', 'Mailer Type', 'Income Level', 'Overdraft Protection', 'Credit Rating', 'Household Size', 'Own Your Home']

X_train = pd.get_dummies(X_train, columns=nominal_columns, drop_first=True)
X_test = pd.get_dummies(X_test, columns=nominal_columns, drop_first=True)

# The two frames are encoded independently, so force the test set onto the training
# layout: same columns, same order, zeros for any level that does not occur in the test set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
X_train.head()

In [None]:
X_train = X_train.drop(columns=['Customer Number'])
X_test = X_test.drop(columns=['Customer Number'])

In [None]:
y_train.head()

In [None]:
trainset.to_csv('trainset.csv', index=False)
testset.to_csv('testset.csv', index=False)
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)

In [None]:
X_train.dtypes

### Model fitting

In [None]:
from sklearn.preprocessing import StandardScaler

numerical_columns = [
    '# Bank Accounts Open', '# Credit Cards Held', '# Homes Owned',
    'Average Balance', 'Q1 Balance', 'Q2 Balance', 'Q3 Balance', 'Q4 Balance',
]

# Learn the scaling parameters on the training set only, then apply them to both splits.
scaler = StandardScaler()
scaler.fit(X_train[numerical_columns])

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test_scaled[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [None]:
import joblib
import lightgbm as lgb
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

random_state = 42

# Baseline models, fitted on the scaled training data without any class balancing.
classifiers = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Logistic Ridge', LogisticRegression(penalty='l2', max_iter=1000)),
    ('Logistic Lasso', LogisticRegression(penalty='l1',solver='liblinear')),
    ('KNN', KNeighborsClassifier()),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('SVM Linear', SVC(kernel='linear', probability=True, random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42))
]

# Row label in the results table -> metric function.
metric_functions = {
    'Accuracy': accuracy_score,
    'F1-Score': f1_score,
    'Precision': precision_score,
    'Recall': recall_score,
}

results, classification_reports, trained_models = {}, {}, {}

for name, clf in classifiers:
    # fit() returns the fitted estimator itself, which is kept for the later cells.
    trained_models[name] = clf.fit(X_train_scaled, y_train)

    # Training-set performance.
    y_pred = clf.predict(X_train_scaled)
    classification_reports[name] = classification_report(y_train, y_pred)
    results[name] = {
        metric_name: metric(y_train, y_pred)
        for metric_name, metric in metric_functions.items()
    }

results_df = pd.DataFrame(results)
print(results_df)

for name in classification_reports:
    report = classification_reports[name]
    print(f"Classification Report for {name}:\n{report}")

# Persist every fitted model under its display name.
for name, model in trained_models.items():
    joblib.dump(model, f"{name}_model.joblib")

In [None]:
# Test-set performance of the baseline models fitted in the previous cell.
test_metrics = (
    ('Accuracy', accuracy_score),
    ('F1-Score', f1_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
)

results = {}
classification_reports = {}

for name, clf in trained_models.items():
    y_pred = clf.predict(X_test_scaled)
    classification_reports[name] = classification_report(y_test, y_pred)
    results[name] = {metric_name: metric(y_test, y_pred) for metric_name, metric in test_metrics}

results_df = pd.DataFrame(results)
print(results_df)

for name in classification_reports:
    report = classification_reports[name]
    print(f"Classification Report for {name}:\n{report}")

In [None]:
from sklearn.metrics import confusion_matrix

classes = ['Not accepted', 'Accepted']

# Test-set confusion matrix of every baseline model, keyed by model name.
confusion_matrices = {
    name: confusion_matrix(y_test, clf.predict(X_test_scaled))
    for name, clf in trained_models.items()
}

for name, cm in confusion_matrices.items():
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix - {name}')
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

    # Print each count in its cell; white text on the dark half of the colour scale.
    thresh = cm.max() / 2.
    for (i, j), count in np.ndenumerate(cm):
        text_color = "white" if count > thresh else "black"
        plt.text(j, i, format(count, 'd'), horizontalalignment="center", color=text_color)

    plt.tight_layout()
    plt.show()

## Applying balancing techniques

In [None]:
import numpy as np
from sklearn.base import clone
from sklearn.metrics import classification_report, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
import pandas as pd
import joblib

resampling_methods = {
    'SMOTE': SMOTE(random_state=42),
    'RandomUnderSampler': RandomUnderSampler(random_state=42)
}

# Unfitted templates; a fresh clone is trained for every resampling method.
classifiers = [
    ('Logistic Regression', LogisticRegression()),
    ('Logistic Ridge', LogisticRegression(penalty='l2', max_iter=1000)),
    ('Logistic Lasso', LogisticRegression(penalty='l1', solver='liblinear')),
    ('KNN', KNeighborsClassifier()),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('SVM Linear', SVC(kernel='linear', probability=True, random_state=42))
]

results = {}                  # (resampling, model) -> PR AUC and chosen threshold
modified_trained_models = {}  # (resampling, model) -> fitted estimator
classification_reports = {}   # (resampling, model) -> training-set report at the threshold
optimal_thresholds = {}       # (resampling, model) -> F1-maximising probability threshold

for resampling_name, resampling_method in resampling_methods.items():
    print(f"Applying {resampling_name}...")

    X_resampled, y_resampled = resampling_method.fit_resample(X_train_scaled, y_train)

    for name, base_clf in classifiers:
        # Train a CLONE: refitting the shared template would make the model stored for the
        # previous resampling method point at the estimator that is being refitted here, so
        # every key (and every saved file) would end up holding the last method's model.
        clf = clone(base_clf).fit(X_resampled, y_resampled)
        modified_trained_models[(resampling_name, name)] = clf

        # Choose the probability cut-off on the ORIGINAL (not resampled) training data.
        y_probs = clf.predict_proba(X_train_scaled)[:, 1]
        precision, recall, thresholds = precision_recall_curve(y_train, y_probs)
        pr_auc = auc(recall, precision)

        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
        # precision/recall hold one more entry than thresholds (the final recall=0 point),
        # so that last entry is excluded before indexing into thresholds.
        optimal_threshold = thresholds[np.argmax(f1_scores[:-1])]
        optimal_thresholds[(resampling_name, name)] = optimal_threshold
        results[(resampling_name, name)] = {'PR AUC': pr_auc, 'Optimal Threshold': optimal_threshold}
        print(f"Optimal Threshold for {name}: {optimal_threshold}")

        y_pred_optimal = (y_probs >= optimal_threshold).astype(int)

        report = classification_report(y_train, y_pred_optimal)
        classification_reports[(resampling_name, name)] = report

for (resampling_name, model_name), model in modified_trained_models.items():
    filename = f"{resampling_name.lower()}_{model_name}_model.joblib"
    joblib.dump(model, filename)

for (resampling_name, model_name), report in classification_reports.items():
    print(f"\nClassification Report for {resampling_name} - {model_name}:\n{report}")


In [None]:
import joblib
from sklearn.metrics import classification_report, precision_recall_curve

test_classification_reports = {}

for resampling_name, model_name in modified_trained_models:
    model_filename = f"{resampling_name.lower()}_{model_name}_model.joblib"
    trained_model = joblib.load(model_filename)

    # Derive THIS model's cut-off from its training-set precision/recall curve. Reusing the
    # single `optimal_threshold` variable left over from the training loop applied the last
    # model's threshold to every model.
    train_probs = trained_model.predict_proba(X_train_scaled)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_train, train_probs)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
    # The final precision/recall point has no matching threshold, so it is excluded.
    model_threshold = thresholds[np.argmax(f1_scores[:-1])]

    y_probs_test = trained_model.predict_proba(X_test_scaled)[:, 1]
    y_pred_optimal_test = (y_probs_test >= model_threshold).astype(int)

    test_report = classification_report(y_test, y_pred_optimal_test)
    test_classification_reports[(resampling_name, model_name)] = test_report


for (resampling_name, model_name), test_report in test_classification_reports.items():
    print(f"\nClassification Report for {resampling_name} - {model_name} (on test set):\n{test_report}")


In [None]:
# Reload the two saved random forests and chart their impurity-based feature importances.
rf_model_smote = joblib.load('smote_Random Forest_model.joblib')
rf_model_us = joblib.load('randomundersampler_Random Forest_model.joblib')
feature_names = X_train.columns

feature_importances_smote = rf_model_smote.feature_importances_
feature_importances_us = rf_model_us.feature_importances_

# One table per model, most important feature first.
importance_df_smote, importance_df_us = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
      .sort_values(by='Importance', ascending=False)
    for importances in (feature_importances_smote, feature_importances_us)
)

importance_charts = (
    ('Variable Importance - SMOTE Random Forest', importance_df_smote),
    ('Variable Importance - Under Sampled Random Forest', importance_df_us),
)
for chart_title, importance_table in importance_charts:
    plt.figure(figsize=(10, 6))
    plt.barh(importance_table['Feature'], importance_table['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title(chart_title)
    plt.show()

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


def tune_resampled_forest(sampler):
    """Grid-search a random forest with `sampler` applied inside every CV training fold.

    Resampling the whole training set before cross-validation puts synthetic or duplicated
    rows into the validation folds and inflates the F1 scores used to pick the
    hyper-parameters. Inside an imbalanced-learn pipeline the sampler only runs during
    fit(), so each validation fold keeps the original class balance.

    Returns the fitted GridSearchCV; its best_estimator_ is the pipeline refitted on the
    full training set, whose 'rf' step is the tuned forest.
    """
    pipeline = ImbPipeline([
        ('sampler', sampler),
        ('rf', RandomForestClassifier(random_state=42)),
    ])
    pipeline_grid = {f'rf__{param}': values for param, values in param_grid.items()}
    search = GridSearchCV(pipeline, pipeline_grid, cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
    search.fit(X_train_scaled, y_train)
    return search


def _forest_params(search):
    """Return the best parameters of `search` without the 'rf__' pipeline prefix."""
    return {key.split('__', 1)[1]: value for key, value in search.best_params_.items()}


# --- SMOTE ---
smote = SMOTE(random_state=42)
grid_search_smote = tune_resampled_forest(smote)

# Keep the bare forest (not the pipeline) so later cells can read feature_importances_.
best_model_smote = grid_search_smote.best_estimator_.named_steps['rf']

print("Best parameters for SMOTE model:", _forest_params(grid_search_smote))

y_pred_smote = best_model_smote.predict(X_test_scaled)

report_smote = classification_report(y_test, y_pred_smote)
print("Classification Report for the tuned SMOTE model:")
print(report_smote)


# --- Random under-sampling ---
undersampler = RandomUnderSampler(random_state=42)
grid_search_undersampler = tune_resampled_forest(undersampler)

best_model_undersampler = grid_search_undersampler.best_estimator_.named_steps['rf']

print("Best parameters for Random Undersampler model:", _forest_params(grid_search_undersampler))

y_pred_undersampler = best_model_undersampler.predict(X_test_scaled)

report_undersampler = classification_report(y_test, y_pred_undersampler)
print("Classification Report for the tuned Random Undersampler model:")
print(report_undersampler)

In [None]:
# Training-set reports of the two tuned random forests (to compare against the test-set
# reports printed by the tuning cell).
y_pred_smote, y_pred_undersampler = (
    tuned_model.predict(X_train_scaled)
    for tuned_model in (best_model_smote, best_model_undersampler)
)
report_smote, report_undersampler = (
    classification_report(y_train, train_predictions)
    for train_predictions in (y_pred_smote, y_pred_undersampler)
)

report_headings = (
    ("Classification Report for the tuned SMOTE model:", report_smote),
    ("Classification Report for the tuned Random Undersampler model:", report_undersampler),
)
for heading, train_report in report_headings:
    print(heading)
    print(train_report)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold

param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 3, 5],
}


def tune_resampled_xgb(sampler):
    """Grid-search an XGBoost classifier with `sampler` applied inside every CV training fold.

    Resampling the whole training set before cross-validation puts synthetic or duplicated
    rows into the validation folds and inflates the F1 scores used to pick the
    hyper-parameters. Inside an imbalanced-learn pipeline the sampler only runs during
    fit(), so each validation fold keeps the original class balance.

    Returns the fitted GridSearchCV; its best_estimator_ is the pipeline refitted on the
    full training set, whose 'xgb' step is the tuned classifier.
    """
    pipeline = ImbPipeline([
        ('sampler', sampler),
        ('xgb', XGBClassifier(random_state=42)),
    ])
    pipeline_grid = {f'xgb__{param}': values for param, values in param_grid_xgb.items()}
    search = GridSearchCV(pipeline, pipeline_grid, cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
    search.fit(X_train_scaled, y_train)
    return search


def _xgb_params(search):
    """Return the best parameters of `search` without the 'xgb__' pipeline prefix."""
    return {key.split('__', 1)[1]: value for key, value in search.best_params_.items()}


# --- SMOTE ---
smote = SMOTE(random_state=42)
grid_search_smote_xgb = tune_resampled_xgb(smote)

# Keep the bare classifier (not the pipeline) so later cells can read feature_importances_.
best_model_smote_xgb = grid_search_smote_xgb.best_estimator_.named_steps['xgb']

print("Best parameters for SMOTE XGBoost model:", _xgb_params(grid_search_smote_xgb))

y_pred_smote_xgb = best_model_smote_xgb.predict(X_test_scaled)

report_smote_xgb = classification_report(y_test, y_pred_smote_xgb)
print("Classification Report for the tuned SMOTE XGBoost model:")
print(report_smote_xgb)

# --- Random under-sampling ---
undersampler = RandomUnderSampler(random_state=42)
grid_search_undersampler_xgb = tune_resampled_xgb(undersampler)

best_model_undersampler_xgb = grid_search_undersampler_xgb.best_estimator_.named_steps['xgb']

print("Best parameters for Random Undersampler XGBoost model:", _xgb_params(grid_search_undersampler_xgb))

y_pred_undersampler_xgb = best_model_undersampler_xgb.predict(X_test_scaled)

report_undersampler_xgb = classification_report(y_test, y_pred_undersampler_xgb)
print("Classification Report for the tuned Random Undersampler XGBoost model:")
print(report_undersampler_xgb)

In [None]:
# Training-set reports of the two tuned XGBoost models. The headings name the model family
# and the data split; the previous headings were identical to the random-forest ones and
# could not be told apart in the output.
y_pred_smote = best_model_smote_xgb.predict(X_train_scaled)

report_smote = classification_report(y_train, y_pred_smote)
print("Classification Report for the tuned SMOTE XGBoost model (training set):")
print(report_smote)


y_pred_undersampler = best_model_undersampler_xgb.predict(X_train_scaled)

report_undersampler = classification_report(y_train, y_pred_undersampler)
print("Classification Report for the tuned Random Undersampler XGBoost model (training set):")
print(report_undersampler)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def generate_color_gradient(importances):
    """Map importance values onto a light-blue to dark-blue colour gradient.

    Parameters
    ----------
    importances : array-like of float
        One importance value per feature.

    Returns
    -------
    numpy.ndarray
        One RGBA row per input value, in input order. The smallest importance gets the
        lightest blue and the largest the darkest. When all values are equal (or the input
        is empty) every value is mapped to the lightest colour instead of dividing by zero.
    """
    importances = np.asarray(importances, dtype=float)

    # Define a custom colormap starting from light blue and degrading towards dark blue
    cmap = LinearSegmentedColormap.from_list('custom_gradient', ['#99CCFF', '#66B2FF', '#0089FF', '#0071D7', '#0058A9', '#00407A', '#00274C'])

    value_range = importances.max() - importances.min() if importances.size else 0.0
    if value_range > 0:
        # Min-max normalise to [0, 1], the domain of the colormap.
        norm_importances = (importances - importances.min()) / value_range
    else:
        # A zero range would produce NaN positions; use the start of the gradient instead.
        norm_importances = np.zeros_like(importances)

    colors = cmap(norm_importances)
    return colors

# Variable-importance chart for each of the four tuned models. The four charts differ only
# in the model and the title, so they are drawn by one loop instead of four copies.
importance_plots = [
    (best_model_smote, 'Variable Importance Plot for SMOTE-Random Forest model'),
    (best_model_undersampler, 'Variable Importance Plot for Random Undersampler-Random Forest model'),
    (best_model_smote_xgb, 'Variable Importance Plot for SMOTE-XGBoost model'),
    (best_model_undersampler_xgb, 'Variable Importance Plot for Random Undersampler-XGBoost model'),
]

for fitted_model, plot_title in importance_plots:
    importances = fitted_model.feature_importances_
    ascending_order = importances.argsort()          # least important first = bottom bar
    bar_colors = generate_color_gradient(importances)
    bar_positions = range(len(importances))

    plt.figure(figsize=(10, 6))
    plt.barh(bar_positions, importances[ascending_order], color=bar_colors[ascending_order])
    # Label with the columns of the frame the models are evaluated on.
    plt.yticks(bar_positions, X_train_scaled.columns[ascending_order])
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.title(plot_title)
    plt.show()

## Voting Classifier 

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

xgb_smote_classifier = best_model_smote_xgb
rf_smote_classifier = best_model_smote

# Soft voting averages the predicted class probabilities of the two tuned models.
ensemble_classifier = VotingClassifier(
    estimators=[
        ('xgb_smote', xgb_smote_classifier),
        ('rf_smote', rf_smote_classifier)
    ],
    voting='soft'
)

# Fit the ensemble on the SMOTE-balanced training data.
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train_scaled, y_train)
ensemble_classifier.fit(X_resampled_smote, y_resampled_smote)

# Evaluate on the original (not resampled) training data and on the test data. The headings
# name the split; the two reports previously printed under the same heading.
y_pred_train = ensemble_classifier.predict(X_train_scaled)

report_ensemble_train = classification_report(y_train, y_pred_train)
print("Classification Report for the Ensemble Classifier (training set):")
print(report_ensemble_train)


y_pred_test = ensemble_classifier.predict(X_test_scaled)

# `report_ensemble` keeps holding the test-set report, as before.
report_ensemble = classification_report(y_test, y_pred_test)
print("Classification Report for the Ensemble Classifier (test set):")
print(report_ensemble)

# Reduced Models

In [None]:
X_train.columns

In [None]:
# Feature subset used by the reduced models. The 'Credit Rating_<n>', 'Mailer Type_...' and
# 'Reward_...' entries are dummy columns produced by the one-hot encoding step.
selected_features = [
    'Q1 Balance',
    'Q2 Balance',
    'Q3 Balance',
    'Q4 Balance',
    'Average Balance',
    'Credit Rating_2',
    'Mailer Type_Postcard',
    'Reward_Cash Back',
    'Credit Rating_1',
    '# Credit Cards Held',
]

X_train_scaled_selected = X_train_scaled.loc[:, selected_features]
X_test_scaled_selected = X_test_scaled.loc[:, selected_features]

In [None]:
import joblib
import lightgbm as lgb
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

random_state = 42

# Same model line-up as the full models, fitted on the selected feature subset only.
classifiers = [
    ('reduced_Logistic Regression', LogisticRegression(max_iter=1000)),
    ('reduced_Logistic Ridge', LogisticRegression(penalty='l2', max_iter=1000)),
    ('reduced_Logistic Lasso', LogisticRegression(penalty='l1',solver='liblinear')),
    ('reduced_KNN', KNeighborsClassifier()),
    ('reduced_Gaussian Naive Bayes', GaussianNB()),
    ('reduced_SVM Linear', SVC(kernel='linear', probability=True, random_state=42)),
    ('reduced_Random Forest', RandomForestClassifier(random_state=42)),
    ('reduced_XGBoost', XGBClassifier(random_state=42))
]

# Row label in the results table -> metric function.
metric_functions = {
    'Accuracy': accuracy_score,
    'F1-Score': f1_score,
    'Precision': precision_score,
    'Recall': recall_score,
}

results, classification_reports, reduced_trained_models = {}, {}, {}

for name, clf in classifiers:
    # fit() returns the fitted estimator itself, which is kept for the later cells.
    reduced_trained_models[name] = clf.fit(X_train_scaled_selected, y_train)

    # Training-set performance.
    y_pred = clf.predict(X_train_scaled_selected)
    classification_reports[name] = classification_report(y_train, y_pred)
    results[name] = {
        metric_name: metric(y_train, y_pred)
        for metric_name, metric in metric_functions.items()
    }

results_df = pd.DataFrame(results)
print(results_df)

for name in classification_reports:
    report = classification_reports[name]
    print(f"Classification Report for {name}:\n{report}")

# Persist every fitted reduced model under its display name.
for name, model in reduced_trained_models.items():
    joblib.dump(model, f"{name}_model.joblib")

In [None]:
# Held-out evaluation of the baseline models fitted in the previous cell.
results, classification_reports = {}, {}

for name, clf in reduced_trained_models.items():
    y_pred = clf.predict(X_test_scaled_selected)
    classification_reports[name] = classification_report(y_test, y_pred)
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
    }

# One column per model, one row per metric.
results_df = pd.DataFrame(results)
print(results_df)

for name, report in classification_reports.items():
    print(f"Classification Report for {name}:\n{report}")

In [None]:
import numpy as np
from sklearn.metrics import classification_report, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
import pandas as pd
import joblib

from sklearn.base import clone  # unfitted copy of a classifier template

# Class-balancing strategies applied to the training data only.
resampling_methods = {
    'SMOTE': SMOTE(random_state=42),
    'RandomUnderSampler': RandomUnderSampler(random_state=42)
}

# Unfitted templates; each one is cloned per resampling method in the loop below.
classifiers = [
    ('reduced_Logistic Regression', LogisticRegression(max_iter=1000)),
    ('reduced_Logistic Ridge', LogisticRegression(penalty='l2', max_iter=1000)),
    ('reduced_Logistic Lasso', LogisticRegression(penalty='l1',solver='liblinear')),
    ('reduced_KNN', KNeighborsClassifier()),
    ('reduced_Gaussian Naive Bayes', GaussianNB()),
    ('reduced_SVM Linear', SVC(kernel='linear', probability=True, random_state=42)),
    ('reduced_Random Forest', RandomForestClassifier(random_state=42)),
    ('reduced_XGBoost', XGBClassifier(random_state=42))
]

# All three dicts are keyed by (resampling_name, model_name).
results = {}                          # PR-AUC and F1-optimal threshold (training data)
modified_reduced_trained_models = {}  # fitted estimator
classification_reports = {}           # training-set report at the optimal threshold

for resampling_name, resampling_method in resampling_methods.items():
    print(f"Applying {resampling_name}...")

    X_resampled, y_resampled = resampling_method.fit_resample(X_train_scaled_selected, y_train)

    for name, template in classifiers:
        # Clone so every (resampling, model) key owns its own fitted estimator.
        # Refitting the shared template would leave all keys for this model
        # pointing at one object that only remembers the last resampling method.
        clf = clone(template)
        clf.fit(X_resampled, y_resampled)
        modified_reduced_trained_models[(resampling_name, name)] = clf

        # Scores are taken on the original (un-resampled) training data.
        y_probs = clf.predict_proba(X_train_scaled_selected)[:, 1]

        precision, recall, thresholds = precision_recall_curve(y_train, y_probs)
        pr_auc = auc(recall, precision)

        # precision/recall carry one trailing entry with no matching threshold,
        # so the argmax is restricted to indices that exist in `thresholds`.
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
        optimal_threshold = thresholds[np.argmax(f1_scores[:-1])]
        print(f"Optimal Threshold for {name}: {optimal_threshold}")

        results[(resampling_name, name)] = {
            'PR-AUC': pr_auc,
            'Optimal Threshold': optimal_threshold,
        }

        y_pred_optimal = (y_probs >= optimal_threshold).astype(int)

        report = classification_report(y_train, y_pred_optimal)
        classification_reports[(resampling_name, name)] = report

# Persist one file per (resampling, model) pair.
for (resampling_name, model_name), model in modified_reduced_trained_models.items():
    filename = f"{resampling_name.lower()}_{model_name}_model.joblib"
    joblib.dump(model, filename)


for (resampling_name, model_name), report in classification_reports.items():
    print(f"\nClassification Report for {resampling_name} - {model_name}:\n{report}")

In [None]:
import joblib
# Test-set reports for the resampled models, each thresholded at its OWN
# F1-optimal cut-off (derived from training data, never from the test set).
test_classification_reports = {}

for resampling_name, model_name in modified_reduced_trained_models:

    # Evaluate the persisted artefact rather than the in-memory object.
    model_filename = f"{resampling_name.lower()}_{model_name}_model.joblib"
    trained_model = joblib.load(model_filename)

    # Derive this model's threshold from its training-set precision/recall
    # curve. Reusing a single `optimal_threshold` left over from another cell
    # would apply one model's cut-off to every model.
    y_probs_train = trained_model.predict_proba(X_train_scaled_selected)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_train, y_probs_train)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
    # The last precision/recall entry has no matching threshold; skip it.
    optimal_threshold = thresholds[np.argmax(f1_scores[:-1])]

    y_probs_test = trained_model.predict_proba(X_test_scaled_selected)[:, 1]
    y_pred_optimal_test = (y_probs_test >= optimal_threshold).astype(int)

    test_report = classification_report(y_test, y_pred_optimal_test)
    test_classification_reports[(resampling_name, model_name)] = test_report


for (resampling_name, model_name), test_report in test_classification_reports.items():
    print(f"\nClassification Report for {resampling_name} - {model_name} (on test set):\n{test_report}")


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report

from imblearn.pipeline import Pipeline as ImbPipeline  # pipeline that accepts samplers as steps


def tune_with_smote(estimator, grid, X, y, n_splits=5, scoring='f1'):
    """Grid-search ``estimator`` with SMOTE applied inside each CV training fold.

    Resampling before cross-validation lets synthetic points derived from
    validation-fold samples leak into the training folds and scores the models
    on artificially balanced data. Putting SMOTE in the pipeline keeps every
    validation fold untouched.

    Parameters
    ----------
    estimator : unfitted classifier placed at the ``'clf'`` pipeline step.
    grid : dict mapping the estimator's parameter names to candidate values.
    X, y : original (un-resampled) training features and labels.
    n_splits : number of stratified folds.
    scoring : scorer name passed to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV``; the tuned classifier is
    ``search.best_estimator_.named_steps['clf']``.
    """
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', estimator),
    ])
    # Route every grid entry to the classifier step of the pipeline.
    prefixed_grid = {f'clf__{key}': values for key, values in grid.items()}
    search = GridSearchCV(pipeline, prefixed_grid, cv=StratifiedKFold(n_splits=n_splits),
                          scoring=scoring, n_jobs=-1)
    search.fit(X, y)
    return search


def _strip_step_prefix(params):
    """Drop the ``clf__`` pipeline prefix so parameters print under their plain names."""
    return {key.split('__', 1)[1]: value for key, value in params.items()}


param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Resampled copy of the full training set, kept under these names for later
# cells; it is no longer the input of the grid searches below.
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train_scaled_selected, y_train)

rf_classifier_smote = RandomForestClassifier(random_state=42)
grid_search_smote = tune_with_smote(rf_classifier_smote, param_grid, X_train_scaled_selected, y_train)

# Bare classifier (not the pipeline), so later cells receive the same type as before.
best_reduced_model_smote = grid_search_smote.best_estimator_.named_steps['clf']

print("Best parameters for SMOTE model:", _strip_step_prefix(grid_search_smote.best_params_))

y_pred_smote = best_reduced_model_smote.predict(X_test_scaled_selected)

report_smote = classification_report(y_test, y_pred_smote)
print("Classification Report for the tuned SMOTE model:")
print(report_smote)


param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 3, 5],
}

xgb_classifier_smote = XGBClassifier(random_state=42)
grid_search_smote_xgb = tune_with_smote(xgb_classifier_smote, param_grid_xgb, X_train_scaled_selected, y_train)

best_reduced_model_smote_xgb = grid_search_smote_xgb.best_estimator_.named_steps['clf']

print("Best parameters for SMOTE XGBoost model:", _strip_step_prefix(grid_search_smote_xgb.best_params_))

y_pred_smote_xgb = best_reduced_model_smote_xgb.predict(X_test_scaled_selected)

report_smote_xgb = classification_report(y_test, y_pred_smote_xgb)
print("Classification Report for the tuned SMOTE XGBoost model:")
print(report_smote_xgb)

In [None]:
# In-sample (training-set) reports for the two tuned models; compare them with
# the test-set reports printed by the tuning cell to judge overfitting.
y_pred_smote = best_reduced_model_smote.predict(X_train_scaled_selected)
y_pred_smote_xgb = best_reduced_model_smote_xgb.predict(X_train_scaled_selected)

report_smote = classification_report(y_train, y_pred_smote)
report_smote_xgb = classification_report(y_train, y_pred_smote_xgb)

for _heading, _report_text in (
    ("Classification Report for the tuned SMOTE model:", report_smote),
    ("Classification Report for the tuned SMOTE XGBoost model:", report_smote_xgb),
):
    print(_heading)
    print(_report_text)

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
import joblib

# Base learners for the ensemble: the persisted SMOTE KNN plus the two tuned
# models produced by the grid-search cell.
# NOTE(review): VotingClassifier.fit is expected to refit copies of these
# estimators rather than reuse their fitted state; confirm in the sklearn docs.
best_reduced_model_knn = joblib.load("smote_reduced_KNN_model.joblib")
rf_reduced_smote = best_reduced_model_smote
xgb_reduced_smote = best_reduced_model_smote_xgb

_ensemble_members = [
    ('xgb_smote', xgb_reduced_smote),
    ('rf_smote', rf_reduced_smote),
    ('KNN', best_reduced_model_knn),
]
# Soft voting combines the members' predicted probabilities.
voting_classifier = VotingClassifier(estimators=_ensemble_members, voting='soft')
voting_classifier.fit(X_resampled_smote, y_resampled_smote)

# First report is on the training split, second on the test split.
for _X_eval, _y_eval in (
    (X_train_scaled_selected, y_train),
    (X_test_scaled_selected, y_test),
):
    y_pred_ensemble = voting_classifier.predict(_X_eval)
    report_ensemble = classification_report(_y_eval, y_pred_ensemble)
    print("Classification Report for the Ensemble Model:")
    print(report_ensemble)

### Partial Dependence Plots

In [None]:
from sklearn.inspection import PartialDependenceDisplay

import matplotlib.pyplot as plt
# Features to draw, one partial-dependence figure each, in list order.
features_to_plot = [    'Q1 Balance', 'Q2 Balance', 'Q3 Balance','Q4 Balance', 
    'Average Balance', 'Credit Rating_2', 'Mailer Type_Postcard',
    'Reward_Cash Back','Credit Rating_1',  '# Credit Cards Held']

# One loop instead of ten copy-pasted calls; each figure shows the partial
# dependence of the tuned SMOTE random forest on a single feature, evaluated
# over the training frame.
for feature in features_to_plot:
    disp1 = PartialDependenceDisplay.from_estimator(
        best_reduced_model_smote, X_train_scaled_selected, [feature]
    )
    plt.show()