# Final Project - Credit Card Offer Acceptance in Banking

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('creditcardmarketing-bbm.csv')

In [3]:
df.head()

Unnamed: 0,index,Customer Number,Offer Accepted,Reward,Mailer Type,Income Level,# Bank Accounts Open,Overdraft Protection,Credit Rating,# Credit Cards Held,# Homes Owned,Household Size,Own Your Home,Average Balance,Q1 Balance,Q2 Balance,Q3 Balance,Q4 Balance
0,0,1,No,Air Miles,Letter,High,1,No,High,2,1,4,No,1160.75,1669.0,877.0,1095.0,1002.0
1,1,2,No,Air Miles,Letter,Medium,1,No,Medium,2,2,5,Yes,147.25,39.0,106.0,78.0,366.0
2,2,3,No,Air Miles,Postcard,High,2,No,Medium,2,1,2,Yes,276.5,367.0,352.0,145.0,242.0
3,3,4,No,Air Miles,Letter,Medium,2,No,High,1,1,4,No,1219.0,1578.0,1760.0,1119.0,419.0
4,4,5,No,Air Miles,Letter,Medium,1,No,Medium,2,1,6,Yes,1211.0,2140.0,1357.0,982.0,365.0


In [4]:
df.columns

Index(['index', 'Customer Number', 'Offer Accepted', 'Reward', 'Mailer Type',
       'Income Level', '# Bank Accounts Open', 'Overdraft Protection',
       'Credit Rating', '# Credit Cards Held', '# Homes Owned',
       'Household Size', 'Own Your Home', 'Average Balance', 'Q1 Balance',
       'Q2 Balance', 'Q3 Balance', 'Q4 Balance'],
      dtype='object')

In [5]:
df.shape

(18000, 18)

In [6]:
duplicate_rows = df[df.duplicated()]
len(duplicate_rows)

0

In [7]:
df['Offer Accepted'].value_counts()

No     16977
Yes     1023
Name: Offer Accepted, dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 18000 non-null  int64  
 1   Customer Number       18000 non-null  int64  
 2   Offer Accepted        18000 non-null  object 
 3   Reward                18000 non-null  object 
 4   Mailer Type           18000 non-null  object 
 5   Income Level          18000 non-null  object 
 6   # Bank Accounts Open  18000 non-null  int64  
 7   Overdraft Protection  18000 non-null  object 
 8   Credit Rating         18000 non-null  object 
 9   # Credit Cards Held   18000 non-null  int64  
 10  # Homes Owned         18000 non-null  int64  
 11  Household Size        18000 non-null  int64  
 12  Own Your Home         18000 non-null  object 
 13  Average Balance       17976 non-null  float64
 14  Q1 Balance            17976 non-null  float64
 15  Q2 Balance         

In [9]:
df.isna().sum()

index                    0
Customer Number          0
Offer Accepted           0
Reward                   0
Mailer Type              0
Income Level             0
# Bank Accounts Open     0
Overdraft Protection     0
Credit Rating            0
# Credit Cards Held      0
# Homes Owned            0
Household Size           0
Own Your Home            0
Average Balance         24
Q1 Balance              24
Q2 Balance              24
Q3 Balance              24
Q4 Balance              24
dtype: int64

In [10]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Unique values in column '{column}':")
    print(unique_values)
    print()

Unique values in column 'index':
[    0     1     2 ... 17997 17998 17999]

Unique values in column 'Customer Number':
[    1     2     3 ... 17998 17999 18000]

Unique values in column 'Offer Accepted':
['No' 'Yes']

Unique values in column 'Reward':
['Air Miles' 'Cash Back' 'Points']

Unique values in column 'Mailer Type':
['Letter' 'Postcard']

Unique values in column 'Income Level':
['High' 'Medium' 'Low']

Unique values in column '# Bank Accounts Open':
[1 2 3]

Unique values in column 'Overdraft Protection':
['No' 'Yes']

Unique values in column 'Credit Rating':
['High' 'Medium' 'Low']

Unique values in column '# Credit Cards Held':
[2 1 3 4]

Unique values in column '# Homes Owned':
[1 2 3]

Unique values in column 'Household Size':
[4 5 2 6 3 1 8 9]

Unique values in column 'Own Your Home':
['No' 'Yes']

Unique values in column 'Average Balance':
[1160.75  147.25  276.5  ...  691.   1649.   1427.25]

Unique values in column 'Q1 Balance':
[1669.   39.  367. ... 1122. 2139. 1070.

In [11]:
df.dtypes

index                     int64
Customer Number           int64
Offer Accepted           object
Reward                   object
Mailer Type              object
Income Level             object
# Bank Accounts Open      int64
Overdraft Protection     object
Credit Rating            object
# Credit Cards Held       int64
# Homes Owned             int64
Household Size            int64
Own Your Home            object
Average Balance         float64
Q1 Balance              float64
Q2 Balance              float64
Q3 Balance              float64
Q4 Balance              float64
dtype: object

In [12]:
for column in df.columns:
    value_counts = df[column].value_counts()
    print(f"Value counts in column '{column}':")
    print(value_counts)
    print()

Value counts in column 'index':
0        1
11998    1
12004    1
12003    1
12002    1
        ..
6005     1
6006     1
6007     1
6008     1
17999    1
Name: index, Length: 18000, dtype: int64

Value counts in column 'Customer Number':
1        1
11999    1
12005    1
12004    1
12003    1
        ..
6006     1
6007     1
6008     1
6009     1
18000    1
Name: Customer Number, Length: 18000, dtype: int64

Value counts in column 'Offer Accepted':
No     16977
Yes     1023
Name: Offer Accepted, dtype: int64

Value counts in column 'Reward':
Air Miles    6061
Cash Back    5999
Points       5940
Name: Reward, dtype: int64

Value counts in column 'Mailer Type':
Postcard    9147
Letter      8853
Name: Mailer Type, dtype: int64

Value counts in column 'Income Level':
Medium    9013
High      4526
Low       4461
Name: Income Level, dtype: int64

Value counts in column '# Bank Accounts Open':
1    13692
2     4012
3      296
Name: # Bank Accounts Open, dtype: int64

Value counts in column 'Ove

In [13]:
def categorize_size(size):
    if size in [1, 2, 3, 4]:
        return "Small"
    else:
        return "Big"
df['Household Size'] = df['Household Size'].apply(categorize_size)

In [14]:
df = df.drop('index',axis=1)

In [15]:
df['Offer Accepted'] = pd.Categorical(df['Offer Accepted'], categories=['No', 'Yes'])
df['Reward'] = pd.Categorical(df['Reward'], categories=['Air Miles', 'Cash Back' ,'Points'])
df['Mailer Type'] = pd.Categorical(df['Mailer Type'], categories=['Letter' ,'Postcard'])
df['Overdraft Protection'] = pd.Categorical(df['Overdraft Protection'], categories=['No' ,'Yes'])
df['Credit Rating'] = pd.Categorical(df['Credit Rating'], categories=['Low', 'Medium', 'High'],ordered=True)
df['Household Size'] = pd.Categorical(df['Household Size'], categories=['Small', 'Big'],ordered=True)
df['Own Your Home'] = pd.Categorical(df['Own Your Home'], categories=['No', 'Yes'])
df['Income Level'] = pd.Categorical(df['Income Level'], categories=['Low', 'Medium', 'High'],ordered=True)

In [16]:
from sklearn.model_selection import train_test_split
trainset, testset = train_test_split(df, test_size=0.2, random_state=100)

print("Training set shape:", trainset.shape)
print("Test set shape:",testset.shape)

Training set shape: (14400, 17)
Test set shape: (3600, 17)


In [17]:
trainset['Offer Accepted'].value_counts()

No     13552
Yes      848
Name: Offer Accepted, dtype: int64

In [18]:
testset['Offer Accepted'].value_counts()

No     3425
Yes     175
Name: Offer Accepted, dtype: int64

In [19]:
trainset.dtypes

Customer Number            int64
Offer Accepted          category
Reward                  category
Mailer Type             category
Income Level            category
# Bank Accounts Open       int64
Overdraft Protection    category
Credit Rating           category
# Credit Cards Held        int64
# Homes Owned              int64
Household Size          category
Own Your Home           category
Average Balance          float64
Q1 Balance               float64
Q2 Balance               float64
Q3 Balance               float64
Q4 Balance               float64
dtype: object

In [20]:
for column in trainset.columns:
    value_counts = trainset[column].value_counts()
    print(f"Value counts in column '{column}':")
    print(value_counts)
    print()

Value counts in column 'Customer Number':
6554     1
10485    1
1123     1
8680     1
16899    1
        ..
8422     1
1214     1
6466     1
7446     1
5641     1
Name: Customer Number, Length: 14400, dtype: int64

Value counts in column 'Offer Accepted':
No     13552
Yes      848
Name: Offer Accepted, dtype: int64

Value counts in column 'Reward':
Air Miles    4812
Cash Back    4808
Points       4780
Name: Reward, dtype: int64

Value counts in column 'Mailer Type':
Postcard    7326
Letter      7074
Name: Mailer Type, dtype: int64

Value counts in column 'Income Level':
Medium    7208
High      3622
Low       3570
Name: Income Level, dtype: int64

Value counts in column '# Bank Accounts Open':
1    10970
2     3192
3      238
Name: # Bank Accounts Open, dtype: int64

Value counts in column 'Overdraft Protection':
No     12254
Yes     2146
Name: Overdraft Protection, dtype: int64

Value counts in column 'Credit Rating':
High      4875
Medium    4771
Low       4754
Name: Credit Rating, d

In [21]:
for column in testset.columns:
    value_counts = testset[column].value_counts()
    print(f"Value counts in column '{column}':")
    print(value_counts)
    print()

Value counts in column 'Customer Number':
3305     1
4661     1
12092    1
994      1
6825     1
        ..
11127    1
866      1
4601     1
3912     1
10064    1
Name: Customer Number, Length: 3600, dtype: int64

Value counts in column 'Offer Accepted':
No     3425
Yes     175
Name: Offer Accepted, dtype: int64

Value counts in column 'Reward':
Air Miles    1249
Cash Back    1191
Points       1160
Name: Reward, dtype: int64

Value counts in column 'Mailer Type':
Postcard    1821
Letter      1779
Name: Mailer Type, dtype: int64

Value counts in column 'Income Level':
Medium    1805
High       904
Low        891
Name: Income Level, dtype: int64

Value counts in column '# Bank Accounts Open':
1    2722
2     820
3      58
Name: # Bank Accounts Open, dtype: int64

Value counts in column 'Overdraft Protection':
No     3068
Yes     532
Name: Overdraft Protection, dtype: int64

Value counts in column 'Credit Rating':
Low       1203
Medium    1201
High      1196
Name: Credit Rating, dtype: in

In [22]:
trainset.isna().sum()

Customer Number          0
Offer Accepted           0
Reward                   0
Mailer Type              0
Income Level             0
# Bank Accounts Open     0
Overdraft Protection     0
Credit Rating            0
# Credit Cards Held      0
# Homes Owned            0
Household Size           0
Own Your Home            0
Average Balance         20
Q1 Balance              20
Q2 Balance              20
Q3 Balance              20
Q4 Balance              20
dtype: int64

In [23]:
testset.isna().sum()

Customer Number         0
Offer Accepted          0
Reward                  0
Mailer Type             0
Income Level            0
# Bank Accounts Open    0
Overdraft Protection    0
Credit Rating           0
# Credit Cards Held     0
# Homes Owned           0
Household Size          0
Own Your Home           0
Average Balance         4
Q1 Balance              4
Q2 Balance              4
Q3 Balance              4
Q4 Balance              4
dtype: int64

In [24]:
#Imputing missing values
Average_Balance_mean=trainset['Average Balance'].mean()
Q1_Balance_mean=trainset['Q1 Balance'].mean()
Q2_Balance_mean=trainset['Q2 Balance'].mean()
Q3_Balance_mean=trainset['Q3 Balance'].mean()
Q4_Balance_mean=trainset['Q4 Balance'].mean()

trainset['Average Balance'].fillna(Average_Balance_mean, inplace=True)
trainset['Q1 Balance'].fillna(Q1_Balance_mean, inplace=True)
trainset['Q2 Balance'].fillna(Q2_Balance_mean, inplace=True)
trainset['Q3 Balance'].fillna(Q3_Balance_mean, inplace=True)
trainset['Q4 Balance'].fillna(Q4_Balance_mean, inplace=True)

testset['Average Balance'].fillna(Average_Balance_mean, inplace=True)
testset['Q1 Balance'].fillna(Q1_Balance_mean, inplace=True)
testset['Q2 Balance'].fillna(Q2_Balance_mean, inplace=True)
testset['Q3 Balance'].fillna(Q3_Balance_mean, inplace=True)
testset['Q4 Balance'].fillna(Q4_Balance_mean, inplace=True)

In [25]:
trainset.isna().sum()

Customer Number         0
Offer Accepted          0
Reward                  0
Mailer Type             0
Income Level            0
# Bank Accounts Open    0
Overdraft Protection    0
Credit Rating           0
# Credit Cards Held     0
# Homes Owned           0
Household Size          0
Own Your Home           0
Average Balance         0
Q1 Balance              0
Q2 Balance              0
Q3 Balance              0
Q4 Balance              0
dtype: int64

# Advanced Analysis

In [26]:
trainset['Offer Accepted'] = trainset['Offer Accepted'].replace({'No': 0, 'Yes': 1})
testset['Offer Accepted'] = testset['Offer Accepted'].replace({'No': 0, 'Yes': 1})

In [28]:
X_train = trainset.drop('Offer Accepted', axis=1)
y_train = trainset['Offer Accepted']
X_test = testset.drop('Offer Accepted', axis=1)
y_test = testset['Offer Accepted']

In [29]:
ordinal_mapping = {
    'Credit Rating': ['Low', 'Medium', 'High'],
    'Household Size': ['Small', 'Big'],
    'Income Level': ['Low', 'Medium', 'High']
}

for column, categories in ordinal_mapping.items():
    X_train[column] = pd.Categorical(X_train[column], categories=categories, ordered=True).codes
    X_test[column] = pd.Categorical(X_test[column], categories=categories, ordered=True).codes

In [30]:
nominal_columns = ['Reward', 'Mailer Type', 'Income Level', 'Overdraft Protection', 'Credit Rating', 'Household Size', 'Own Your Home']

X_train = pd.get_dummies(X_train, columns=nominal_columns, drop_first=True)
X_test = pd.get_dummies(X_test, columns=nominal_columns, drop_first=True)

In [32]:
X_train = X_train.drop(columns=['Customer Number'])
X_test = X_test.drop(columns=['Customer Number'])

### Model fitting

In [33]:
from sklearn.preprocessing import StandardScaler

numerical_columns = ['# Bank Accounts Open', '# Credit Cards Held', '# Homes Owned',
                     'Average Balance', 'Q1 Balance', 'Q2 Balance', 'Q3 Balance', 'Q4 Balance']


scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled[numerical_columns] = scaler.fit_transform(X_train_scaled[numerical_columns])

X_test_scaled = X_test.copy()  
X_test_scaled[numerical_columns] = scaler.transform(X_test_scaled[numerical_columns])

In [34]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib

In [36]:
X_test_scaled

Unnamed: 0,# Bank Accounts Open,# Credit Cards Held,# Homes Owned,Average Balance,Q1 Balance,Q2 Balance,Q3 Balance,Q4 Balance,Reward_Cash Back,Reward_Points,Mailer Type_Postcard,Income Level_1,Income Level_2,Overdraft Protection_Yes,Credit Rating_1,Credit Rating_2,Household Size_1,Own Your Home_Yes
3304,1.578587,-1.129544,-0.472823,0.141101,-0.963221,-1.288749,0.265788,2.200440,0,1,1,1,0,0,0,1,1,1
14086,-0.539532,0.125505,-0.472823,-0.221018,1.270771,0.085905,-0.686385,-1.347451,0,1,0,1,0,0,0,1,0,1
2649,1.578587,1.380554,-0.472823,-0.556605,-0.567180,-0.384025,-0.218460,-0.232043,1,0,0,0,0,0,0,1,1,1
916,1.578587,-1.129544,-0.472823,0.823032,2.019207,-0.111729,0.322011,-0.406939,0,0,1,1,0,1,1,0,1,0
15797,-0.539532,-1.129544,1.876608,1.208097,-0.378051,1.645016,1.767500,0.347970,0,0,1,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7631,-0.539532,0.125505,-0.472823,0.210657,-1.354412,-0.313755,1.373935,0.922628,0,0,0,1,0,0,1,0,0,1
7524,1.578587,0.125505,1.876608,-1.551895,-0.951905,-1.126250,-1.043676,-0.869164,1,0,0,0,1,1,0,0,0,0
3962,-0.539532,-1.129544,-0.472823,0.945650,-0.012724,0.670022,1.092817,0.747732,0,0,1,0,1,0,0,0,0,0
3612,1.578587,0.125505,-0.472823,0.249378,1.812296,-0.302775,-0.269242,-0.869164,0,0,1,1,0,0,1,0,0,1


In [37]:
from imblearn.over_sampling import SMOTE
# Assuming X_train_scaled is your scaled training data and y_train is your corresponding labels
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)


In [38]:
selected_features = [
    'Q1 Balance', 'Q2 Balance', 'Q3 Balance','Q4 Balance', 
    'Average Balance', 'Credit Rating_2', 'Mailer Type_Postcard',
    'Reward_Cash Back','Credit Rating_1',  '# Credit Cards Held']

X_train_scaled_selected = X_train_scaled[selected_features]
X_test_scaled_selected = X_test_scaled[selected_features]

In [39]:
X_train_scaled_selected

Unnamed: 0,Q1 Balance,Q2 Balance,Q3 Balance,Q4 Balance,Average Balance,Credit Rating_2,Mailer Type_Postcard,Reward_Cash Back,Credit Rating_1,# Credit Cards Held
6553,0.983036,0.197897,-0.167677,-0.772793,0.123891,1,1,0,0,1.380554
9951,1.180248,0.454821,0.543278,-0.508664,0.682487,0,1,0,1,-1.129544
9100,-0.840367,-1.268985,-1.469887,-0.883441,-1.723274,1,1,0,0,0.125505
12649,-0.119412,-1.244830,0.135204,2.193301,0.475254,0,1,0,1,1.380554
2840,-0.492822,-0.465274,-0.106013,0.135596,-0.357978,1,1,0,0,-1.129544
...,...,...,...,...,...,...,...,...,...,...
16304,0.803605,1.003804,-1.083577,-1.156493,-0.208828,0,0,1,1,-1.129544
79,1.126903,1.636233,1.682257,1.427685,2.272942,0,1,1,0,-1.129544
12119,-0.680335,-0.067811,1.513587,0.985091,0.670297,1,1,1,0,-1.129544
14147,-1.058594,0.865460,2.074008,-0.665713,0.365543,0,0,0,1,0.125505


In [41]:
column_names = X_train_scaled_selected.columns
column_names 

Index(['Q1 Balance', 'Q2 Balance', 'Q3 Balance', 'Q4 Balance',
       'Average Balance', 'Credit Rating_2', 'Mailer Type_Postcard',
       'Reward_Cash Back', 'Credit Rating_1', '# Credit Cards Held'],
      dtype='object')

## Applying  balancing techniques

In [None]:
import numpy as np
from sklearn.metrics import classification_report, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
import pandas as pd
import joblib

resampling_methods = {
    'SMOTE': SMOTE(random_state=42)
}

classifiers = [
    ('Logistic Regression', LogisticRegression()),
    ('Logistic Ridge', LogisticRegression(penalty='l2', max_iter=1000)),
    ('Logistic Lasso', LogisticRegression(penalty='l1', solver='liblinear')),
    ('KNN', KNeighborsClassifier()),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('SVM Linear', SVC(kernel='linear', probability=True, random_state=42))
]

results = {}
modified_trained_models = {}
classification_reports = {}

for resampling_name, resampling_method in resampling_methods.items():
    print(f"Applying {resampling_name}...")
    
    X_resampled, y_resampled = resampling_method.fit_resample(X_train_scaled, y_train)
    
    for name, clf in classifiers:
        clf.fit(X_resampled, y_resampled)
        modified_trained_models[(resampling_name, name)] = clf 

        y_probs = clf.predict_proba(X_train_scaled)[:, 1]

        precision, recall, thresholds = precision_recall_curve(y_train, y_probs)

        pr_auc = auc(recall, precision)
        
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
        optimal_threshold = thresholds[np.argmax(f1_scores)]
        print(f"Optimal Threshold for {name}: {optimal_threshold}")
        
        y_pred_optimal = (y_probs >= optimal_threshold).astype(int)

        report = classification_report(y_train, y_pred_optimal)
        classification_reports[(resampling_name, name)] = report

for (resampling_name, model_name), model in modified_trained_models.items():
    filename = f"{resampling_name.lower()}_{model_name}_model.joblib"
    joblib.dump(model, filename)

for (resampling_name, model_name), report in classification_reports.items():
    print(f"\nClassification Report for {resampling_name} - {model_name}:\n{report}")


## Hyper parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train_scaled, y_train)

rf_classifier_smote = RandomForestClassifier(random_state=42)
grid_search_smote = GridSearchCV(rf_classifier_smote, param_grid, cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
grid_search_smote.fit(X_resampled_smote, y_resampled_smote)

best_model_smote = grid_search_smote.best_estimator_

print("Best parameters for SMOTE model:", grid_search_smote.best_params_)

y_pred_smote = best_model_smote.predict(X_test_scaled)

report_smote = classification_report(y_test, y_pred_smote)
print("Classification Report for the tuned SMOTE model:")
print(report_smote)


undersampler = RandomUnderSampler(random_state=42)
X_resampled_undersampler, y_resampled_undersampler = undersampler.fit_resample(X_train_scaled, y_train)

rf_classifier_undersampler = RandomForestClassifier(random_state=42)
grid_search_undersampler = GridSearchCV(rf_classifier_undersampler, param_grid, cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
grid_search_undersampler.fit(X_resampled_undersampler, y_resampled_undersampler)

best_model_undersampler = grid_search_undersampler.best_estimator_


print("Best parameters for Random Undersampler model:", grid_search_undersampler.best_params_)

y_pred_undersampler = best_model_undersampler.predict(X_test_scaled)

report_undersampler = classification_report(y_test, y_pred_undersampler)
print("Classification Report for the tuned Random Undersampler model:")
print(report_undersampler)

In [None]:
y_pred_smote = best_model_smote.predict(X_train_scaled)

report_smote = classification_report(y_train, y_pred_smote)
print("Classification Report for the tuned SMOTE model:")
print(report_smote)

y_pred_undersampler = best_model_undersampler.predict(X_train_scaled)
report_undersampler = classification_report(y_train, y_pred_undersampler)
print("Classification Report for the tuned Random Undersampler model:")
print(report_undersampler)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import StratifiedKFold

param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 3, 5],
}

smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train_scaled, y_train)

xgb_classifier_smote = XGBClassifier(random_state=42)
grid_search_smote_xgb = GridSearchCV(xgb_classifier_smote, param_grid_xgb, cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
grid_search_smote_xgb.fit(X_resampled_smote, y_resampled_smote)

best_model_smote_xgb = grid_search_smote_xgb.best_estimator_

print("Best parameters for SMOTE XGBoost model:", grid_search_smote_xgb.best_params_)

y_pred_smote_xgb = best_model_smote_xgb.predict(X_test_scaled)

report_smote_xgb = classification_report(y_test, y_pred_smote_xgb)
print("Classification Report for the tuned SMOTE XGBoost model:")
print(report_smote_xgb)

undersampler = RandomUnderSampler(random_state=42)
X_resampled_undersampler, y_resampled_undersampler = undersampler.fit_resample(X_train_scaled, y_train)

xgb_classifier_undersampler = XGBClassifier(random_state=42)
grid_search_undersampler_xgb = GridSearchCV(xgb_classifier_undersampler, param_grid_xgb, cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
grid_search_undersampler_xgb.fit(X_resampled_undersampler, y_resampled_undersampler)

best_model_undersampler_xgb = grid_search_undersampler_xgb.best_estimator_

print("Best parameters for Random Undersampler XGBoost model:", grid_search_undersampler_xgb.best_params_)

y_pred_undersampler_xgb = best_model_undersampler_xgb.predict(X_test_scaled)

report_undersampler_xgb = classification_report(y_test, y_pred_undersampler_xgb)
print("Classification Report for the tuned Random Undersampler XGBoost model:")
print(report_undersampler_xgb)

In [None]:
y_pred_smote = best_model_smote_xgb.predict(X_train_scaled)

report_smote = classification_report(y_train, y_pred_smote)
print("Classification Report for the tuned SMOTE model:")
print(report_smote)


y_pred_undersampler = best_model_undersampler_xgb.predict(X_train_scaled)

report_undersampler = classification_report(y_train, y_pred_undersampler)
print("Classification Report for the tuned Random Undersampler model:")
print(report_undersampler)

## Voting Classifier 

In [None]:
from sklearn.ensemble import VotingClassifier

xgb_smote_classifier = best_model_smote_xgb
rf_smote_classifier = best_model_smote

ensemble_classifier = VotingClassifier(
    estimators=[
        ('xgb_smote', xgb_smote_classifier),
        ('rf_smote', rf_smote_classifier)
    ],
    voting='soft' 
)

smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train_scaled, y_train)
ensemble_classifier.fit(X_resampled_smote, y_resampled_smote)


y_pred_train = ensemble_classifier.predict(X_train_scaled)


report_ensemble = classification_report(y_train, y_pred_train)
print("Classification Report for the Ensemble Classifier:")
print(report_ensemble)


y_pred_test = ensemble_classifier.predict(X_test_scaled)

report_ensemble = classification_report(y_test, y_pred_test)
print("Classification Report for the Ensemble Classifier:")
print(report_ensemble)

# Reduced Models

In [None]:
X_train.columns

In [None]:
selected_features = [
    'Q1 Balance', 'Q2 Balance', 'Q3 Balance','Q4 Balance', 
    'Average Balance', 'Credit Rating_2', 'Mailer Type_Postcard',
    'Reward_Cash Back','Credit Rating_1',  '# Credit Cards Held']

X_train_scaled_selected = X_train_scaled[selected_features]
X_test_scaled_selected = X_test_scaled[selected_features]

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib


In [None]:
results = {}
classification_reports = {}

for name, clf in reduced_trained_models.items():
    y_pred = clf.predict(X_test_scaled_selected) 
    classification_report_text = classification_report(y_test, y_pred) 
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    classification_reports[name] = classification_report_text
    results[name] = {'Accuracy': accuracy, 'F1-Score': f1, 'Precision': precision, 'Recall': recall}

results_df = pd.DataFrame(results)
print(results_df)

for name, report in classification_reports.items():
    print(f"Classification Report for {name}:\n{report}")

In [None]:
import numpy as np
from sklearn.metrics import classification_report, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb
import pandas as pd
import joblib

resampling_methods = {
    'SMOTE': SMOTE(random_state=42),
    'RandomUnderSampler': RandomUnderSampler(random_state=42)
}

classifiers = [
    ('reduced_Logistic Regression', LogisticRegression(max_iter=1000)),
    ('reduced_Logistic Ridge', LogisticRegression(penalty='l2', max_iter=1000)),
    ('reduced_Logistic Lasso', LogisticRegression(penalty='l1',solver='liblinear')),
    ('reduced_KNN', KNeighborsClassifier()),
    ('reduced_Gaussian Naive Bayes', GaussianNB()),
    ('reduced_SVM Linear', SVC(kernel='linear', probability=True, random_state=42)),
    ('reduced_Random Forest', RandomForestClassifier(random_state=42)),
    ('reduced_XGBoost', XGBClassifier(random_state=42))
]

results = {}
modified_reduced_trained_models = {}
classification_reports = {}

for resampling_name, resampling_method in resampling_methods.items():
    print(f"Applying {resampling_name}...")
    
    X_resampled, y_resampled = resampling_method.fit_resample(X_train_scaled_selected, y_train)
    
    for name, clf in classifiers:
        clf.fit(X_resampled, y_resampled)
        modified_reduced_trained_models[(resampling_name, name)] = clf 

     
        y_probs = clf.predict_proba(X_train_scaled_selected)[:, 1]


        precision, recall, thresholds = precision_recall_curve(y_train, y_probs)


        pr_auc = auc(recall, precision)
        

        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
        optimal_threshold = thresholds[np.argmax(f1_scores)]
        print(f"Optimal Threshold for {name}: {optimal_threshold}")
        

        y_pred_optimal = (y_probs >= optimal_threshold).astype(int)


        report = classification_report(y_train, y_pred_optimal)
        classification_reports[(resampling_name, name)] = report

for (resampling_name, model_name), model in modified_reduced_trained_models.items():
    filename = f"{resampling_name.lower()}_{model_name}_model.joblib"
    joblib.dump(model, filename)


for (resampling_name, model_name), report in classification_reports.items():
    print(f"\nClassification Report for {resampling_name} - {model_name}:\n{report}")

In [None]:
import joblib
test_classification_reports = {}

for (resampling_name, model_name), clf in modified_reduced_trained_models.items():
    
    
    model_filename = f"{resampling_name.lower()}_{model_name}_model.joblib"
    trained_model = joblib.load(model_filename)

    y_probs_test = trained_model.predict_proba(X_test_scaled_selected)[:, 1]
    y_pred_optimal_test = (y_probs_test >= optimal_threshold).astype(int)

    test_report = classification_report(y_test, y_pred_optimal_test)
    test_classification_reports[(resampling_name, model_name)] = test_report


for (resampling_name, model_name), test_report in test_classification_reports.items():
    print(f"\nClassification Report for {resampling_name} - {model_name} (on test set):\n{test_report}")


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train_scaled_selected, y_train)

rf_classifier_smote = RandomForestClassifier(random_state=42)
grid_search_smote = GridSearchCV(rf_classifier_smote, param_grid, cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
grid_search_smote.fit(X_resampled_smote, y_resampled_smote)

best_reduced_model_smote = grid_search_smote.best_estimator_

print("Best parameters for SMOTE model:", grid_search_smote.best_params_)

y_pred_smote = best_reduced_model_smote.predict(X_test_scaled_selected)

report_smote = classification_report(y_test, y_pred_smote)
print("Classification Report for the tuned SMOTE model:")
print(report_smote)


param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 3, 5],
}

# SMOTE
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train_scaled_selected, y_train)

xgb_classifier_smote = XGBClassifier(random_state=42)
grid_search_smote_xgb = GridSearchCV(xgb_classifier_smote, param_grid_xgb, cv=StratifiedKFold(n_splits=5), scoring='f1', n_jobs=-1)
grid_search_smote_xgb.fit(X_resampled_smote, y_resampled_smote)

best_reduced_model_smote_xgb = grid_search_smote_xgb.best_estimator_

print("Best parameters for SMOTE XGBoost model:", grid_search_smote_xgb.best_params_)

y_pred_smote_xgb = best_reduced_model_smote_xgb.predict(X_test_scaled_selected)

report_smote_xgb = classification_report(y_test, y_pred_smote_xgb)
print("Classification Report for the tuned SMOTE XGBoost model:")
print(report_smote_xgb)

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
import joblib

# Load the pre-trained models
best_reduced_model_knn = joblib.load("smote_reduced_KNN_model.joblib")
xgb_reduced_smote = best_reduced_model_smote_xgb
rf_reduced_smote= best_reduced_model_smote

voting_classifier = VotingClassifier(
    estimators=[
        ('xgb_smote', xgb_reduced_smote),
        ('rf_smote', rf_reduced_smote),
        ('KNN', best_reduced_model_knn)
    ],
    voting='soft'
)

voting_classifier.fit(X_resampled_smote, y_resampled_smote)

y_pred_ensemble = voting_classifier.predict(X_train_scaled_selected)

report_ensemble = classification_report(y_train, y_pred_ensemble)
print("Classification Report for the Ensemble Model:")
print(report_ensemble)
y_pred_ensemble = voting_classifier.predict(X_test_scaled_selected)

report_ensemble = classification_report(y_test, y_pred_ensemble)
print("Classification Report for the Ensemble Model:")
print(report_ensemble)

In [42]:
import sys
sys.executable

'C:\\Users\\vidur\\anaconda3\\python.exe'