<a href="https://colab.research.google.com/github/Requenamar3/datawrangling/blob/main/TemplateForBinaryClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from time import time
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier, Perceptron
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB, MultinomialNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

In [None]:
# The file is read with ';' as the field separator.
bank_csv_url = 'https://raw.githubusercontent.com/fenago/datasets/main/bank-full.csv'
df = pd.read_csv(bank_csv_url, sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# Two ground rules before modelling: every feature must be numeric and no
# value may be missing. Any further data wrangling also belongs in this step.

# Replace every missing value with 0.
df = df.fillna(0)

# Number of distinct values in each column.
print(df.nunique())

age            77
job            12
marital         3
education       4
default         2
balance      7168
housing         2
loan            2
contact         3
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
poutcome        4
y               2
dtype: int64


In [None]:
# A text/object target must be turned into integers before modelling;
# LabelEncoder assigns one integer code to each distinct label.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Learn the label set from the target column, then overwrite the column with its codes.
le.fit(df['y'])
df['y'] = le.transform(df['y'])

# Show which original label received which integer code.
mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(mapping)

{'no': 0, 'yes': 1}


In [None]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [None]:
# Feature frame: every column except the target 'y'.
X = df.drop(columns=['y'])

In [None]:
y = df['y']

In [None]:
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


In [None]:
X = pd.get_dummies(X)

In [None]:
X.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9043 entries, 3776 to 11677
Data columns (total 51 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  9043 non-null   int64
 1   balance              9043 non-null   int64
 2   day                  9043 non-null   int64
 3   duration             9043 non-null   int64
 4   campaign             9043 non-null   int64
 5   pdays                9043 non-null   int64
 6   previous             9043 non-null   int64
 7   job_admin.           9043 non-null   uint8
 8   job_blue-collar      9043 non-null   uint8
 9   job_entrepreneur     9043 non-null   uint8
 10  job_housemaid        9043 non-null   uint8
 11  job_management       9043 non-null   uint8
 12  job_retired          9043 non-null   uint8
 13  job_self-employed    9043 non-null   uint8
 14  job_services         9043 non-null   uint8
 15  job_student          9043 non-null   uint8
 16  job_technician      

In [None]:
# Candidate classifiers as (label, estimator) pairs; the label is what the
# comparison report and the boxplot display for each model.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('SGD', SGDClassifier()),
    ('Ridge', RidgeClassifier()),
    ('PAC', PassiveAggressiveClassifier()),
    ('Perceptron', Perceptron()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('QDA', QuadraticDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('NearestCentroid', NearestCentroid()),
    ('CART', DecisionTreeClassifier()),
    ('ExtraTree', ExtraTreeClassifier()),
    ('NB', GaussianNB()),
    ('BNB', BernoulliNB()),
    # Disabled candidates.
    # NOTE(review): presumably switched off because these two require
    # non-negative features -- confirm before re-enabling.
    # ('ComplementNB', ComplementNB()),
    # ('MultinomialNB', MultinomialNB()),
    ('SVM', SVC(gamma='auto')),
    ('NuSVC', NuSVC()),
    ('LinearSVC', LinearSVC()),
    ('RF', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
    ('ExtraTrees', ExtraTreesClassifier()),
    ('Bagging', BaggingClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('MLP', MLPClassifier()),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
]

In [None]:
# Compare every candidate model with 10-fold cross-validation on the training split.
#
# Scoring metrics:
# 'accuracy', 'balanced_accuracy', 'average_precision', 'neg_brier_score', 'f1', 'f1_micro', 'f1_macro', 'f1_weighted',
# 'neg_log_loss', 'precision', 'recall', 'jaccard', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted'
results = []
names = []
scoring = 'accuracy'

# The splitter does not depend on the model, so build it once. The fixed seed
# gives every model exactly the same folds.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

for name, model in models:
    # Clock 1: a single fit on the whole training split.
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start

    # Clock 2: the cross-validation run, timed on its own so the figure does
    # not silently include the fit above. This is fit + score over all folds,
    # not pure prediction time.
    start = time()
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    cv_time = time() - start

    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    print("Score for each of the 10 K-fold tests: ", cv_results)
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tCross-validation time: %0.3fs" % cv_time)
    print()

# One box per model showing the spread of its per-fold scores.
fig = pyplot.figure(figsize=(15, 10))  # Change the numbers as needed
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names, rotation=45)  # Rotated so long model names stay readable
ax.set_ylabel(scoring)
pyplot.show()

LR: 0.901709 (0.003658)
Score for each of the 10 K-fold tests:  [0.90489356 0.89687586 0.90295825 0.90572297 0.90710534 0.89881117
 0.8954935  0.90185236 0.90348451 0.89988938]
LogisticRegression(solver='liblinear')
	Training time: 0.810s
	Prediction time: 8.504s

SGD: 0.750599 (0.205043)
Score for each of the 10 K-fold tests:  [0.88139342 0.86480509 0.88968759 0.87973459 0.72103954 0.38263754
 0.84932264 0.85540503 0.86144912 0.32051991]
SGDClassifier()
	Training time: 0.578s
	Prediction time: 7.524s

Ridge: 0.900216 (0.004240)
Score for each of the 10 K-fold tests:  [0.90047    0.89494056 0.90212884 0.90489356 0.90572297 0.89825823
 0.89134642 0.90240531 0.90293142 0.89905973]
RidgeClassifier()
	Training time: 0.066s
	Prediction time: 1.028s

PAC: 0.849980 (0.060019)
Score for each of the 10 K-fold tests:  [0.88581698 0.74564556 0.84959912 0.87696986 0.88941111 0.88692286
 0.8786287  0.7193807  0.88882743 0.87859513]
PassiveAggressiveClassifier()
	Training time: 0.409s
	Prediction ti



QDA: 0.872788 (0.006786)
Score for each of the 10 K-fold tests:  [0.8656345  0.86342273 0.87973459 0.86978159 0.8855405  0.88111695
 0.86839923 0.87005806 0.87361726 0.87057522]
QuadraticDiscriminantAnalysis()
	Training time: 0.213s
	Prediction time: 2.081s

KNN: 0.882659 (0.004588)
Score for each of the 10 K-fold tests:  [0.88084048 0.87779928 0.89189936 0.88471109 0.88941111 0.87669339
 0.880564   0.88332873 0.88053097 0.88080752]
KNeighborsClassifier()
	Training time: 0.017s
	Prediction time: 32.140s

NearestCentroid: 0.736203 (0.005223)
Score for each of the 10 K-fold tests:  [0.73513962 0.73265137 0.74094553 0.73679845 0.7442632  0.73901023
 0.72822781 0.72850429 0.73423673 0.74225664]
NearestCentroid()
	Training time: 0.023s
	Prediction time: 0.597s

CART: 0.874115 (0.005293)
Score for each of the 10 K-fold tests:  [0.87144042 0.87254631 0.87254631 0.87669339 0.88277578 0.8786287
 0.86508156 0.87641692 0.87859513 0.86642699]
DecisionTreeClassifier()
	Training time: 0.667s
	Predic

KeyboardInterrupt: 

In [None]:
url = "https://raw.githubusercontent.com/fenago/datasets/main/bank-full.csv"

In [None]:
import pandas as pd
bankData = pd.read_csv(url,sep=";")

In [None]:
bankData.sample(15)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
22602,53,services,married,secondary,no,1,no,no,cellular,22,aug,120,10,-1,0,unknown,no
41286,70,retired,married,secondary,no,1853,no,no,telephone,26,aug,314,1,-1,0,unknown,yes
5417,45,services,married,secondary,no,0,yes,yes,unknown,23,may,268,1,-1,0,unknown,no
38556,36,admin.,single,secondary,no,148,yes,no,cellular,15,may,1357,4,374,2,failure,yes
17403,35,blue-collar,single,primary,no,-631,yes,no,cellular,28,jul,124,3,-1,0,unknown,no
37562,28,admin.,single,secondary,no,20,yes,no,cellular,14,may,17,5,370,4,other,no
44816,35,management,married,unknown,no,2326,yes,yes,cellular,16,sep,319,1,-1,0,unknown,yes
43900,27,admin.,single,secondary,no,240,yes,yes,unknown,9,jun,46,1,-1,0,unknown,no
5859,29,technician,single,tertiary,no,354,yes,no,unknown,26,may,255,1,-1,0,unknown,no
25530,40,management,married,tertiary,no,12409,yes,no,cellular,19,nov,140,1,99,2,failure,no


In [None]:
from sklearn.preprocessing import RobustScaler
rob_scaler = RobustScaler()

In [None]:
# Add robust-scaled versions of the three wide-ranging numeric columns.
# A single fit over all three columns scales each column independently, so the
# values match three separate fits, but `rob_scaler` now keeps the statistics
# of every column (not just the last one fitted) and can be reused to
# transform new rows.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics leak into the features. Fitting on
# the training rows only would avoid that.
raw_numeric_cols = ['age', 'balance', 'duration']
scaled_numeric_cols = ['ageScaled', 'balScaled', 'durScaled']
scaled_values = rob_scaler.fit_transform(bankData[raw_numeric_cols].to_numpy())
for position, scaled_col in enumerate(scaled_numeric_cols):
    bankData[scaled_col] = scaled_values[:, position]

In [None]:
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,ageScaled,balScaled,durScaled
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,1.266667,1.25,0.375
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,0.333333,-0.308997,-0.134259
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,-0.4,-0.328909,-0.481481
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,0.533333,0.780236,-0.407407
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,-0.4,-0.329646,0.083333


In [None]:
# The scaled copies replace the raw numeric columns from here on.
bankData = bankData.drop(columns=['age', 'balance', 'duration'])

In [None]:
bankData.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,ageScaled,balScaled,durScaled
0,management,married,tertiary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,1.266667,1.25,0.375
1,technician,single,secondary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.333333,-0.308997,-0.134259
2,entrepreneur,married,secondary,no,yes,yes,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.328909,-0.481481
3,blue-collar,married,unknown,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.533333,0.780236,-0.407407
4,unknown,single,unknown,no,no,no,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.329646,0.083333


In [None]:
# One-hot encode the categorical features: one indicator column per category level.
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]
bankCat = pd.get_dummies(bankData[categorical_cols])

In [None]:
bankCat.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [None]:
bankNum = bankData[['ageScaled','balScaled','day','durScaled','campaign','pdays','previous']]

In [None]:
# Feature matrix: the indicator columns side by side with the numeric columns.
X = pd.concat([bankCat, bankNum], axis='columns')
print(X.shape)
# Target vector: the 'y' label column of the bank data.
Y = bankData['y']
print(Y.shape)
X.head()

(45211, 51)
(45211,)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_other,poutcome_success,poutcome_unknown,ageScaled,balScaled,day,durScaled,campaign,pdays,previous
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,1.266667,1.25,5,0.375,1,-1,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0.333333,-0.308997,5,-0.134259,1,-1,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,1,-0.4,-0.328909,5,-0.481481,1,-1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0.533333,0.780236,5,-0.407407,1,-1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,-0.4,-0.329646,5,0.083333,1,-1,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)
# Baseline logistic regression. With only 100 iterations the solver stops
# before converging on these features (several are still unscaled), so give it
# a larger budget. Raise it further if the convergence warning persists.
bankModel = LogisticRegression(max_iter=1000)

In [None]:
bankModel.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predict on the held-out split and report the baseline model's accuracy there.
pred = bankModel.predict(X_test)
test_accuracy = bankModel.score(X_test, y_test)
print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(test_accuracy))

Accuracy of Logistic regression model prediction on test set: 0.90


In [None]:
# Confusion matrix and per-class precision/recall/F1 for the baseline model.
from sklearn.metrics import classification_report, confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)
print(classification_report(y_test, pred))

[[11700   298]
 [ 1086   480]]
              precision    recall  f1-score   support

          no       0.92      0.98      0.94     11998
         yes       0.62      0.31      0.41      1566

    accuracy                           0.90     13564
   macro avg       0.77      0.64      0.68     13564
weighted avg       0.88      0.90      0.88     13564



In [None]:
# Class balance of the training target. 'yes' is the positive (minority)
# class and 'no' is the negative (majority) class.
positive_pct = (y_train == 'yes').mean() * 100
negative_pct = (y_train == 'no').mean() * 100
print('Percentage of positive class :', positive_pct)
print('Percentage of negative class :', negative_pct)

Percentage of negative class : yes    11.764148
Name: y, dtype: float64
Percentage of positive class : no    88.235852
Name: y, dtype: float64


In [None]:
# Collect more data
# Resample the data
# --- Undersample (make the dataset the same size as the minority class (11%))
# --- Oversample (create "fake" data points so that the minority class equals the majority class)
# --- Modified (take the majority and drop it in half and we take the minority and we increase to the halfway point)

In [None]:
# Random undersampling, step 1: put the training features and the training
# target into a single frame so rows can be selected by class.
trainData = pd.concat([X_train, y_train], axis='columns')

In [None]:
trainData.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,y
19100,1,0,0,0,0,0,0,0,0,0,...,0,1,0.8,-0.162979,5,0.236111,1,-1,0,no
37958,1,0,0,0,0,0,0,0,0,0,...,0,0,0.733333,-0.238938,14,0.865741,2,289,19,no
12451,0,1,0,0,0,0,0,0,0,0,...,0,1,0.0,0.385693,1,1.347222,3,-1,0,no
18263,0,0,0,0,1,0,0,0,0,0,...,0,1,1.333333,-0.330383,31,-0.592593,8,-1,0,no
5128,0,0,0,0,0,0,0,1,0,0,...,0,1,-0.466667,-0.14233,21,-0.435185,2,-1,0,no


In [None]:
# Index labels of the minority ("yes") rows.
minority_mask = trainData['y'] == 'yes'
ind = trainData.index[minority_mask]
print(len(ind))

3723


In [None]:
# Separate the minority class
minData = trainData.loc[ind]
print(minData.shape)

(3723, 52)


In [None]:
# Index labels of the majority ("no") rows.
majority_mask = trainData['y'] == 'no'
ind1 = trainData.index[majority_mask]
print(len(ind1))

27924


In [None]:
# Separate the majority class
majData = trainData.loc[ind1]
print(majData.shape)
majData.head()

(27924, 52)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,y
19100,1,0,0,0,0,0,0,0,0,0,...,0,1,0.8,-0.162979,5,0.236111,1,-1,0,no
37958,1,0,0,0,0,0,0,0,0,0,...,0,0,0.733333,-0.238938,14,0.865741,2,289,19,no
12451,0,1,0,0,0,0,0,0,0,0,...,0,1,0.0,0.385693,1,1.347222,3,-1,0,no
18263,0,0,0,0,1,0,0,0,0,0,...,0,1,1.333333,-0.330383,31,-0.592593,8,-1,0,no
5128,0,0,0,0,0,0,0,1,0,0,...,0,1,-0.466667,-0.14233,21,-0.435185,2,-1,0,no


In [None]:
majSample=majData.sample(n=len(ind),random_state=123)

In [None]:
print(majSample.shape)
majSample.head()

(3723, 52)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,y
17387,0,0,0,0,1,0,0,0,0,0,...,0,1,0.666667,0.752212,28,-0.425926,3,-1,0,no
34679,0,1,0,0,0,0,0,0,0,0,...,0,0,0.8,0.086283,5,-0.106481,7,250,3,no
26572,1,0,0,0,0,0,0,0,0,0,...,0,1,0.466667,1.785398,20,-0.134259,2,-1,0,no
3280,0,0,0,0,0,1,0,0,0,0,...,0,1,1.2,1.972714,15,-0.009259,1,-1,0,no
4434,0,0,0,0,1,0,0,0,0,0,...,0,1,-0.133333,2.011062,20,-0.055556,1,-1,0,no


In [None]:
# Stack the minority rows on top of the down-sampled majority rows to form
# the balanced training frame.
balData = pd.concat([minData, majSample], axis='index')

In [None]:
# Shuffle the balanced frame so the two stacked class groups are interleaved.
# The fixed seed keeps the row order, and anything fitted on it, reproducible
# between runs.
from sklearn.utils import shuffle
balData = shuffle(balData, random_state=123)
balData.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_success,poutcome_unknown,ageScaled,balScaled,day,durScaled,campaign,pdays,previous,y
4434,0,0,0,0,1,0,0,0,0,0,...,0,1,-0.133333,2.011062,20,-0.055556,1,-1,0,no
39888,0,0,0,0,0,0,0,0,0,1,...,0,1,0.0,0.839971,2,-0.041667,3,-1,0,yes
37590,0,1,0,0,0,0,0,0,0,0,...,0,1,-0.266667,-0.140855,14,0.027778,1,-1,0,no
40668,0,0,0,0,0,0,0,1,0,0,...,0,1,1.2,-0.330383,6,0.810185,1,-1,0,yes
26954,0,1,0,0,0,0,0,0,0,0,...,0,1,0.466667,-0.168879,21,2.541667,1,-1,0,yes


In [None]:
# Build the balanced training features and target.
# Remove the target by name rather than by a hard-coded column count, so the
# split stays correct if the number of feature columns changes.
X_trainNew = balData.drop(columns='y')
print(X_trainNew.head())
y_trainNew = balData['y']
print(y_trainNew.head())

       job_admin.  job_blue-collar  job_entrepreneur  job_housemaid  \
4434            0                0                 0              0   
39888           0                0                 0              0   
37590           0                1                 0              0   
40668           0                0                 0              0   
26954           0                1                 0              0   

       job_management  job_retired  job_self-employed  job_services  \
4434                1            0                  0             0   
39888               0            0                  0             0   
37590               0            0                  0             0   
40668               0            0                  0             1   
26954               0            0                  0             0   

       job_student  job_technician  ...  poutcome_other  poutcome_success  \
4434             0               0  ...               0              

In [None]:
# Logistic regression trained on the undersampled (balanced) training set.
from sklearn.linear_model import LogisticRegression
# The default budget of 100 iterations stops before the solver converges on
# these features, so allow more. Raise it further if the warning persists.
bankModel1 = LogisticRegression(max_iter=1000)
bankModel1.fit(X_trainNew, y_trainNew)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Score the undersampled model on the same, untouched test split.
pred = bankModel1.predict(X_test)
undersampled_test_accuracy = bankModel1.score(X_test, y_test)
print('Accuracy of Logistic regression model prediction on test set for balanced data set: {:.2f}'.format(undersampled_test_accuracy))

Accuracy of Logistic regression model prediction on test set for balanced data set: 0.83


In [None]:
# Confusion matrix and per-class report for the undersampled model.
from sklearn.metrics import classification_report, confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)
print(classification_report(y_test, pred))

[[9969 2029]
 [ 278 1288]]
              precision    recall  f1-score   support

          no       0.97      0.83      0.90     11998
         yes       0.39      0.82      0.53      1566

    accuracy                           0.83     13564
   macro avg       0.68      0.83      0.71     13564
weighted avg       0.91      0.83      0.85     13564



In [None]:
# Oversampling
!pip install smote-variants

In [None]:
# Class counts in the training split before oversampling.
yes_before = (y_train == 'yes').sum()
no_before = (y_train == 'no').sum()
print("Before OverSampling count of yes: {}".format(yes_before))
print("Before OverSampling count of no: {} \n".format(no_before))

Before OverSampling count of yes: 3723
Before OverSampling count of no: 27924 



In [None]:
import smote_variants as sv
import numpy as np

In [None]:
# Instantiate the SMOTE oversampler with a fixed seed so the synthetic
# minority samples are the same on every run.
oversampler = sv.SMOTE(random_state=123)

In [None]:
# Creating new training set
X_train_os, y_train_os = oversampler.sample(np.array(X_train), np.array(y_train))

2024-02-13 01:36:36,588:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
INFO:smote_variants:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'nn_params': {}, 'n_jobs': 1, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'random', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'SMOTE'}")
2024-02-13 01:36:36,639:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
INFO:smote_variants:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-02-13 01:36:36,656:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
INFO:smote_variants:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2024-02-13 01:36:36,943:INFO:SMOTE: simplex sampl

In [None]:
# Shapes and class counts of the oversampled training set.
yes_after = (y_train_os == 'yes').sum()
no_after = (y_train_os == 'no').sum()
print('After OverSampling, the shape of train_X: {}'.format(X_train_os.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_os.shape))
print("After OverSampling, counts of label 'Yes': {}".format(yes_after))
print("After OverSampling, counts of label 'no': {}".format(no_after))

After OverSampling, the shape of train_X: (55848, 51)
After OverSampling, the shape of train_y: (55848,) 

After OverSampling, counts of label 'Yes': 27924
After OverSampling, counts of label 'no': 27924


In [None]:
# Logistic regression trained on the SMOTE-oversampled training set.
from sklearn.linear_model import LogisticRegression
# The default budget of 100 iterations stops before the solver converges on
# these features, so allow more. Raise it further if the warning persists.
bankModel2 = LogisticRegression(max_iter=1000)
bankModel2.fit(X_train_os, y_train_os)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
pred = bankModel2.predict(X_test)



In [None]:
print('Accuracy of Logistic regression model prediction on test set for Smote balanced data set: {:.2f}'.format(bankModel2.score(X_test, y_test)))

Accuracy of Logistic regression model prediction on test set for Smote balanced data set: 0.84




In [None]:
# Confusion matrix and per-class report for the SMOTE-trained model.
from sklearn.metrics import classification_report, confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)
print(classification_report(y_test, pred))

[[10152  1846]
 [  325  1241]]
              precision    recall  f1-score   support

          no       0.97      0.85      0.90     11998
         yes       0.40      0.79      0.53      1566

    accuracy                           0.84     13564
   macro avg       0.69      0.82      0.72     13564
weighted avg       0.90      0.84      0.86     13564

