# 1. Exploring the dataset

In [69]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score, average_precision_score, recall_score, f1_score,\
roc_auc_score, roc_curve, accuracy_score, classification_report, precision_recall_curve

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline

from collections import Counter

In [2]:
# Load the credit-card transactions CSV into the DataFrame that every later cell builds on.
full_df = pd.read_csv('../input/creditcardfraud/creditcard.csv')

In [3]:
# Column dtypes and non-null counts; used to check for missing values.
full_df.info()


#### There are no null values in any column

#### All columns are encrypted and no logical sense could be made out of them, so resorting to statistical analysis to get forward in preprocessing this data



In [4]:
# Preview the first rows of the raw data.
full_df.head()

In [5]:
# Summary statistics for the numeric columns.
full_df.describe()

## Plotting Class, Time and Amount columns to get sense of their distribution

#### 1. Countplot for Class/Target column

In [6]:
# Share of rows labelled Class == 1, expressed as a percentage of all rows.
fraud_pct = round(full_df['Class'].sum() / len(full_df) * 100, 2)
print('Fraud datapoints = ', fraud_pct, '% of Total datapoints, hence HIGHLY IMBALANCED')

In [7]:
# Bar chart of row counts per class; makes the class imbalance visible.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Class', data=full_df, ax=ax)
# The two bars are relabelled in plotting order: class 0 first, class 1 second.
ax.set_xticklabels(['Non-Fraud', 'Fraud'])
ax.set_title('Distribution of Non-Fraud and Fraud instances')
plt.tight_layout()
plt.show()

#### 2. Fraud Amount vs Time

In [8]:
# Multiplying Amount by the 0/1 Class label zeroes out non-fraud rows,
# so only fraudulent transactions show a non-zero amount over time.
fraud_amount = full_df['Class'] * full_df['Amount']

plt.figure(figsize=(18, 7))
ax = sns.scatterplot(x=full_df['Time'], y=fraud_amount)
ax.set(ylabel='Fraudulent Amount')

# 2. Scaling the dataset

As most of the columns are already scaled with mean ~ 0 and std ~ 1, the only columns remaining to scale are time and amount.

Using Robust Scaler to scale these columns as it is less prone to outliers as compared to Standard Scaler

In [9]:
rob_scaler = RobustScaler()

# fit_transform refits the scaler on every call, so each column is scaled with
# its own statistics; after the loop rob_scaler holds the fit for 'Time'.
for raw_col, scaled_col in (('Amount', 'Scaled Amount'), ('Time', 'Scaled Time')):
    full_df[scaled_col] = rob_scaler.fit_transform(full_df[raw_col].values.reshape(-1, 1))

# The raw columns are replaced by their scaled versions.
full_df.drop(columns=['Time', 'Amount'], inplace=True)

# 3. Splitting the data into Train and Test

In [10]:
X = full_df.drop('Class', axis=1)
y = full_df['Class']

sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Only the last of the five stratified folds is kept as the train/test split.
train_index, test_index = list(sss.split(X, y))[-1]

# Plain numpy arrays are used from here on.
original_Xtrain = X.iloc[train_index].values
original_Xtest = X.iloc[test_index].values
original_ytrain = y.iloc[train_index].values
original_ytest = y.iloc[test_index].values

# Check that the train and test labels are similarly distributed.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

# 4. Performing Random Undersampling

In [11]:
# Shuffle before slicing so the retained non-fraud rows are a random sample.
# The fixed seed makes the subsample identical on every Restart & Run All.
suffled_df = full_df.sample(frac=1, random_state=42)

fraud_df = suffled_df.loc[suffled_df['Class'] == 1]
# Keep exactly as many non-fraud rows as there are fraud rows, rather than
# hard-coding the count (the original used a literal 492).
non_fraud_df = suffled_df.loc[suffled_df['Class'] == 0].iloc[:len(fraud_df)]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle again so the two classes are interleaved.
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

In [12]:
# Relative frequency of each class in the undersampled frame.
class_share = new_df['Class'].value_counts() / len(new_df)
print('Distribution of the Classes in the subsample dataset')
print(class_share)

sns.countplot(x=new_df['Class'], data=new_df).set_title('Equally Distributed Classes', fontsize=14)
plt.show()

## Exploring the Undersampled Data

## Correlation Matrices
#### 1. Original Data

In [13]:
# NOTE: despite its name, ax1 holds the Figure returned by plt.figure.
ax1 = plt.figure(figsize=(30, 15))

# Pairwise correlations over every column of the full (imbalanced) frame.
corr = full_df.corr()
sns.heatmap(
    data=corr,
    annot=True,
    annot_kws={'size': 10},
    cmap='coolwarm_r',
)
plt.title("Imbalanced Correlation Matrix of Imbalanced Original Data", fontsize=14)

#### 2. Undersampled Data

In [14]:
# NOTE: despite its name, ax2 holds the Figure returned by plt.figure.
ax2 = plt.figure(figsize=(30, 15))

# Pairwise correlations over every column of the balanced subsample.
corr = new_df.corr()
sns.heatmap(
    data=corr,
    annot=True,
    annot_kws={'size': 10},
    cmap='coolwarm_r',
)
plt.title("Balanced Correlation Matrix of Undersampled Data", fontsize=14)

In [15]:
f, axes = plt.subplots(ncols=3, nrows=2, figsize=(20, 10))

# Features negatively correlated with Class: the lower the feature value,
# the more likely the transaction is fraudulent.
negative_corr_features = ['V9', 'V10', 'V12', 'V14', 'V16', 'V17']

# axes.flat walks the 2x3 grid row by row, matching the feature order above.
# Building the title from the feature name keeps the two in sync
# (the V16 panel was previously mislabelled as V14).
for feature, panel in zip(negative_corr_features, axes.flat):
    sns.boxplot(x="Class", y=feature, data=new_df, ax=panel)
    panel.set_title(feature + ' vs Class Negative Correlation')

plt.show()

In [16]:
f, axes = plt.subplots(ncols=3, figsize=(20, 4))

# Features positively correlated with Class: the higher the feature value,
# the more likely the transaction is fraudulent.
for feature, panel in zip(('V2', 'V4', 'V11'), axes):
    sns.boxplot(x="Class", y=feature, data=new_df, ax=panel)
    panel.set_title(feature + ' vs Class Positive Correlation')

plt.show()

#### 3. Removing Outliers

These columns have positive correlations with the target class along with corr. value : 

V2  -> +0.5  
V4  -> +0.73 (max positive corr.)  
V11 -> +0.68  

These columns have negative correlations with the target class along with corr. value : 

V9  -> -0.55  
V10 -> -0.63  
V12 -> -0.68  
V14 -> -0.75 (max negative corr.)  
V16 -> -0.59  
V17 -> -0.56  

In [17]:
def remove_outliers(colname, threshold, new_df):
    """Drop rows whose `colname` value lies outside an IQR fence.

    The fence is computed from the fraud rows only (Class == 1) as
    [q25 - threshold * IQR, q75 + threshold * IQR], but it is applied to
    every row of the frame, whatever its class.

    Parameters
    ----------
    colname : column to inspect.
    threshold : multiplier applied to the inter-quartile range.
    new_df : DataFrame containing `colname` and a 'Class' column.

    Returns
    -------
    A new DataFrame without the out-of-range rows; the input is not modified.
    Progress information is printed along the way.
    """
    print('\x1b[5;30;42m', colname, '\x1b[0m')

    fraud_values = new_df.loc[new_df['Class'] == 1, colname].values
    q25, q75 = np.percentile(fraud_values, [25, 75])
    margin = (q75 - q25) * threshold
    vx_lower = q25 - margin
    vx_upper = q75 + margin

    print(colname,'Lower: ',vx_lower)
    print(colname,'Upper: ',vx_upper)

    # Reported for the fraud class only; the drop below affects both classes.
    outliers = [value for value in fraud_values if value < vx_lower or value > vx_upper]
    print(colname,'Outliers: ',outliers)
    print(colname,'Outliers for Fraud Cases: ',len(outliers))

    out_of_range = (new_df[colname] > vx_upper) | (new_df[colname] < vx_lower)
    new_df = new_df.drop(new_df.index[out_of_range])
    print('Number of Instances after outliers removal: ',len(new_df))
    print()
    return new_df

In [18]:
print('Outlier removal from :')
pos_corr_cols = ['V2', 'V4', 'V11']
neg_corr_cols = ['V9', 'V10', 'V12', 'V14', 'V16', 'V17']

# Each group gets a coloured banner, then its columns are filtered one at a
# time; new_df shrinks cumulatively as each column is processed.
for banner, col_group in (('POSITIVE Corr. cols', pos_corr_cols),
                          ('NEGATIVE Corr. cols', neg_corr_cols)):
    print('\x1b[5;30;41m', banner, '\x1b[0m')
    for cols in col_group:
        new_df = remove_outliers(cols, 1.5, new_df)

## Visualizing Data and Performing Dimensionality Reduction using PCA and T-SNE

In [19]:
def dim_reduction(algo, X, y):
    """Project X onto two components and plot the points coloured by class.

    Parameters
    ----------
    algo : 'T-SNE' selects t-SNE; any other value selects PCA. The value is
        also used as the plot title.
    X : DataFrame of features (its `.values` are passed to the reducer).
    y : class labels aligned with X; 0 is drawn as 'No Fraud', 1 as 'Fraud'.

    Returns
    -------
    None. The figure is shown with plt.show().
    """
    if algo == 'T-SNE':
        reducer = TSNE(n_components=2, random_state=42)
    else:
        reducer = PCA(n_components=2, random_state=42)
    X_reduced = reducer.fit_transform(X.values)

    labels = np.asarray(y)
    fig, ax = plt.subplots(figsize=(12, 8))

    # Each class is drawn once with an explicit colour, so the legend entry
    # always matches the points it describes. Coordinates go through
    # matplotlib's scatter, which accepts them positionally.
    for class_value, colour, class_name in ((0, 'purple', 'No Fraud'),
                                            (1, 'yellow', 'Fraud')):
        mask = labels == class_value
        ax.scatter(X_reduced[mask, 0], X_reduced[mask, 1],
                   c=colour, label=class_name, linewidths=2)

    ax.set_title(algo, fontsize=14)
    ax.grid(True)
    ax.legend()
    plt.show()

In [20]:
# Features and labels of the balanced, outlier-filtered subsample.
X = new_df.drop(columns='Class')
y = new_df['Class']

# Same data, two projections: t-SNE first, then PCA.
for algo_name in ('T-SNE', 'PCA'):
    dim_reduction(algo_name, X, y)

## Training Classifier ML algorithms on Random Undersampled Data

In [21]:
# Baseline models with default hyper-parameters.
classifiers = {
    "Logisitic_Regr": LogisticRegression(),
    "K-NN": KNeighborsClassifier(),
    "SVM": SVC()}

X = new_df.drop('Class', axis=1)
y = new_df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Plain numpy arrays are used from here on.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

for key, classifier in classifiers.items():
    # Leaves each estimator in the dict fitted on the training split.
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    # Scale to percent BEFORE rounding; rounding first and multiplying after
    # prints float artefacts such as 94.00000000000001.
    print("Classifiers: ", classifier.__class__.__name__,
          ", training score of :", round(training_score.mean() * 100, 2), "% accuracy score")

In [22]:
# Use GridSearchCV to find the best hyper-parameters for each model.
from sklearn.model_selection import GridSearchCV


# Logistic Regression
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# The default lbfgs solver rejects penalty='l1', so every l1 candidate would
# fail to fit and score NaN (silently, because warnings are suppressed).
# liblinear supports both l1 and l2, so the whole grid is evaluated.
grid_log_reg = GridSearchCV(LogisticRegression(solver='liblinear'), log_reg_params)
grid_log_reg.fit(X_train, y_train)
# Estimator refitted on X_train with the best parameter combination.
log_reg = grid_log_reg.best_estimator_

# K-Nearest Neighbours
knears_params = {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
grid_knears.fit(X_train, y_train)
knears_neighbors = grid_knears.best_estimator_

# Support Vector Classifier
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train, y_train)
svc = grid_svc.best_estimator_

In [24]:
# Cross-validated accuracy of the tuned models on the undersampled training
# split. The undersampling happened before these folds were drawn, so the
# figures are optimistic (the "overfitting case").
log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
knears_score = cross_val_score(knears_neighbors, X_train, y_train, cv=5)
svc_score = cross_val_score(svc, X_train, y_train, cv=5)

for heading, fold_scores in (
        ('Logistic Regression Cross Validation Score: ', log_reg_score),
        ('Knears Neighbors Cross Validation Score', knears_score),
        ('Support Vector Classifier Cross Validation Score', svc_score)):
    print(heading, round(fold_scores.mean() * 100, 2).astype(str) + '%')

# 4. Performing NearMiss Algorithm to deal with class imbalance

In [41]:
# Undersample inside the cross-validation loop rather than before it.
from sklearn.base import clone

undersample_X = full_df.drop('Class', axis=1)
undersample_y = full_df['Class']

# Keep only the last stratified fold as the train/test split. The index arrays
# are no longer printed; they only flooded the cell output.
train_index, test_index = list(sss.split(undersample_X, undersample_y))[-1]

undersample_Xtrain = undersample_X.iloc[train_index].values
undersample_Xtest = undersample_X.iloc[test_index].values
undersample_ytrain = undersample_y.iloc[train_index].values
undersample_ytest = undersample_y.iloc[test_index].values

undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# NearMiss on the whole dataset, only to show the resulting label distribution;
# X_nearmiss / y_nearmiss are not used for training.
X_nearmiss, y_nearmiss = NearMiss().fit_resample(undersample_X.values, undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))

# Cross-validate the right way: NearMiss is applied to each training fold only.
for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    # clone() gives the pipeline its own unfitted copy. Passing log_reg itself
    # would refit that shared estimator in place on every fold.
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), clone(log_reg))
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(undersample_Xtrain[test])

    # Score against this cell's own arrays, not the ones built in another cell.
    fold_truth = undersample_ytrain[test]
    undersample_accuracy.append(accuracy_score(fold_truth, undersample_prediction))
    undersample_precision.append(precision_score(fold_truth, undersample_prediction))
    undersample_recall.append(recall_score(fold_truth, undersample_prediction))
    undersample_f1.append(f1_score(fold_truth, undersample_prediction))
    undersample_auc.append(roc_auc_score(fold_truth, undersample_prediction))

In [54]:
# Let's Plot LogisticRegression Learning Curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

def plot_learning_curve(name, estimator, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    name : label used as the title prefix.
    estimator : model handed to sklearn's learning_curve.
    X, y : training data and labels.
    ylim : optional (low, high) pair for the y-axis limits.
    cv, n_jobs, train_sizes : forwarded unchanged to learning_curve.

    Returns
    -------
    The matplotlib.pyplot module, with the new figure current.
    """
    f, ax = plt.subplots(1, 1, figsize=(10, 6), sharey=True)
    if ylim is not None:
        plt.ylim(*ylim)

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    curves = (
        (train_scores, "#ff9124", "Training score"),
        (test_scores, "#2492ff", "Cross-validation score"),
    )

    # Shaded bands of +/- one standard deviation across the CV splits.
    for fold_scores, colour, _label in curves:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        ax.fill_between(train_sizes, centre - spread, centre + spread,
                        alpha=0.1, color=colour)

    # Mean score per training size, drawn after the bands.
    for fold_scores, colour, curve_label in curves:
        ax.plot(train_sizes, np.mean(fold_scores, axis=1), 'o-',
                color=colour, label=curve_label)

    ax.set_title(name + " Learning Curve", fontsize=14)
    ax.set_xlabel('Training size (m)')
    ax.set_ylabel('Score')
    ax.grid(True)
    ax.legend(loc="best")

    return plt

In [55]:
# 100 random 80/20 splits give smooth mean and standard-deviation curves.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=42)
est_names = ['Logistic Regression', 'K-Nearest Neighbors', 'Support Vector Machine']
estimators = [log_reg, knears_neighbors, svc]

for name, estimator in zip(est_names, estimators):
    plot_learning_curve(name, estimator, X_train, y_train, ylim=(0.87, 1.01), cv=cv, n_jobs=4)

In [59]:
# Out-of-fold scores for each tuned classifier, then the ROC AUC of each.
# ROC AUC ranks samples, so every model is given a continuous score.

log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5,
                             method="decision_function")

# KNN has no decision_function; use the predicted probability of the positive
# class (column 1) instead of hard 0/1 labels, which would reduce the ROC
# curve to a single operating point and make the AUC not comparable.
knears_pred = cross_val_predict(knears_neighbors, X_train, y_train, cv=5,
                                method="predict_proba")[:, 1]

svc_pred = cross_val_predict(svc, X_train, y_train, cv=5,
                             method="decision_function")

print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))
print('KNears Neighbors: ', roc_auc_score(y_train, knears_pred))
print('Support Vector Classifier: ', roc_auc_score(y_train, svc_pred))

In [63]:
# ROC operating points for each classifier, from its out-of-fold scores.
log_fpr, log_tpr, log_thresold = roc_curve(y_train, log_reg_pred)
knear_fpr, knear_tpr, knear_threshold = roc_curve(y_train, knears_pred)
svc_fpr, svc_tpr, svc_threshold = roc_curve(y_train, svc_pred)

def graph_roc_curve_multiple(log_fpr, log_tpr, knear_fpr, knear_tpr, svc_fpr, svc_tpr):
    """Draw the three ROC curves on one figure with their AUC in the legend.

    The six parameters are the false/true positive rate arrays of the logistic
    regression, KNN and SVC models. The AUC shown in each legend entry is
    computed from the module-level y_train and *_pred variables, not from the
    arguments.
    """
    curves = (
        (log_fpr, log_tpr, 'Logistic Regression Classifier Score: {:.4f}', log_reg_pred),
        (knear_fpr, knear_tpr, 'KNears Neighbors Classifier Score: {:.4f}', knears_pred),
        (svc_fpr, svc_tpr, 'Support Vector Classifier Score: {:.4f}', svc_pred),
    )

    plt.figure(figsize=(14,6))
    plt.title('ROC Curve \n Top 3 Classifiers', fontsize=18)
    for fpr, tpr, label_template, scores in curves:
        plt.plot(fpr, tpr, label=label_template.format(roc_auc_score(y_train, scores)))

    # Diagonal = chance level.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)',
                 xy=(0.5, 0.5), xytext=(0.6, 0.3),
                 arrowprops=dict(facecolor='#6E726D', shrink=0.05))
    plt.legend()

graph_roc_curve_multiple(log_fpr, log_tpr, knear_fpr, knear_tpr, svc_fpr, svc_tpr)
plt.show()

# 5. Performing Oversampling using SMOTE

In [64]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV


print('Length of X (train): {} | Length of y (train): {}'.format(len(original_Xtrain), len(original_ytrain)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(original_Xtest), len(original_ytest)))

# Per-fold scores, averaged at the end of the cell.
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []

# The search space is declared before the search object that consumes it.
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Placeholder; log_reg_sm is reassigned in a later cell.
log_reg_sm = LogisticRegression()

# liblinear supports both penalties in the grid. With the default lbfgs solver
# every sampled l1 candidate fails to fit, and a draw of four l1 candidates
# would leave nothing to refit. Seeds make the sampled candidates and the
# synthetic points repeatable.
rand_log_reg = RandomizedSearchCV(LogisticRegression(solver='liblinear'), log_reg_params,
                                  n_iter=4, random_state=42)

# Cross-validate the right way: SMOTE is applied to each training fold only,
# never to the fold used for scoring.
for train, test in sss.split(original_Xtrain, original_ytrain):
    pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority', random_state=42), rand_log_reg)
    model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
    best_est = rand_log_reg.best_estimator_
    prediction = best_est.predict(original_Xtrain[test])

    accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
    precision_lst.append(precision_score(original_ytrain[test], prediction))
    recall_lst.append(recall_score(original_ytrain[test], prediction))
    f1_lst.append(f1_score(original_ytrain[test], prediction))
    auc_lst.append(roc_auc_score(original_ytrain[test], prediction))

print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
# The AUC was collected per fold but never reported.
print("auc: {}".format(np.mean(auc_lst)))
print('---' * 45)

In [67]:
# Evaluate the estimator from the last SMOTE fold on the untouched test split.
labels = ['No Fraud', 'Fraud']
smote_prediction = best_est.predict(original_Xtest)
smote_report = classification_report(original_ytest, smote_prediction, target_names=labels)
print(smote_report)

# Continuous scores are needed for the precision-recall based metric.
y_score = best_est.decision_function(original_Xtest)
average_precision = average_precision_score(original_ytest, y_score)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

In [70]:
fig = plt.figure(figsize=(12, 6))

# Precision/recall pairs over all decision thresholds on the test split.
precision, recall, _ = precision_recall_curve(original_ytest, y_score)

# Step outline plus a filled area underneath it.
plt.step(recall, precision, where='post', color='r', alpha=0.2)
plt.fill_between(recall, precision, step='post', color='#F59B00', alpha=0.2)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

pr_title = 'OverSampling Precision-Recall curve: \n Average Precision-Recall Score ={0:0.2f}'.format(
    average_precision)
plt.title(pr_title, fontsize=16)

In [75]:
# SMOTE (oversampling) applied to the full training split, after the split.
from sklearn.base import clone

sm = SMOTE(sampling_strategy=0.6, random_state=42)

# Oversampled training data used to fit the final logistic regression.
Xsm_train, ysm_train = sm.fit_resample(original_Xtrain, original_ytrain)

# grid_log_reg.best_estimator_ is the same object as log_reg. clone() gives an
# unfitted copy with the tuned hyper-parameters, so fitting on the SMOTE data
# does not overwrite the model trained on the undersampled data.
log_reg_sm = clone(grid_log_reg.best_estimator_)
log_reg_sm.fit(Xsm_train, ysm_train)

In [80]:
from sklearn.metrics import confusion_matrix

# Logistic regression fitted on the SMOTE data.
y_pred_log_reg = log_reg_sm.predict(X_test)

# The other two models were fitted on the undersampled data.
y_pred_knear = knears_neighbors.predict(X_test)
y_pred_svc = svc.predict(X_test)

log_reg_cf = confusion_matrix(y_test, y_pred_log_reg)
kneighbors_cf = confusion_matrix(y_test, y_pred_knear)
svc_cf = confusion_matrix(y_test, y_pred_svc)

fig, ax = plt.subplots(2, 2,figsize=(22,12))

panels = (
    (ax[0][0], log_reg_cf, "Logistic Regression \n Confusion Matrix"),
    (ax[0][1], kneighbors_cf, "KNearsNeighbors \n Confusion Matrix"),
    (ax[1][0], svc_cf, "Support Vector Classifier \n Confusion Matrix"),
)
# confusion_matrix orders rows (actual) and columns (predicted) by sorted
# label: 0 then 1.
class_names = ['No Fraud', 'Fraud']

for panel, matrix, title in panels:
    # fmt='d' prints integer counts; the default format renders values of
    # 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(matrix, ax=panel, annot=True, fmt='d', cmap=plt.cm.copper,
                xticklabels=class_names, yticklabels=class_names)
    panel.set_title(title, fontsize=14)
    panel.set_xlabel('Predicted', fontsize=14)
    panel.set_ylabel('Actual', fontsize=14)

# Only three models are shown; hide the unused fourth panel.
ax[1][1].axis('off')

plt.show()

In [82]:
from sklearn.metrics import classification_report

# One report per model, all scored on the undersampled test split.
for heading, predicted in (('Logistic Regression:', y_pred_log_reg),
                           ('KNears Neighbors:', y_pred_knear),
                           ('Support Vector Classifier:', y_pred_svc)):
    print(heading)
    print(classification_report(y_test, predicted))

In [83]:
# Final test-set accuracy of the logistic regression under both techniques.
from sklearn.metrics import accuracy_score

# Undersampling model, scored on the undersampled test split.
y_pred = log_reg.predict(X_test)
undersample_score = accuracy_score(y_test, y_pred)

# SMOTE model, scored on the original (imbalanced) test split.
y_pred_sm = best_est.predict(original_Xtest)
oversample_score = accuracy_score(original_ytest, y_pred_sm)

d = {
    'Technique': ['Random UnderSampling', 'Oversampling (SMOTE)'],
    'Score': [undersample_score, oversample_score],
}
final_df = pd.DataFrame(data=d)

# Take the Score column out and re-insert it at position 1.
score = final_df.pop('Score')
final_df.insert(1, 'Score', score)

# Accuracy on heavily imbalanced data can look high and still be misleading.
final_df

# 9. Testing the Models