<a href="https://colab.research.google.com/github/SwoopGT/Santander-Customer-Satisfaction/blob/master/Santander_Customer_Satisfaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SECTION 1 - PRE PROCESSING

### Import necessary Libraries

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,roc_auc_score,recall_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc, log_loss

%matplotlib inline

### Google Drive Pre Requisite

In [0]:
# Code to read csv file into Colaboratory:

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials 

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [0]:

link = 'https://drive.google.com/open?id=1A2RP-OBRhvd2DfISv_Ykhxp3ZJZVjZ3G'

fluff, id = link.split('=')

print (id) # Verify that you have everything after '='

### Read data from the file.

In [0]:

downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('Train.csv')  
df = pd.read_csv('Train.csv') # Dataset is now stored in a Pandas Dataframe

### Check shape of the dataset

In [5]:
df.shape

(76020, 371)

### Check columns

In [6]:
df.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'TARGET'],
      dtype='object', length=371)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB


### Check for null values

In [8]:
percent_missing = df.isnull().sum() * 100 / len(df)
percent_missing

ID                               0.0
var3                             0.0
var15                            0.0
imp_ent_var16_ult1               0.0
imp_op_var39_comer_ult1          0.0
imp_op_var39_comer_ult3          0.0
imp_op_var40_comer_ult1          0.0
imp_op_var40_comer_ult3          0.0
imp_op_var40_efect_ult1          0.0
imp_op_var40_efect_ult3          0.0
imp_op_var40_ult1                0.0
imp_op_var41_comer_ult1          0.0
imp_op_var41_comer_ult3          0.0
imp_op_var41_efect_ult1          0.0
imp_op_var41_efect_ult3          0.0
imp_op_var41_ult1                0.0
imp_op_var39_efect_ult1          0.0
imp_op_var39_efect_ult3          0.0
imp_op_var39_ult1                0.0
imp_sal_var16_ult1               0.0
ind_var1_0                       0.0
ind_var1                         0.0
ind_var2_0                       0.0
ind_var2                         0.0
ind_var5_0                       0.0
ind_var5                         0.0
ind_var6_0                       0.0
i

### Statistical Information

In [9]:
df.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,imp_op_var41_comer_ult1,imp_op_var41_comer_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var41_ult1,imp_op_var39_efect_ult1,imp_op_var39_efect_ult3,imp_op_var39_ult1,imp_sal_var16_ult1,ind_var1_0,ind_var1,ind_var2_0,ind_var2,ind_var5_0,ind_var5,ind_var6_0,ind_var6,ind_var8_0,ind_var8,ind_var12_0,ind_var12,ind_var13_0,ind_var13_corto_0,ind_var13_corto,ind_var13_largo_0,ind_var13_largo,ind_var13_medio_0,ind_var13_medio,ind_var13,...,saldo_medio_var5_ult1,saldo_medio_var5_ult3,saldo_medio_var8_hace2,saldo_medio_var8_hace3,saldo_medio_var8_ult1,saldo_medio_var8_ult3,saldo_medio_var12_hace2,saldo_medio_var12_hace3,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,saldo_medio_var13_medio_hace2,saldo_medio_var13_medio_hace3,saldo_medio_var13_medio_ult1,saldo_medio_var13_medio_ult3,saldo_medio_var17_hace2,saldo_medio_var17_hace3,saldo_medio_var17_ult1,saldo_medio_var17_ult3,saldo_medio_var29_hace2,saldo_medio_var29_hace3,saldo_medio_var29_ult1,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,3.160715,68.803937,113.056934,68.20514,113.225058,137.242763,68.618087,113.79241,140.403479,5.477676,0.011458,0.003762,0.0,0.0,0.958024,0.66376,0.000105,2.6e-05,0.032833,0.028598,0.067522,0.045462,0.052249,0.042936,0.041476,0.010168,0.009997,2.6e-05,2.6e-05,0.050855,...,1077.256756,1048.856447,68.275452,9.505287,124.620962,110.026575,3997.023,613.534443,5703.008,4401.002,3639.419939,556.184178,4852.261814,3857.848542,771.227449,162.170439,956.9502,750.9563,0.175324,0.0,0.513023,0.344174,91.17181,36.46318,131.0316,109.2169,0.213071,0.00191,0.253907,0.18663,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,95.268204,319.605516,512.154823,531.897917,950.086398,697.712596,535.47375,953.578624,712.76724,465.391149,0.106425,0.061221,0.0,0.0,0.200535,0.472425,0.010258,0.005129,0.178202,0.166674,0.250925,0.208316,0.222531,0.202714,0.19939,0.100325,0.099486,0.005129,0.005129,0.219703,...,9614.906985,8189.948852,1733.838226,519.389157,2205.249804,1935.305713,37773.14,9292.752726,46202.54,35507.18,26359.174223,7182.642532,31886.615189,25572.245055,13082.155867,4698.868075,16006.98,12422.52,34.625518,0.0,113.597559,73.376513,15392.48,8612.395,14956.53,13082.16,41.820444,0.526626,52.078775,31.879418,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-922.38,-476.07,-287.67,0.0,-3401.34,-1844.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,2.73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,90.0,83.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,8237.82,12888.03,16566.81,45990.0,131100.0,47598.09,45990.0,131100.0,47598.09,105000.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,601428.6,544365.57,231351.99,77586.21,228031.8,177582.0,3000538.0,668335.32,3004186.0,2272859.0,450000.0,304838.7,450000.0,450000.0,840000.0,534000.0,1500000.0,1034483.0,7741.95,0.0,30000.0,18870.99,4210084.0,2368559.0,3998687.0,3525777.0,10430.01,145.2,13793.67,7331.34,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


### Balance check

In [10]:
#Checking if the dataset is balanced or unbalanced
BalanceCheck = pd.DataFrame(df["TARGET"].value_counts())
BalanceCheck['Percentage'] = 100*BalanceCheck["TARGET"]/df.shape[0]
BalanceCheck

Unnamed: 0,TARGET,Percentage
0,73012,96.043147
1,3008,3.956853


### Checking the value counts for top 15 values

In [11]:
df["var3"].value_counts()[:15]

 2         74165
 8           138
-999999      116
 9           110
 3           108
 1           105
 13           98
 7            97
 4            86
 12           85
 6            82
 0            75
 10           72
 11           66
 5            63
Name: var3, dtype: int64

### Replacing the -999999 with most common occurance value - 2

In [0]:
df["var3"] = df["var3"].replace(-999999,2)

In [13]:
df.head(5)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,imp_op_var41_comer_ult1,imp_op_var41_comer_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var41_ult1,imp_op_var39_efect_ult1,imp_op_var39_efect_ult3,imp_op_var39_ult1,imp_sal_var16_ult1,ind_var1_0,ind_var1,ind_var2_0,ind_var2,ind_var5_0,ind_var5,ind_var6_0,ind_var6,ind_var8_0,ind_var8,ind_var12_0,ind_var12,ind_var13_0,ind_var13_corto_0,ind_var13_corto,ind_var13_largo_0,ind_var13_largo,ind_var13_medio_0,ind_var13_medio,ind_var13,...,saldo_medio_var5_ult1,saldo_medio_var5_ult3,saldo_medio_var8_hace2,saldo_medio_var8_hace3,saldo_medio_var8_ult1,saldo_medio_var8_ult3,saldo_medio_var12_hace2,saldo_medio_var12_hace3,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,saldo_medio_var13_medio_hace2,saldo_medio_var13_medio_hace3,saldo_medio_var13_medio_ult1,saldo_medio_var13_medio_ult3,saldo_medio_var17_hace2,saldo_medio_var17_hace3,saldo_medio_var17_ult1,saldo_medio_var17_ult3,saldo_medio_var29_hace2,saldo_medio_var29_hace3,saldo_medio_var29_ult1,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,122.22,300.0,240.75,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3.0,2.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,0.0,195.0,195.0,0.0,0.0,195.0,0.0,0.0,195.0,0.0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,91.56,138.84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,...,40501.08,13501.47,0.0,0.0,0.0,0.0,0.0,0.0,85501.89,85501.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


### Define the independent and dependent variables

In [0]:
X = df.drop(["TARGET","ID"],axis=1)
y = df["TARGET"]

# SECTION 2 - MACHINE LEARNING - UNBALANCED DATASET

### Train Test Split

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, random_state = 42)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (60816, 369) (60816,)
Test set: (15204, 369) (15204,)


### Scale the data

In [0]:
sdc = StandardScaler()
X_train = sdc.fit_transform(X_train)
X_test = sdc.transform(X_test)

### Random Forest Classifier

In [17]:
# Fit and Predict
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred=rf.predict(X_test)

# Probability Prediction and ROC parameters
rf_pred_proba = rf.predict_proba(X_test)[:, 1]
[fpr_rf, tpr_rf, thr_rf] = roc_curve(y_test, rf_pred_proba)

# Metrics for evaluation
rf_accuracy_score = accuracy_score(y_test, rf_pred)
rf_log_loss = log_loss(y_test,rf_pred_proba)
rf_auc = auc(fpr_rf, tpr_rf)
rf_confusion_matrix = confusion_matrix(y_test,rf_pred )
rf_classification_report = classification_report(y_test, rf_pred)

# Results
print('Random Forest Results:\n')
print(" Accuracy Score" ,rf_accuracy_score*100)
print(" Log_loss", rf_log_loss )
print(" ROC AUC" ,rf_auc )
print("\n Confusion Matrix \n\n" ,rf_confusion_matrix)
print("\n Classification Report \n\n " ,rf_classification_report)



Random Forest Results:

 Accuracy Score 95.38937121810051
 Log_loss 0.7401019340496234
 ROC AUC 0.6701080732551058

 Confusion Matrix 

 [[14484   113]
 [  588    19]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.96      0.99      0.98     14597
           1       0.14      0.03      0.05       607

    accuracy                           0.95     15204
   macro avg       0.55      0.51      0.51     15204
weighted avg       0.93      0.95      0.94     15204



### Precision and Recall

In [18]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test,rf_pred)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.96098726 0.14393939]
recall: [0.99225868 0.03130148]


### Frequency for Random Forest Prediction

In [19]:
unique_elements, counts_elements = np.unique(rf_pred, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [15072   132]]


### Frequency for test

In [20]:
unique_elements, counts_elements = np.unique(y_test, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [14597   607]]


### Gradient Boosting Classifier

In [21]:
# Fit and Predict
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)


# Probability Prediction and ROC parameters
xgb_pred_proba = xgb.predict_proba(X_test)[:, 1]
[fpr_xgb, tpr_xgb, thr_xgb] = roc_curve(y_test, xgb_pred_proba)

# Metrics for evaluation
xgb_accuracy_score = accuracy_score(y_test, xgb_pred)
xgb_log_loss = log_loss(y_test,xgb_pred_proba)
xgb_auc = auc(fpr_xgb, tpr_xgb)
xgb_confusion_matrix = confusion_matrix(y_test,xgb_pred )
xgb_classification_report = classification_report(y_test,xgb_pred)

# Results
print('XGBoost Results:\n')
print(" Accuracy Score" ,xgb_accuracy_score*100)
print(" Log_loss", xgb_log_loss )
print(" ROC AUC" ,xgb_auc )
print("\n Confusion Matrix \n\n" ,xgb_confusion_matrix)
print("\n Classification Report \n\n " ,xgb_classification_report)


XGBoost Results:

 Accuracy Score 96.00105235464352
 Log_loss 0.13563570622995474
 ROC AUC 0.8413028381743038

 Confusion Matrix 

 [[14596     1]
 [  607     0]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.96      1.00      0.98     14597
           1       0.00      0.00      0.00       607

    accuracy                           0.96     15204
   macro avg       0.48      0.50      0.49     15204
weighted avg       0.92      0.96      0.94     15204



### Precision and Recall

In [22]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test,xgb_pred)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.96007367 0.        ]
recall: [0.99993149 0.        ]


### Frequency for Gradient Boosting Prediction

In [23]:
unique_elements, counts_elements = np.unique(xgb_pred, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [15203     1]]


### Frequency for test

In [24]:
unique_elements, counts_elements = np.unique(y_test, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [14597   607]]


# SECTION 3 - MACHINE LEARNING - BALANCED DATASET

### Balancing the dataset

In [0]:
df_b = df.sample(frac=1).groupby(df['TARGET'], sort=False).head(3008)

In [26]:
df_b.shape

(6016, 371)

In [27]:
#Checking if the dataset is balanced or unbalanced
BalanceCheck2 = pd.DataFrame(df_b["TARGET"].value_counts())
BalanceCheck2['Percentage'] = 100*BalanceCheck2["TARGET"]/df_b.shape[0]
BalanceCheck2

Unnamed: 0,TARGET,Percentage
1,3008,50.0
0,3008,50.0


### Cleaning the dataset

In [28]:
#Checking the value counts for top 15 values

df_b["var3"].value_counts()[:15]

2     5892
1       16
7       10
0        9
8        9
3        9
12       7
14       6
4        6
9        6
10       5
6        5
11       5
5        4
13       3
Name: var3, dtype: int64

### Define the independent and dependent variables

In [0]:
X_b = df_b.drop(["TARGET","ID"],axis=1)
y_b = df_b["TARGET"]

### Train Test Split

In [30]:
from sklearn.model_selection import train_test_split
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split( X_b, y_b, test_size = 0.2, random_state = 42)
print ('Train set:', X_train_b.shape,  y_train_b.shape)
print ('Test set:', X_test_b.shape,  y_test_b.shape)

Train set: (4812, 369) (4812,)
Test set: (1204, 369) (1204,)


### Scale the data

In [0]:
sdc = StandardScaler()
X_train_b = sdc.fit_transform(X_train_b)
X_test_b = sdc.transform(X_test_b)

### Random Forest Classifier

In [32]:
# Fit and Predict
rf_b = RandomForestClassifier(n_estimators = 400)
rf_b.fit(X_train_b,y_train_b)
rf_pred_b = rf_b.predict(X_test_b)
rf_pred_imb = rf_b.predict(X_test)

# Probability Prediction and ROC parameters
rf_pred_proba_b = rf_b.predict_proba(X_test_b)[:, 1]
[fpr_rf_b, tpr_rf_b, thr_rf_b] = roc_curve(y_test_b, rf_pred_proba_b)

# Probability Prediction and ROC parameters for imbalanced testing
rf_pred_proba_imb = rf_b.predict_proba(X_test)[:, 1]
[fpr_rf_imb, tpr_rf_imb, thr_rf_imb] = roc_curve(y_test, rf_pred_proba_imb)

# Metrics for evaluation
rf_accuracy_score_b = accuracy_score(y_test_b, rf_pred_b)
rf_log_loss_b = log_loss(y_test_b,rf_pred_proba_b)
rf_auc_b = auc(fpr_rf_b, tpr_rf_b)
rf_confusion_matrix_b = confusion_matrix(y_test_b,rf_pred_b )
rf_classification_report_b = classification_report(y_test_b, rf_pred_b)

# Metrics for evaluation for imbalanced testing
rf_accuracy_score_imb = accuracy_score(y_test, rf_pred_imb)
rf_log_loss_imb = log_loss(y_test,rf_pred_proba_imb)
rf_auc_imb = auc(fpr_rf_imb, tpr_rf_imb)
rf_confusion_matrix_imb = confusion_matrix(y_test,rf_pred_imb )
rf_classification_report_imb = classification_report(y_test, rf_pred_imb)

# Results for Balanced Dataset and X_test balanced
print('Random Forest Results:\n')
print(" Accuracy Score" ,rf_accuracy_score_b*100)
print(" Log_loss", rf_log_loss_b )
print(" ROC AUC" ,rf_auc_b )
print("\n Confusion Matrix \n\n" ,rf_confusion_matrix_b)
print("\n Classification Report \n\n " ,rf_classification_report_b)

# Results for prediction on X_test imbalanced
print('Random Forest Results (imbalanced prediction) :\n')
print(" Accuracy Score" ,rf_accuracy_score_imb*100)
print(" Log_loss", rf_log_loss_imb )
print(" ROC AUC" ,rf_auc_imb )
print("\n Confusion Matrix \n\n" ,rf_confusion_matrix_imb)
print("\n Classification Report \n\n " ,rf_classification_report_imb)

Random Forest Results:

 Accuracy Score 73.00664451827242
 Log_loss 0.6721716788410981
 ROC AUC 0.7993944818289301

 Confusion Matrix 

 [[454 168]
 [157 425]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.74      0.73      0.74       622
           1       0.72      0.73      0.72       582

    accuracy                           0.73      1204
   macro avg       0.73      0.73      0.73      1204
weighted avg       0.73      0.73      0.73      1204

Random Forest Results (imbalanced prediction) :

 Accuracy Score 40.19994738226782
 Log_loss 0.8563923045983609
 ROC AUC 0.8042282389951942

 Confusion Matrix 

 [[5536 9061]
 [  31  576]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.99      0.38      0.55     14597
           1       0.06      0.95      0.11       607

    accuracy                           0.40     15204
   macro avg       0.53      0.66      0.33     15

### Precision and Recall for balanced dataset

In [33]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test_b,rf_pred_b)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.74304419 0.71669477]
recall: [0.72990354 0.73024055]


### Frequency for Random Forest balanced dataset Prediction

In [34]:
unique_elements, counts_elements = np.unique(rf_pred_b, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  0   1]
 [611 593]]


### Frequency for test balanced

In [35]:
unique_elements, counts_elements = np.unique(y_test_b, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  0   1]
 [622 582]]


### Precision and Recall for imbalanced dataset

In [36]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test,rf_pred_imb)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.99443147 0.05976964]
recall: [0.37925601 0.94892916]


### Frequency for Random Forest imbalanced dataset Prediction

In [37]:
unique_elements, counts_elements = np.unique(rf_pred_imb, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[   0    1]
 [5567 9637]]


### Frequency for test

In [38]:
unique_elements, counts_elements = np.unique(y_test, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [14597   607]]


### Random Forest Hyperparameter Tuning

### Randomized Search Cross Validation

In [39]:
# Define the parameter grid
param_grid = { 'n_estimators': [100,200,300,400,500,600,700,800,900,1000],'max_features': ['auto', 'sqrt', 'log2']}

forest = RandomizedSearchCV(rf_b, param_distributions = param_grid, n_iter =5, scoring = 'roc_auc', n_jobs = -1, cv = 10, verbose = 3)

forest.fit(X_train_b,y_train_b)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.2min finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=400,
                                                    n_jobs=None,


### Best parameters and best score

In [40]:
print(forest.best_params_)
print(forest.best_score_)

{'n_estimators': 1000, 'max_features': 'sqrt'}
0.7959790211505344


### Retraining the model using the obtained parameters

In [41]:
# Fit and Predict
rf_b = RandomForestClassifier(n_estimators=forest.best_params_["n_estimators"],max_features = forest.best_params_['max_features'])
rf_b.fit(X_train_b,y_train_b)
rf_pred_b = rf_b.predict(X_test_b)
rf_pred_imb = rf_b.predict(X_test)

# Probability Prediction and ROC parameters
rf_pred_proba_b = rf_b.predict_proba(X_test_b)[:, 1]
[fpr_rf_b, tpr_rf_b, thr_rf_b] = roc_curve(y_test_b, rf_pred_proba_b)

# Probability Prediction and ROC parameters for imbalanced testing
rf_pred_proba_imb = rf_b.predict_proba(X_test)[:, 1]
[fpr_rf_imb, tpr_rf_imb, thr_rf_imb] = roc_curve(y_test, rf_pred_proba_imb)

# Metrics for evaluation
rf_accuracy_score_b = accuracy_score(y_test_b, rf_pred_b)
rf_log_loss_b = log_loss(y_test_b,rf_pred_proba_b)
rf_auc_b = auc(fpr_rf_b, tpr_rf_b)
rf_confusion_matrix_b = confusion_matrix(y_test_b,rf_pred_b )
rf_classification_report_b = classification_report(y_test_b, rf_pred_b)

# Metrics for evaluation for imbalanced testing
rf_accuracy_score_imb = accuracy_score(y_test, rf_pred_imb)
rf_log_loss_imb = log_loss(y_test,rf_pred_proba_imb)
rf_auc_imb = auc(fpr_rf_imb, tpr_rf_imb)
rf_confusion_matrix_imb = confusion_matrix(y_test,rf_pred_imb )
rf_classification_report_imb = classification_report(y_test, rf_pred_imb)

# Results for Balanced Dataset and X_test balanced
print('Random Forest Results:\n')
print(" Accuracy Score" ,rf_accuracy_score_b*100)
print(" Log_loss", rf_log_loss_b )
print(" ROC AUC" ,rf_auc_b )
print("\n Confusion Matrix \n\n" ,rf_confusion_matrix_b)
print("\n Classification Report \n\n " ,rf_classification_report_b)

# Results for prediction on X_test imbalanced
print('Random Forest Results (imbalanced prediction) :\n')
print(" Accuracy Score" ,rf_accuracy_score_imb*100)
print(" Log_loss", rf_log_loss_imb )
print(" ROC AUC" ,rf_auc_imb )
print("\n Confusion Matrix \n\n" ,rf_confusion_matrix_imb)
print("\n Classification Report \n\n " ,rf_classification_report_imb)


Random Forest Results:

 Accuracy Score 72.75747508305648
 Log_loss 0.64882573332456
 ROC AUC 0.7986265345134308

 Confusion Matrix 

 [[447 175]
 [153 429]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.74      0.72      0.73       622
           1       0.71      0.74      0.72       582

    accuracy                           0.73      1204
   macro avg       0.73      0.73      0.73      1204
weighted avg       0.73      0.73      0.73      1204

Random Forest Results (imbalanced prediction) :

 Accuracy Score 42.5414364640884
 Log_loss 0.835278958758575
 ROC AUC 0.8035135404478747

 Confusion Matrix 

 [[5893 8704]
 [  32  575]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.99      0.40      0.57     14597
           1       0.06      0.95      0.12       607

    accuracy                           0.43     15204
   macro avg       0.53      0.68      0.35     15204


### Precision and Recall for balanced dataset

In [42]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test_b,rf_pred_b)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.745     0.7102649]
recall: [0.71864952 0.7371134 ]


### Frequency for Random Forest balanced dataset Prediction

In [43]:
unique_elements, counts_elements = np.unique(rf_pred_b, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  0   1]
 [600 604]]


### Frequency for test balanced

In [44]:
unique_elements, counts_elements = np.unique(y_test_b, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  0   1]
 [622 582]]


### Precision and Recall for imbalanced dataset

In [45]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test,rf_pred_imb)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.99459916 0.06196788]
recall: [0.40371309 0.94728171]


### Frequency for Random Forest imbalanced dataset Prediction

In [46]:
unique_elements, counts_elements = np.unique(rf_pred_imb, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[   0    1]
 [5925 9279]]


### Frequency for test

In [47]:
unique_elements, counts_elements = np.unique(y_test, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [14597   607]]


## Gradient Boosting Classifier

In [48]:
# Fit and Predict
xgb_b = XGBClassifier()
xgb_b.fit(X_train_b,y_train_b)
xgb_pred_b = xgb_b.predict(X_test_b)
xgb_pred_imb = xgb_b.predict(X_test)

# Probability Prediction and ROC parameters
xgb_pred_proba_b = xgb_b.predict_proba(X_test_b)[:, 1]
[fpr_xgb_b, tpr_xgb_b, thr_xgb_b] = roc_curve(y_test_b, xgb_pred_proba_b)

# Probability Prediction and ROC parameters for imbalanced testing
xgb_pred_proba_imb = xgb_b.predict_proba(X_test)[:, 1]
[fpr_xgb_imb, tpr_xgb_imb, thr_xgb_imb] = roc_curve(y_test, xgb_pred_proba_imb)

# Metrics for evaluation
xgb_accuracy_score_b = accuracy_score(y_test_b, xgb_pred_b)
xgb_log_loss_b = log_loss(y_test_b,xgb_pred_proba_b)
xgb_auc_b = auc(fpr_xgb_b, tpr_xgb_b)
xgb_confusion_matrix_b = confusion_matrix(y_test_b,xgb_pred_b )
xgb_classification_report_b = classification_report(y_test_b, xgb_pred_b)

# Metrics for evaluation for imbalanced testing
xgb_accuracy_score_imb = accuracy_score(y_test, xgb_pred_imb)
xgb_log_loss_imb = log_loss(y_test,xgb_pred_proba_imb)
xgb_auc_imb = auc(fpr_xgb_imb, tpr_xgb_imb)
xgb_confusion_matrix_imb = confusion_matrix(y_test,xgb_pred_imb )
xgb_classification_report_imb = classification_report(y_test, xgb_pred_imb)

# Results for Balanced Dataset and X_test balanced
print('Gradient Boosting Results:\n')
print(" Accuracy Score" ,xgb_accuracy_score_b*100)
print(" Log_loss", xgb_log_loss_b )
print(" ROC AUC" ,xgb_auc_b )
print("\n Confusion Matrix \n\n" ,xgb_confusion_matrix_b)
print("\n Classification Report \n\n " ,xgb_classification_report_b)

# Results for prediction on X_test imbalanced
print('Gradient Boosting Results (imbalanced prediction) :\n')
print(" Accuracy Score" ,xgb_accuracy_score_imb*100)
print(" Log_loss", xgb_log_loss_imb )
print(" ROC AUC" ,xgb_auc_imb )
print("\n Confusion Matrix \n\n" ,xgb_confusion_matrix_imb)
print("\n Classification Report \n\n " ,xgb_classification_report_imb)

Gradient Boosting Results:

 Accuracy Score 75.24916943521595
 Log_loss 0.5047831352988871
 ROC AUC 0.8340805626457166

 Confusion Matrix 

 [[451 171]
 [127 455]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.78      0.73      0.75       622
           1       0.73      0.78      0.75       582

    accuracy                           0.75      1204
   macro avg       0.75      0.75      0.75      1204
weighted avg       0.75      0.75      0.75      1204

Gradient Boosting Results (imbalanced prediction) :

 Accuracy Score 21.823204419889503
 Log_loss 1.235558713149251
 ROC AUC 0.7884049880936246

 Confusion Matrix 

 [[ 2730 11867]
 [   19   588]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.99      0.19      0.31     14597
           1       0.05      0.97      0.09       607

    accuracy                           0.22     15204
   macro avg       0.52      0.58     

### Precision and Recall for balanced dataset

In [49]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test_b,xgb_pred_b)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.78027682 0.72683706]
recall: [0.72508039 0.78178694]


### Frequency for Gradient Boosting balanced dataset Prediction

In [50]:
unique_elements, counts_elements = np.unique(xgb_pred_b, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  0   1]
 [578 626]]


### Frequency for test balanced

In [51]:
unique_elements, counts_elements = np.unique(y_test, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [14597   607]]


### Precision and Recall for imbalanced dataset

In [52]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test,xgb_pred_imb)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.9930884  0.04720996]
recall: [0.18702473 0.96869852]


### Frequency for Gradient Boosting imbalanced dataset Prediction

In [53]:
unique_elements, counts_elements = np.unique(xgb_pred_imb, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [ 2749 12455]]


### Frequency for test

In [54]:
unique_elements, counts_elements = np.unique(y_test, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [14597   607]]


### Gradient Boosting Hyperparameter Tuning

### Randomized Search Cross Validation

In [55]:
# Define the parameter grid
param_grid = {
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]   
}

boost = RandomizedSearchCV(xgb_b, param_distributions = param_grid, n_iter = 5, scoring = 'roc_auc', n_jobs = -1, cv = 10, verbose = 3)

boost.fit(X_train_b,y_train_b)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.2min finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=...
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid='warn', n_iter=5, n_jobs=-1,
                   param_distribu

### Best parameters and best score

In [56]:
print(boost.best_params_)
print(boost.best_score_)

{'min_child_weight': 3, 'max_depth': 8, 'learning_rate': 0.1, 'gamma': 0.4, 'colsample_bytree': 0.5}
0.8249181719917908


In [57]:
# Fit and Predict
xgb_b = XGBClassifier(min_child_weight = boost.best_params_["min_child_weight"], max_depth = boost.best_params_["max_depth"], learning_rate = boost.best_params_["learning_rate"], gamma = boost.best_params_["gamma"], colsample_bytree = boost.best_params_["colsample_bytree"],n_jobs = -1)
xgb_b.fit(X_train_b,y_train_b)
xgb_pred_b = xgb_b.predict(X_test_b)
xgb_pred_imb = xgb_b.predict(X_test)

# Probability Prediction and ROC parameters
xgb_pred_proba_b = xgb_b.predict_proba(X_test_b)[:, 1]
[fpr_xgb_b, tpr_xgb_b, thr_xgb_b] = roc_curve(y_test_b, xgb_pred_proba_b)

# Probability Prediction and ROC parameters for imbalanced testing
xgb_pred_proba_imb = xgb_b.predict_proba(X_test)[:, 1]
[fpr_xgb_imb, tpr_xgb_imb, thr_xgb_imb] = roc_curve(y_test, xgb_pred_proba_imb)

# Metrics for evaluation
xgb_accuracy_score_b = accuracy_score(y_test_b, xgb_pred_b)
xgb_log_loss_b = log_loss(y_test_b,xgb_pred_proba_b)
xgb_auc_b = auc(fpr_xgb_b, tpr_xgb_b)
xgb_confusion_matrix_b = confusion_matrix(y_test_b,xgb_pred_b )
xgb_classification_report_b = classification_report(y_test_b, xgb_pred_b)

# Metrics for evaluation for imbalanced testing
xgb_accuracy_score_imb = accuracy_score(y_test, xgb_pred_imb)
xgb_log_loss_imb = log_loss(y_test,xgb_pred_proba_imb)
xgb_auc_imb = auc(fpr_xgb_imb, tpr_xgb_imb)
xgb_confusion_matrix_imb = confusion_matrix(y_test,xgb_pred_imb )
xgb_classification_report_imb = classification_report(y_test, xgb_pred_imb)

# Results for Balanced Dataset and X_test balanced
print('Gradient Boosting Results:\n')
print(" Accuracy Score" ,xgb_accuracy_score_b*100)
print(" Log_loss", xgb_log_loss_b )
print(" ROC AUC" ,xgb_auc_b )
print("\n Confusion Matrix \n\n" ,xgb_confusion_matrix_b)
print("\n Classification Report \n\n " ,xgb_classification_report_b)

# Results for prediction on X_test imbalanced
print('Gradient Boosting Results (imbalanced prediction) :\n')
print(" Accuracy Score" ,xgb_accuracy_score_imb*100)
print(" Log_loss", xgb_log_loss_imb )
print(" ROC AUC" ,xgb_auc_imb )
print("\n Confusion Matrix \n\n" ,xgb_confusion_matrix_imb)
print("\n Classification Report \n\n " ,xgb_classification_report_imb)

Gradient Boosting Results:

 Accuracy Score 73.9202657807309
 Log_loss 0.5274807143750665
 ROC AUC 0.8182823946696722

 Confusion Matrix 

 [[440 182]
 [132 450]]

 Classification Report 

                precision    recall  f1-score   support

           0       0.77      0.71      0.74       622
           1       0.71      0.77      0.74       582

    accuracy                           0.74      1204
   macro avg       0.74      0.74      0.74      1204
weighted avg       0.74      0.74      0.74      1204

Gradient Boosting Results (imbalanced prediction) :

 Accuracy Score 38.943699026571956
 Log_loss 0.977826366381715
 ROC AUC 0.8013597386748353

 Confusion Matrix 

 [[5340 9257]
 [  26  581]]

 Classification Report 

                precision    recall  f1-score   support

           0       1.00      0.37      0.53     14597
           1       0.06      0.96      0.11       607

    accuracy                           0.39     15204
   macro avg       0.53      0.66      0.32

### Precision and Recall for balanced dataset

In [58]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test_b,xgb_pred_b)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.76923077 0.71202532]
recall: [0.7073955  0.77319588]


### Frequency for Gradient Boosting balanced dataset Prediction

In [59]:
unique_elements, counts_elements = np.unique(xgb_pred_b, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  0   1]
 [572 632]]


### Frequency for test balanced

In [60]:
unique_elements, counts_elements = np.unique(y_test, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [14597   607]]


### Precision and Recall for imbalanced dataset

In [61]:
from sklearn.metrics import precision_recall_fscore_support as score # 
precision, recall, fscore, support = score(y_test,xgb_pred_imb)  # 
print('precision: {}'.format(precision)) 
print('recall: {}'.format(recall))

precision: [0.99515468 0.05905672]
recall: [0.36582859 0.95716639]


### Frequency for Gradient Boosting imbalanced dataset Prediction

In [62]:
unique_elements, counts_elements = np.unique(xgb_pred_imb, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[   0    1]
 [5366 9838]]


### Frequency for test

In [63]:
unique_elements, counts_elements = np.unique(y_test, return_counts=True) 
print("Frequency of unique values of the said array:") 
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1]
 [14597   607]]
