# **Santander Bank Customer Satisfaction - Classification**


In [None]:
!pip install feature_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature_engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[K     |████████████████████████████████| 260 kB 14.9 MB/s 
Collecting statsmodels>=0.11.1
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 42.6 MB/s 
Installing collected packages: statsmodels, feature-engine
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.10.2
    Uninstalling statsmodels-0.10.2:
      Successfully uninstalled statsmodels-0.10.2
Successfully installed feature-engine-1.3.0 statsmodels-0.13.2


In [None]:
# Import the libraries, please only use the libraries imported below
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score,classification_report, roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
import pickle
import joblib
from sklearn.inspection import permutation_importance
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures
import warnings
warnings.filterwarnings('ignore')

# **1. Download the Dataset from Shared Google Drive ID's**
1.1 The Google Drive ID for the train data is given below. Execute the following command to download the data from Google Drive.


```
!gdown 1_NPqplpJLHl28cbAhuL2k6jJObJF9wwU
```
1.2 To download Test Data execute the following command


```
!gdown 1LdvVrsrWLNO-UAJf0Z9S-SxnV1nozcsu
```

**Important:** The **!** prefix tells the notebook to run the rest of the line as a shell (Linux) command. Omit the **!** when running these commands in a terminal on your own system.

In [None]:
# Download the training data from Google Drive by file ID (the output shows it is saved as train.csv).
!gdown 1_NPqplpJLHl28cbAhuL2k6jJObJF9wwU

Downloading...
From: https://drive.google.com/uc?id=1_NPqplpJLHl28cbAhuL2k6jJObJF9wwU
To: /content/train.csv
100% 59.4M/59.4M [00:00<00:00, 280MB/s]


In [None]:
# Download the test data from Google Drive by file ID (the output shows it is saved as test.csv).
!gdown 1LdvVrsrWLNO-UAJf0Z9S-SxnV1nozcsu

Downloading...
From: https://drive.google.com/uc?id=1LdvVrsrWLNO-UAJf0Z9S-SxnV1nozcsu
To: /content/test.csv
  0% 0.00/59.1M [00:00<?, ?B/s] 42% 24.6M/59.1M [00:00<00:00, 245MB/s]100% 59.1M/59.1M [00:00<00:00, 322MB/s]


# **2. Load and Preprocess the Data**

Load the train and test datasets, then preprocess the train data by removing features that:

> 1. have zero variance

> 2. are duplicated columns

> 3. are very sparse






In [None]:
# Read the downloaded training CSV into a DataFrame and preview its first two rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=2)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0


In [None]:
from sklearn.preprocessing import StandardScaler

# Separate the label from the predictors.
y = df['TARGET']
x = df.drop(columns='TARGET')

# Row identifiers carry no predictive signal, so drop them when present to keep the
# models from fitting on them. 'Unnamed: 0' is the leading column in the preview
# above; errors='ignore' makes this a no-op if either column is absent.
x = x.drop(columns=['ID', 'Unnamed: 0'], errors='ignore')

# Standardise every feature but keep a DataFrame with the original column names and
# index: StandardScaler returns a bare NumPy array, which discards the names and is
# not accepted by older feature_engine releases.
# NOTE(review): the scaler is fit on the full dataset before the train/validation
# split, so validation statistics leak into training. Consider fitting it on the
# training rows only.
scaler = StandardScaler()
x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)

In [None]:
# Pass 1: remove (quasi-)constant columns using a 0.998 tolerance.
const = DropConstantFeatures(tol=0.998)
x = const.fit(x).transform(x)

# Pass 2: remove columns that duplicate another column.
Dup = DropDuplicateFeatures()
x = Dup.fit(x).transform(x)

# Pass 3: remove the remaining quasi-constant columns using a stricter 0.99 tolerance.
q_const = DropConstantFeatures(tol=0.99)
x = q_const.fit(x).transform(x)

In [None]:
# Dimensions (rows, columns) of the feature data after the selection steps.
x.shape

(76020, 143)

In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    stratify=y,
    random_state=41,
)

# **3. Training the Different Models**

Train different models with the following configurations and try to achieve maximum recall score or balanced precision/recall scores.

# 3.1 Simple Logistic Regression
Train simple logistic regression model and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.

In [None]:
# Baseline: logistic regression with scikit-learn defaults, scored on the validation split.
logreg = LogisticRegression().fit(x_train, y_train)
y_pred = logreg.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy of the model is : {acc}')

Accuracy of the model is : 0.9600105235464351


In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test, y_pred))

# ROC-AUC needs ranking scores rather than hard labels, so use the predicted
# probability of the positive class (column 1 of predict_proba).
logreg_auc = roc_auc_score(y_test, logreg.predict_proba(x_test)[:, 1])
print(f'ROC-AUC of the model is : {logreg_auc:.4f}')

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14602
           1       0.25      0.00      0.01       602

    accuracy                           0.96     15204
   macro avg       0.61      0.50      0.49     15204
weighted avg       0.93      0.96      0.94     15204



# 3.2 Train Vanilla kNN
Train simple kNN model and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.

In [None]:
# Vanilla k-nearest-neighbours classifier (library default k), scored on the validation split.
model = KNeighborsClassifier().fit(x_train, y_train)
y_pred = model.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy of the model is :{acc}')

Accuracy of the model is :0.9579058142594055


In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test, y_pred))

# ROC-AUC needs ranking scores rather than hard labels, so use the predicted
# probability of the positive class (column 1 of predict_proba).
knn_auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
print(f'ROC-AUC of the model is : {knn_auc:.4f}')

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14602
           1       0.24      0.03      0.05       602

    accuracy                           0.96     15204
   macro avg       0.60      0.51      0.51     15204
weighted avg       0.93      0.96      0.94     15204



# 3.3 Train kNN for k=2 and k=4 and evaluate it
Train kNN model with k values above and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.

In [None]:
# kNN with k=2, scored on the validation split.
# (The unused `from sklearn import neighbors` import was removed; KNeighborsClassifier
# is already imported at the top of the notebook.)
model = KNeighborsClassifier(n_neighbors=2)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model is : {acc}')

Accuracy of the model is : 0.9572480926072087


In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test, y_pred))

# ROC-AUC needs ranking scores rather than hard labels, so use the predicted
# probability of the positive class (column 1 of predict_proba).
knn_k2_auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
print(f'ROC-AUC of the model is : {knn_k2_auc:.4f}')


              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14602
           1       0.14      0.01      0.03       602

    accuracy                           0.96     15204
   macro avg       0.55      0.51      0.50     15204
weighted avg       0.93      0.96      0.94     15204



In [None]:
# kNN with k=4, scored on the validation split.
# (The unused `from sklearn import neighbors` import was removed; KNeighborsClassifier
# is already imported at the top of the notebook.)
model = KNeighborsClassifier(n_neighbors=4)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model is : {acc}')

Accuracy of the model is : 0.9594843462246777


In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test, y_pred))

# ROC-AUC needs ranking scores rather than hard labels, so use the predicted
# probability of the positive class (column 1 of predict_proba).
knn_k4_auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
print(f'ROC-AUC of the model is : {knn_k4_auc:.4f}')

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14602
           1       0.29      0.02      0.03       602

    accuracy                           0.96     15204
   macro avg       0.63      0.51      0.51     15204
weighted avg       0.93      0.96      0.94     15204



# 3.4 Training a Tuned Logistic Regression Model with Upsampling using SMOTE
Train tuned Logistic Regression model with following configurations and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.



```
logreg_tuned_upsampled = LogisticRegression(max_iter=50000,class_weight='balanced',C=100,
                                         fit_intercept=True, penalty='l2',solver='newton-cg')
```

SMOTE stands for Synthetic Minority Oversampling Technique.

SMOTE first selects a minority class instance a at random and finds its k nearest minority class neighbors. The synthetic instance is then created by choosing one of the k nearest neighbors b at random and connecting a and b to form a line segment in the feature space. The synthetic instances are generated as a convex combination of the two chosen instances a and b

Credits : https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

Important: Please upsample the training data before training this model.


In [None]:
# Your code here
from sklearn import metrics  # kept here because following cells may refer to `metrics.*`

# Oversample the minority class in the training split only; the validation split is
# left untouched so the reported metrics reflect the real class ratio.
sm = SMOTE(random_state=1)
x_res, y_res = sm.fit_resample(x_train, y_train)

# Configuration prescribed in the task description. max_iter is only an upper bound on
# solver iterations, so the prescribed 50000 guards against stopping before convergence
# (convergence warnings are silenced globally in the import cell).
itu = LogisticRegression(max_iter=50000, class_weight='balanced', C=100,
                         fit_intercept=True, penalty='l2', solver='newton-cg')

itu.fit(x_res, y_res)
y_pred = itu.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model is : {acc}')


In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test, y_pred))

# The ROC curve and its AUC must be built from ranking scores. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted probability of
# the positive class (column 1 of predict_proba).
y_score = itu.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
auc = round(roc_auc_score(y_test, y_score), 4)
print(f'ROC-AUC of the model is : {auc}')

# 3.5 Training a Tuned Logistic Regression Model with Class Weights
Train tuned Logistic Regression model with following configurations and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.



```
log_tuned_class_weight = LogisticRegression(max_iter=50000,class_weight={0:1,1:25},C=100,fit_intercept=True,
                               penalty='l2',solver='newton-cg')
```



In [None]:
# Your code here
# This variant handles the class imbalance through class weights alone, so it is
# trained on the original training split. SMOTE already equalises the classes, and
# stacking a 1:25 weight on top of it would over-correct. The split made earlier is
# reused (no re-split) so every model is scored on the same validation rows.
# max_iter follows the configuration prescribed in the task description.
log_tuned_class_weight = LogisticRegression(max_iter=50000, class_weight={0: 1, 1: 25}, C=100,
                                            fit_intercept=True, penalty='l2', solver='newton-cg')
log_tuned_class_weight.fit(x_train, y_train)
y_pred = log_tuned_class_weight.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model is : {acc}')

In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test, y_pred))

# The ROC curve and its AUC must be built from ranking scores. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted probability of
# the positive class (column 1 of predict_proba).
y_score = log_tuned_class_weight.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
auc = round(roc_auc_score(y_test, y_score), 4)
print(f'ROC-AUC of the model is : {auc}')

# 3.6 Training a Tuned Random Forest Model with Class Weights
Train Random Forest Classifier model with following configurations and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.


```
rfc_tuned = RandomForestClassifier(random_state=1, n_jobs=-1, class_weight={0:1, 1:25}, criterion='gini', max_depth= 6, min_samples_split= 12, n_estimators= 400, warm_start=True)

```



In [None]:
# Random forest with the prescribed hyper-parameters. The imbalance is handled by the
# 1:25 class weight alone, so the model is fit on the original training split. SMOTE
# already equalises the classes, and combining it with the weight would over-correct.
# The split in scope is reused (no re-split) so the validation rows do not change
# between models.
rfc_tuned = RandomForestClassifier(random_state=1, n_jobs=-1, class_weight={0: 1, 1: 25},
                                   criterion='gini', max_depth=6, min_samples_split=12,
                                   n_estimators=400, warm_start=True)

rfc_tuned.fit(x_train, y_train)
y_pred = rfc_tuned.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model is : {acc}')

In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test, y_pred))

# The ROC curve and its AUC must be built from ranking scores. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted probability of
# the positive class (column 1 of predict_proba).
y_score = rfc_tuned.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
auc = round(roc_auc_score(y_test, y_score), 4)
print(f'ROC-AUC of the model is : {auc}')

# 3.7 Training a Tuned XGBoost Classifier Model with Class Weights
Train XGBoost Classifier model with following configurations and evaluate it on the validation set and print precision, recall, classification_report and ROC_AUC at the end.



```
# Lets train the XGBoost with best hyper parameters
# based on scoring='Recall'
# {'booster': 'dart', 'eta': 0.01, 'max_depth': 2, 'n_estimators': 150}
# These are the best parameters we got
```



```
xgb_tuned = XGBClassifier(scale_pos_weight = 25, eval_metric = 'logloss', seed =0, 
               objective='binary:logistic', 
              nthreads=-1, early_stopping_rounds=15, booster='dart', scoring='Recall',
              eta=0.01, max_depth=2, n_estimators=150)
```



In [None]:
# Your code here
# XGBoost with the tuned hyper-parameters from the task description, with the
# constructor arguments corrected:
#   * `nthreads` is not an XGBClassifier parameter; the scikit-learn wrapper uses `n_jobs`.
#   * `scoring` belongs to scikit-learn's search utilities, not to the estimator, so it is dropped.
#   * `early_stopping_rounds` needs a validation set passed to fit(); none is supplied
#     here (recent xgboost releases raise in that case), so it is dropped.
#   * `seed` / `eta` are spelled with their wrapper names `random_state` / `learning_rate`.
# The imbalance is handled by scale_pos_weight alone, so the model is fit on the
# original training split. SMOTE already equalises the classes, and combining the two
# would over-correct. The split in scope is reused (no re-split) so the validation rows
# do not change between models.
xgb_tuned = XGBClassifier(scale_pos_weight=25, eval_metric='logloss', random_state=0,
                          objective='binary:logistic', n_jobs=-1, booster='dart',
                          learning_rate=0.01, max_depth=2, n_estimators=150)
xgb_tuned.fit(x_train, y_train)
y_pred = xgb_tuned.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy of the model is : {acc}')

In [None]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_test, y_pred))

# The ROC curve and its AUC must be built from ranking scores. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted probability of
# the positive class (column 1 of predict_proba).
y_score = xgb_tuned.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
auc = round(roc_auc_score(y_test, y_score), 4)
print(f'ROC-AUC of the model is : {auc}')

# **4. Plot the ROC-AUC Curves of all the models**

In [None]:
# Your code here



**Conclusion**

Conclusion in your own words here.