In [1]:
# pip install scikit-learn
# !pip install imblearn

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [5]:
target = ["Revenue"]

In [6]:
#from google.colab import files


#uploaded = files.upload()


In [7]:
# Load the data
file_path = Path('Resources/online_shoppers_intention.csv')
shoppers_df = pd.read_csv(file_path)

# Changed Revenue column to dtype str
shoppers_df['Revenue'] = shoppers_df['Revenue'].astype('str')

# Replace True and False in Column Revenue by Sale or No Sale for better readability
shoppers_df['Revenue'] = shoppers_df['Revenue'].replace(['True'],'Sale')
shoppers_df['Revenue'] = shoppers_df['Revenue'].replace(['False'],'No Sale')

shoppers_df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,No Sale
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,No Sale
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,No Sale
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,No Sale
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,No Sale


# Split the Data into Training and Testing

In [8]:
# Create our features
X = shoppers_df.drop("Revenue", axis=1)
X = pd.get_dummies(X)

# Create our target
y = shoppers_df.loc[:, target].copy()

In [9]:
X.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,Month_Jul,Month_June,Month_Mar,Month_May,Month_Nov,Month_Oct,Month_Sep,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,...,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,...,0.035036,0.023358,0.154663,0.27283,0.243147,0.044526,0.036334,0.137388,0.006894,0.855718
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,...,0.18388,0.151043,0.361598,0.445432,0.429,0.206268,0.187128,0.344271,0.082745,0.35139
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Check the balance of our target values
y['Revenue'].value_counts()

No Sale    10422
Sale        1908
Name: Revenue, dtype: int64

In [11]:
# Normal train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(Counter(y_train['Revenue']))
print(Counter(y_test['Revenue']))

Counter({'No Sale': 7786, 'Sale': 1461})
Counter({'No Sale': 2636, 'Sale': 447})


# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [12]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
BRFC_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
BRFC_model.fit(X_train, y_train)
print(Counter(y_train['Revenue']))

Counter({'No Sale': 7786, 'Sale': 1461})


In [13]:
# Calculate predictions
y_pred = BRFC_model.predict(X_test)

In [14]:
# Calculated the balanced accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
balanced_accuracy_score(y_test,y_pred)

0.8583021016861695

In [15]:
# Display the confusion matrix
cmatrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cmatrix_df = pd.DataFrame(
    cmatrix, index=["Actual No Sale", "Actual Sale"], columns=["Predicted No Sale", "Predicted Sale"])

cmatrix_df

Unnamed: 0,Predicted No Sale,Predicted Sale
Actual No Sale,2231,405
Actual Sale,58,389


In [16]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

    No Sale       0.97      0.85      0.87      0.91      0.86      0.73      2636
       Sale       0.49      0.87      0.85      0.63      0.86      0.74       447

avg / total       0.90      0.85      0.87      0.87      0.86      0.74      3083



In [17]:
# This will show the imbalanced classification report as a Pandas Dataframe for better readability
# Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format'''
from sklearn import metrics
report = metrics.classification_report(y_test, y_pred, output_dict=True)
df_classification_report_imbalanced = pd.DataFrame(report).transpose()
df_classification_report_imbalanced = df_classification_report_imbalanced.sort_values(by=['f1-score'], ascending=False)

df_classification_report_imbalanced


Unnamed: 0,precision,recall,f1-score,support
No Sale,0.974661,0.846358,0.90599,2636.0
weighted avg,0.90438,0.849822,0.865527,3083.0
accuracy,0.849822,0.849822,0.849822,0.849822
macro avg,0.732293,0.858302,0.766452,3083.0
Sale,0.489924,0.870246,0.626914,447.0


In [18]:
# List the features sorted in descending order by feature importance
# First, set option to print entire DataFrame and not show "..." in middle to truncate:
pd.set_option("display.max_rows", None, "display.max_columns", None)

importances = pd.DataFrame(BRFC_model.feature_importances_, index = X_train.columns, columns=['Importance']).sort_values('Importance', ascending=False)
importances.loc[:,:]

Unnamed: 0,Importance
PageValues,0.381251
ExitRates,0.090502
ProductRelated_Duration,0.083199
ProductRelated,0.068103
BounceRates,0.05258
Administrative_Duration,0.051812
Administrative,0.038379
Month_Nov,0.030113
TrafficType,0.028559
Region,0.026794


### Easy Ensemble AdaBoost Classifier

In [19]:
# There was an error in previous runs that said: AttributeError: 'BalancedBaggingClassifier' object has no attribute 'n_features_in_'
# It seems that the attribute n_features_in_ is deprecated and its support was lost in sklearn version 1.2
# We need to downgrade the scikit-learn package using: pip install scikit-learn==1.0 -U to solve the issue
# Source of the fix: https://github.com/scikit-learn-contrib/imbalanced-learn/issues/872
# Direct link to the fix comment: https://github.com/scikit-learn-contrib/imbalanced-learn/issues/872#:~:text=You%20have%20to%20downgrade%20the%20scikit%2Dlearn%20package%20using%3A%20pip3%20install%20scikit%2Dlearn%3D%3D1.0%20%2DU%0AThe%20attribute%20n_features_in_%20is%20deprecated%20and%20its%20support%20was%20lost%20in%20sklearn%20version%201.2


In [20]:
# pip install scikit-learn==1.0 -U

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [22]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

In [23]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)  

0.8448646613222693

In [24]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = eec.predict(X_test)
cmatrix2 = confusion_matrix(y_test, y_pred)

cmatrix2

array([[3005,  431],
       [ 117,  516]], dtype=int64)

In [25]:
# This will show the confusion matrix as a Pandas Dataframe for better readability
cmatrix_df2 = pd.DataFrame(
    cmatrix2, index=["Actual No Sale", "Actual Sale"], columns=["Predicted No Sale", "Predicted Sale"])

cmatrix_df2

Unnamed: 0,Predicted No Sale,Predicted Sale
Actual No Sale,3005,431
Actual Sale,117,516


In [26]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

    No Sale       0.96      0.87      0.82      0.92      0.84      0.72      3436
       Sale       0.54      0.82      0.87      0.65      0.84      0.71       633

avg / total       0.90      0.87      0.82      0.88      0.84      0.72      4069



In [27]:
# This will show the classification report as a Pandas Dataframe for better readability
# Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format'''
from sklearn import metrics
report2 = metrics.classification_report(y_test, y_pred, output_dict=True)
df_classification_report2 = pd.DataFrame(report2).transpose()
df_classification_report2 = df_classification_report2.sort_values(by=['f1-score'], ascending=False)

df_classification_report2


Unnamed: 0,precision,recall,f1-score,support
No Sale,0.962524,0.874563,0.916438,3436.0
weighted avg,0.897552,0.865323,0.875481,4069.0
accuracy,0.865323,0.865323,0.865323,0.865323
macro avg,0.753701,0.844865,0.784801,4069.0
Sale,0.544879,0.815166,0.653165,633.0
