In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [38]:
dataset = pd.read_csv("online_shoppers_intention.csv")
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [39]:
dataset.isnull().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [40]:
dataset.duplicated().sum()

125

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [10,15,16])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [42]:
pd.set_option('display.max_columns', None)
pd.DataFrame(X).tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
12325,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,4,6,1,1
12326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0.0,0,0.0,5,465.75,0.0,0.021333,0.0,0.0,3,2,1,8
12327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0.0,0,0.0,6,184.25,0.083333,0.086667,0.0,0.0,3,2,1,13
12328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,4,75.0,0,0.0,15,346.0,0.0,0.021053,0.0,0.0,2,2,3,11
12329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,0.0,0,0.0,3,21.25,0.0,0.066667,0.0,0.0,3,2,1,2


In [43]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [44]:
pd.DataFrame(y).head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1,stratify=y)

In [10]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "solver": ["liblinear"],
        "penalty": ["l1", "l2"],
        "C": [0.01, 0.1, 1, 10]
    },
    {
        "solver": ["saga"],
        "penalty": ["l1", "l2"],
        "C": [0.01, 0.1, 1, 10]
    }
]

grid_logistic = GridSearchCV(
    LogisticRegression(max_iter=500, random_state=0),
    param_grid,
    scoring="f1",
    cv=10,
    n_jobs=-1
)

grid_logistic.fit(X_train, y_train)

print("Best Params:", grid_logistic.best_params_)
print("Best CV F1 Score:", grid_logistic.best_score_)


Best Params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV F1 Score: 0.8298380606990694


In [30]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score,average_precision_score
y_pred = grid_logistic.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

y_pred = grid_logistic.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

[[1801  283]
 [  92  290]]
              precision    recall  f1-score   support

           0       0.95      0.86      0.91      2084
           1       0.51      0.76      0.61       382

    accuracy                           0.85      2466
   macro avg       0.73      0.81      0.76      2466
weighted avg       0.88      0.85      0.86      2466

ROC-AUC Score: 0.9013764809920513
PR-AUC Score: 0.6568264149125179


## K Nearest Neighbors

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

params = {
    "n_neighbors": range(1, 21),
    "weights": ["uniform", "distance"],
    "p": [1, 2]
}

grid_knn = GridSearchCV(
    KNeighborsClassifier(metric="minkowski"),
    params,
    scoring="average_precision",
    cv=10,
    n_jobs=-1
)

grid_knn.fit(X_train, y_train)

print("Best Params:", grid_knn.best_params_)
print("Best CV PR Score:", grid_knn.best_score_)

Best Params: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best CV PR Score: 0.9785180241731488


In [15]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score,average_precision_score
y_pred = grid_knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

y_pred = grid_knn.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

[[1962  122]
 [ 218  164]]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      2084
           1       0.57      0.43      0.49       382

    accuracy                           0.86      2466
   macro avg       0.74      0.69      0.71      2466
weighted avg       0.85      0.86      0.85      2466

ROC-AUC Score: 0.8039048446905368
PR-AUC Score: 0.518999675548258


## Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params = {
    "criterion" : ["entropy","log_loss"],
    "max_depth": range(1, 21),
}

grid_tree = GridSearchCV(
    DecisionTreeClassifier(),
    params,
    scoring="average_precision",
    cv=10,
    n_jobs=-1
)

grid_tree.fit(X_train, y_train)

print("Best Params:", grid_tree.best_params_)
print("Best CV Precision Score:", grid_tree.best_score_)

Best Params: {'criterion': 'log_loss', 'max_depth': 7}
Best CV Precision Score: 0.9679355291170586


In [27]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score,average_precision_score
y_pred = grid_tree.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

y_pred = grid_tree.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

[[1916  168]
 [ 114  268]]
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2084
           1       0.61      0.70      0.66       382

    accuracy                           0.89      2466
   macro avg       0.78      0.81      0.79      2466
weighted avg       0.89      0.89      0.89      2466

ROC-AUC Score: 0.9207964446141632
PR-AUC Score: 0.6809675107832998


## Random Forest

In [31]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

params = {
    "n_estimators": randint(150, 500),
    "max_depth": [None, 8, 10, 12, 15],
    "min_samples_split": randint(2, 30),
    "min_samples_leaf": randint(1, 10),
    "max_features": ["sqrt", "log2"],
    "class_weight": ["balanced"]
}

random_tree = RandomizedSearchCV(
    RandomForestClassifier(),
    params,
    n_iter=40,
    scoring="f1",
    cv=10,
    n_jobs=-1
)

random_tree.fit(X_train, y_train)

print("Best Params:", random_tree.best_params_)
print("Best PR-AUC:", random_tree.best_score_)

Best Params: {'class_weight': 'balanced', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 499}
Best PR-AUC: 0.9312732565413304


In [32]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score,average_precision_score
y_pred = random_tree.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

y_pred = random_tree.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

[[1969  115]
 [ 117  265]]
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      2084
           1       0.70      0.69      0.70       382

    accuracy                           0.91      2466
   macro avg       0.82      0.82      0.82      2466
weighted avg       0.91      0.91      0.91      2466

ROC-AUC Score: 0.9307940830661937
PR-AUC Score: 0.7454798612842854


## XGBoost

In [35]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=(len(y_train[y_train==0]) / len(y_train[y_train==1])),
    eval_metric="aucpr",
    random_state=0
)

xgb.fit(X_train, y_train)

In [36]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score,average_precision_score
y_pred = xgb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

y_pred = xgb.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

[[1991   93]
 [ 137  245]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2084
           1       0.72      0.64      0.68       382

    accuracy                           0.91      2466
   macro avg       0.83      0.80      0.81      2466
weighted avg       0.90      0.91      0.90      2466

ROC-AUC Score: 0.9326519178784254
PR-AUC Score: 0.7667099047052607
