Data Set Information:
The dataset consists of feature vectors belonging to 12,330 sessions. The dataset was formed so that each session belongs to a different user within a 1-year period, to avoid any bias toward a specific campaign, special day, user profile, or period.

In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative directory that holds dataset.csv.
PATH = 'dataset/'

In [None]:
# Load the raw session data from the CSV file under PATH.
csv_path = PATH + 'dataset.csv'
dataset = pd.read_csv(csv_path)

In [None]:
# Show the first rows of the loaded frame as a quick sanity check.
dataset.head()

In [None]:
# Print the column names, dtypes and non-null counts.
dataset.info()

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

In [None]:
# Peek at the rows that contain at least one missing value.
has_missing = dataset.isna().any(axis=1)
dataset.loc[has_missing].head()

In [None]:
# Replace every missing value with 0, then confirm that none are left.
dataset_no_na = dataset.fillna(value=0)
dataset_no_na.isnull().sum()

In [None]:
# Class balance of the target. The column is passed as ``x`` by keyword:
# recent seaborn releases accept only ``data`` positionally, so a positional
# Series would no longer be interpreted as the variable to count.
sns.countplot(x=dataset_no_na['Revenue'])

In [None]:
# Number of rows for each distinct Revenue value.
dataset_no_na['Revenue'].value_counts()

In [None]:
def plot_countplot(features, data, univariate=True, target=None):
    """Draw one count plot per feature on a grid of subplots.

    The grid is three columns wide and at least three rows high (a 20x20 inch
    figure); extra rows are added when more than nine features are given.

    Args:
        features: Iterable of column names in ``data`` to plot.
        data: DataFrame holding the feature columns (and ``target`` when
            ``univariate`` is False).
        univariate: When True, plot the counts of each feature on its own.
            When False, split every bar by the values of ``target``.
        target: Column name in ``data`` used as the hue when ``univariate``
            is False; ignored otherwise.
    """
    features = list(features)
    ncols = 3
    # Ceiling division; never fewer than the three rows of the default grid.
    nrows = max(3, (len(features) + ncols - 1) // ncols)
    _, axes = plt.subplots(nrows=nrows, ncols=ncols,
                           figsize=(20, 20 * nrows / 3))

    # Take the hue column from the frame that was passed in, not from a
    # notebook-level global, so the function works on any DataFrame.
    hue = None if univariate else data[target]

    for axis, feature in zip(axes.flat, features):
        # Keyword arguments keep the call valid on seaborn versions where
        # only ``data`` may be passed positionally.
        sns.countplot(x=data[feature], hue=hue, ax=axis)

In [None]:
# Columns from position 9 up to, but not including, the last one; these are
# the features handed to plot_countplot.
# NOTE(review): presumably the categorical features, with the last column
# being the Revenue target -- confirm against dataset.head().
cat = dataset_no_na.columns[9:-1]

In [None]:
# Count plot of every feature in `cat` on its own (univariate is the default).
plot_countplot(features=cat, data=dataset_no_na)

In [None]:
# Same count plots, with every bar split by the Revenue value.
plot_countplot(
    features=cat,
    data=dataset_no_na,
    univariate=False,
    target='Revenue',
)

In [None]:
def plot_distplot(features, data, univariate=True, target=None):
    """Plot a kernel density estimate per feature on a 3x3 subplot grid.

    Args:
        features: Iterable of column names in ``data``; each one is drawn in
            the next cell of the 3x3 grid and used as that cell's title.
        data: DataFrame holding the feature columns (and ``target`` when
            ``univariate`` is False).
        univariate: When True, draw a single density curve per feature.
            When False, draw one curve per group of ``target``, with a legend.
        target: Column name in ``data`` to group by when ``univariate`` is
            False; ignored otherwise.
    """
    plt.subplots(nrows=3, ncols=3, figsize=(20, 20))

    for position, feature in enumerate(features, start=1):
        # Make the matching grid cell current; pandas then draws into it.
        plt.subplot(3, 3, position)
        plt.title(feature)
        if univariate:
            data[feature].plot(kind='kde')
        else:
            data.groupby(target)[feature].plot(kind='kde', legend=True)

In [None]:
# The first nine columns; these are the features handed to plot_distplot.
# NOTE(review): presumably the continuous/numeric features -- confirm against
# dataset.info().
cont = dataset_no_na.columns[:9]

In [None]:
# Density curve of every feature in `cont` on its own (univariate is the default).
plot_distplot(features=cont, data=dataset_no_na)

In [None]:
# Same density plots, with one curve per Revenue value.
plot_distplot(
    features=cont,
    data=dataset_no_na,
    univariate=False,
    target='Revenue',
)

In [None]:
# Pairwise correlation of the first nine columns, annotated with the values.
corr_matrix = dataset_no_na.iloc[:, :9].corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Columns left out of the modelling frame.
removed_features = [
    'Administrative_Duration',
    'Informational_Duration',
    'ProductRelated_Duration',
    'BounceRates',
]
filtered_dataset = dataset_no_na.drop(columns=removed_features)

In [None]:
# One-hot encode every column from position 5 up to (not including) the last
# one, then list the resulting columns.
columns_to_encode = filtered_dataset.columns[5:-1]
dumm_dataset = pd.get_dummies(filtered_dataset, columns=columns_to_encode)
dumm_dataset.info()

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np

In [None]:
# Keep 90% of the rows for training/cross-validation and hold out 10% for the
# final evaluation; a test_size of 0.9 would leave only a tenth of the data to
# learn from. Stratify on Revenue so both parts keep the same class balance,
# and fix the seed so the split is reproducible across notebook runs.
x_train_init, x_test_init, y_train, y_test = train_test_split(
    dumm_dataset.drop('Revenue', axis=1),
    dumm_dataset[['Revenue']],
    test_size=0.1,
    stratify=dumm_dataset['Revenue'],
    random_state=42,
)

In [None]:
# Min-max scaler with default settings (each feature mapped to [0, 1]).
scaler = MinMaxScaler()

In [None]:
scaled_train_data = scaler.fit_transform(x_train_init.iloc[:,:5])
x_train = np.concatenate([scaled_train_data, x_train_init.iloc[:,6:].to_numpy()], axis=1)

In [None]:
scaled_test_data = scaler.transform(x_test_init.iloc[:,:5])
x_test = np.concatenate([scaled_test_data, x_test_init.iloc[:,6:].to_numpy()], axis=1)

In [None]:
# Candidate classifiers, all with their default hyper-parameters.
clfs = [
    LogisticRegression(),
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    GaussianNB(),
    SVC(),
]

In [None]:
# 10-fold splitter; KFold keeps the row order (no shuffling) by default.
kfold = KFold(n_splits=10)

In [None]:
from sklearn.base import clone

# Cross-validate every candidate on the training split.
# Result: {classifier name: [(fitted estimator, validation accuracy), ...]}
# with one entry per fold.
models = {}
for clf in clfs:
    fold_results = []
    for train, test in kfold.split(X=x_train):
        # fit() returns the estimator itself, so fitting `clf` directly would
        # make every fold entry point at one object trained on the last fold.
        # A fresh clone per fold keeps each stored model tied to its score,
        # and each fold is fitted only once.
        fold_clf = clone(clf).fit(x_train[train, :], y_train.iloc[train, 0])
        fold_score = fold_clf.score(x_train[test, :], y_train.iloc[test, 0])
        fold_results.append((fold_clf, fold_score))
    models[type(clf).__name__] = fold_results

In [None]:
# For every classifier keep the (estimator, score) pair of its best-scoring fold.
top_folds = [
    (clf_name, max(fold_results, key=lambda fold: fold[1]))
    for clf_name, fold_results in models.items()
]

In [None]:
# Entry with the highest fold score; each entry is (name, (estimator, score)).
best_classifier = max(top_folds, key=lambda entry: entry[1][1])

In [None]:
# Display the selected (name, (estimator, score)) entry.
best_classifier

In [None]:
# Score of the selected estimator on the held-out test split.
best_estimator = best_classifier[1][0]
best_estimator.score(x_test, y_test)

In [None]:
# One confusion-matrix heatmap per classifier, computed on the test split.
for name, best_fold in top_folds:
    y_pred = best_fold[0].predict(x_test)
    plt.figure()
    plt.title(name)
    # fmt='d' prints the integer counts in full; the default annotation
    # format ('.2g') would show larger counts in scientific notation.
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d')

In [None]:
# Print a classification report on the test split for every classifier.
for clf_name, best_fold in top_folds:
    estimator = best_fold[0]
    y_pred = estimator.predict(x_test)
    print(clf_name)
    print(classification_report(y_test, y_pred))