In [None]:
# Import the libraries used in this notebook: numpy and pandas for numerical work and data manipulation, scikit-learn and imbalanced-learn for preprocessing, modelling and evaluation, Keras for the neural network, and matplotlib/seaborn for data visualization.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import classification_report

In [None]:
# Read the raw CSV once; `data` stays untouched and all later work happens on the copy `df`.
data = pd.read_csv(filepath_or_buffer="Resources/online_shoppers_intention.csv")
df = data.copy(deep=True)
# Show the dtype of every column so the non-numeric ones can be spotted.
print(df.dtypes)

In [None]:
# Preview the first five rows of the working copy
df.head(n=5)


Number of sessions with Revenue = True (the customer ended up purchasing) and Revenue = False (the customer did not purchase), shown as a bar chart.

In [None]:
# Count the sessions in each Revenue class. The column is passed by keyword
# because seaborn >= 0.12 reserves the first positional argument for `data`.
sns.countplot(x='Revenue', data=df)

In [None]:
# Columns that are not numeric and have to be encoded before modelling.
non_numeric_columns = ('Month', 'VisitorType', 'Weekend', 'Revenue')

# A single OrdinalEncoder instance is reused: it is re-fitted on each column in
# turn, so after the loop it only holds the categories of the last column.
encoder = OrdinalEncoder()

for col in non_numeric_columns:
    # Overwrite the column in place with its ordinal codes.
    df[col] = encoder.fit_transform(df[[col]])

In [None]:
# 'Revenue' is the target; every other column is a feature.
y_data = df['Revenue']
X_data = df.drop(columns='Revenue')

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Forest used only to rank features by importance. The seed is fixed so the
# ranking (and the top-10 selection derived from it) is the same on every run.
rf = RandomForestRegressor(n_estimators=150, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Column labels of the feature matrix the forest was trained on.
# (`X` does not exist at notebook level; the feature frame is `X_data`.)
labels = X_data.columns

# Positions of the features ordered by ascending importance
sort = rf.feature_importances_.argsort()
labels_sorted = labels[sort][-10:]  # the last ten entries are the ten most important features

# Horizontal bar chart of the ten largest importances with their column labels
plt.barh(labels_sorted, rf.feature_importances_[sort][-10:])
plt.xlabel("Feature Importance")
plt.title("Top 10 Feature Importances (Random Forest)")
plt.show()


In [None]:
# `sort` orders features by ascending importance, so its last ten positions
# identify the ten most important columns.
top10_features = X_data.columns[sort[-10:]]

# Feature frame restricted to those ten columns
X_top10 = X_data.loc[:, top10_features]

X_top10.head()


## Linear Model with All Features

In [None]:
# Shapes of the train/test feature frames, then of the train/test targets
print(*(part.shape for part in (X_train, X_test)))
print(*(part.shape for part in (y_train, y_test)))

In [None]:
# First five rows of the training features
X_train.head(n=5)

### Number of sessions with Revenue = True (the customer ended up purchasing) and Revenue = False (the customer did not purchase), shown as a bar chart.

In [None]:

# Count the sessions in each Revenue class. The column is passed by keyword
# because seaborn >= 0.12 reserves the first positional argument for `data`.
sns.countplot(x='Revenue', data=df)

### Distribution of revenue over months.
The resulting bar plot shows the distribution of revenue over the months. The x-axis represents the months, and the y-axis represents the count of instances for each month. The bars are color-coded to show the revenue column's value, with blue representing False (no revenue) and orange representing True (revenue). The plot provides an easy-to-read visualization of the distribution of revenue over the months, allowing for easy comparison between the different months.

In [None]:

# Order the rows by month and plot from that sorted frame (previously the
# sorted frame was built but the plot still read the unsorted `df`).
# NOTE(review): if 'Month' has already been ordinal-encoded, the codes follow
# the encoder's category order, which is not necessarily calendar order.
revenue_df = df.sort_values('Month')

# Set the style of the plot
sns.set_style("whitegrid")

# Count of sessions per month, split by Revenue value
sns.countplot(x="Month", hue="Revenue", data=revenue_df)

# Add labels to the plot
plt.title("Distribution of Revenue over Months")
plt.xlabel("Month")
plt.ylabel("Count")

# Show the plot
plt.show()


### Distribution of revenue over traffic type.
In this case, we are using the mean revenue for each traffic type because the traffic types are represented as numerical values without any meaningful labels.

If we had meaningful labels for the traffic types, we could use a count plot to show the number of occurrences of each traffic type for each revenue value (i.e., True or False). However, since we don't have labels for the traffic types, we can't use a count plot in this case. Instead, we can use the mean revenue for each traffic type as a proxy for the revenue generated by each traffic type.

For example, if we have 100 instances of traffic type 1 with a revenue value of True and 50 instances of traffic type 1 with a revenue value of False, the mean revenue for traffic type 1 would be (100 * 1 + 50 * 0) / (100 + 50) = 0.67. This mean value represents the average revenue generated by traffic type 1. We can then compare this mean revenue value to the mean revenue values for other traffic types to gain insight into which traffic types generate the most revenue on average.

In [None]:
# One bar per TrafficType; seaborn's barplot aggregates y with the mean by
# default, so each bar height is the mean of Revenue for that traffic type.
sns.barplot(data=df, x="TrafficType", y="Revenue")
plt.gca().set(
    title="Mean Revenue by Traffic Type",
    xlabel="Traffic Type",
    ylabel="Mean Revenue",
)
plt.show()


### Distribution of revenue over special day.
The code below uses the seaborn library to create a stacked histogram of the distribution of revenue over the "SpecialDay" feature.

A stacked histogram groups the observations into bins along the x-axis and draws one bar per bin whose height is the number of observations falling into that bin. In this case, the "SpecialDay" values are shown along the x-axis, and the number of sessions is shown along the y-axis. Each bar is split into one segment per revenue value (0 or 1), and the segments are stacked on top of each other, so the total height of a bar is the number of sessions with that "SpecialDay" value.

By using different colors for the revenue values, we can easily see how the distribution of revenue changes for each "SpecialDay" value. In this plot, orange segments represent revenue=1, while blue segments represent revenue=0.

The plot provides useful information about the relationship between the "SpecialDay" feature and the revenue outcome. For example, we can see that on special days (SpecialDay values of 0.4, 0.6, and 0.8), there are fewer instances of revenue=1 than on non-special days (SpecialDay values of 0.0 and 1.0). This could suggest that customers are less likely to make purchases on special days, or that the website is less effective at converting visitors into customers on these days. However, it's important to keep in mind that this plot only shows correlation, and not causation.

In [None]:
# White grid background for this figure
sns.set_style("whitegrid")

# Stacked histogram: one bar per SpecialDay bin, segmented by Revenue value
sns.histplot(data=df, x="SpecialDay", hue="Revenue", multiple="stack")

# Title and axis labels
plt.gca().set(
    title="Distribution of Revenue over SpecialDay",
    xlabel="SpecialDay",
    ylabel="Count",
)

# Render the figure
plt.show()


In [None]:
# Registry of per-model results, filled in by the model cells below
result_dict = {}


# Helper function to summarize
def summarize_classification(y_test,y_pred):
    """
    Score predicted labels against the true labels.

    Parameters
    ----------
    y_test : true target labels.
    y_pred : predicted labels for the same samples.

    Returns
    -------
    dict
        Accuracy as a fraction, accuracy as a raw count of correct
        predictions, precision, recall, F1 and ROC-AUC. All of them are
        computed from the predicted labels passed in.
    """
    return {
        'Accuracy:': accuracy_score(y_test, y_pred, normalize=True),
        'Accuracy_count:': accuracy_score(y_test, y_pred, normalize=False),
        'Precision:': precision_score(y_test, y_pred),
        'Recall:': recall_score(y_test, y_pred),
        'F1_score:': f1_score(y_test, y_pred),
        'AUC_ROC:': roc_auc_score(y_test, y_pred),
    }

## Building a Base Model

In [None]:
# Helper function to Build Model
def _as_class_labels(raw_pred, threshold=0.5):
    """Turn a model's raw predictions into a flat vector of class labels.

    A single-column 2-D output is flattened, and fractional (score-like)
    values are cut at `threshold`. Predictions that are already whole-number
    labels are returned unchanged apart from the flattening.
    """
    pred = np.asarray(raw_pred)
    if pred.ndim == 2 and pred.shape[1] == 1:
        pred = pred.ravel()
    if pred.dtype.kind == 'f' and np.any(pred != np.round(pred)):
        pred = (pred >= threshold).astype(int)
    return pred


def _plot_roc_curve(model, x_test, y_test):
    """Draw the ROC curve of `model` on the held-out data."""
    # Prefer scores over hard labels when the model exposes them.
    if hasattr(model, 'predict_proba'):
        scores = np.asarray(model.predict_proba(x_test))
    else:
        scores = np.asarray(model.predict(x_test))
    # For a 2-D output keep the last column (the positive class when there are
    # two columns, the only column when there is one).
    scores = scores[:, -1] if scores.ndim == 2 else scores.ravel()

    fpr, tpr, _ = roc_curve(y_test, scores)
    plt.plot(fpr, tpr, label='AUC = %.3f' % roc_auc_score(y_test, scores))
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend()
    plt.show()


def build_model(classifier_fn,
                name_of_y_col,
                name_of_x_cols,
                dataset,test_frac=0.2,
                show_plot_auc=None):

    """
    Train one classifier end to end and summarise how well it fits.

    Parameters
    ----------
    classifier_fn : callable taking (x_train, y_train) and returning a fitted
        model that has a `predict` method.
    name_of_y_col : name of the target column in `dataset`.
    name_of_x_cols : name, or iterable of names, of the feature columns.
    dataset : DataFrame holding both the features and the target.
    test_frac : fraction of the rows held out for testing.
    show_plot_auc : when truthy, also plot the ROC curve on the test split.

    Returns
    -------
    dict with keys 'training' and 'test' (metric summaries from
    `summarize_classification`) and 'confusion_matrix' (crosstab with
    predicted labels as rows and true labels as columns).
    """

    # Separating the input features (X) and target variable (y) using the
    # arguments, not notebook globals.
    if isinstance(name_of_x_cols, str):
        feature_cols = [name_of_x_cols]
    else:
        feature_cols = list(name_of_x_cols)
    X = dataset[feature_cols]
    y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=0)

    # Feature scaling: fit on the training rows only, so no statistics of the
    # test rows leak into training, then apply the same transform to both.
    scale_x = StandardScaler()
    x_train = scale_x.fit_transform(x_train)
    x_test = scale_x.transform(x_test)

    model = classifier_fn(x_train, y_train)

    # Models that output scores (e.g. a neural network) are converted to
    # labels so the label-based metrics below can be computed.
    y_pred = _as_class_labels(model.predict(x_test))
    y_pred_train = _as_class_labels(model.predict(x_train))

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_result = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_result.y_pred, pred_result.y_test)

    if show_plot_auc:
        _plot_roc_curve(model, x_test, y_test)

    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab
            }


## Compare Results

In [None]:
# Helper function to compare the score of different Model.
def compare_result():
    """
    Print the training and test metric summaries of every model stored in
    `result_dict`, one model after another. Returns nothing.
    """
    sections = (('Training data:-', 'training'), ('Test Data:-', 'test'))

    for model_name, summary in result_dict.items():
        print('Classification: ', model_name)

        for heading, split_key in sections:
            print()
            print(heading)
            for metric_name, metric_value in summary[split_key].items():
                print(metric_name, metric_value)

        print()

## Model 1: Neural Networks

In [None]:
def dnn_fn(X_train, y_train, input_dim=None, output_dim=1):
    """
    Build and fit a small fully connected network for a binary target.

    Parameters
    ----------
    X_train : 2-D feature matrix (rows are samples).
    y_train : target values for the rows of `X_train`.
    input_dim : number of input features. When left as None it is taken from
        the second dimension of `X_train`, so the network always matches the
        data it is given.
    output_dim : number of output units.

    Returns
    -------
    The fitted Sequential model. Its `predict` returns sigmoid scores in
    [0, 1], not class labels.
    """
    if input_dim is None:
        input_dim = np.shape(X_train)[1]

    model = Sequential()
    model.add(Dense(32, input_dim=input_dim, activation='relu'))
    model.add(Dense(16, activation='relu'))
    # Sigmoid output with cross-entropy loss: the target is a 0/1 class label,
    # so the output is bounded and can be read as a class score.
    model.add(Dense(output_dim, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam')
    model.fit(X_train, y_train,
              epochs=100, batch_size=32, verbose=0)

    return model


In [None]:
# Pass the target column name, the feature column names and the frame itself.
# (`y` and `X` do not exist at notebook level, so the old call raised NameError.)
result_dict['Revenue ~ DNN'] = build_model(
    dnn_fn,
    name_of_y_col='Revenue',
    name_of_x_cols=list(X_data.columns),
    dataset=df)


## Model 2: Supervised Learning - K-Nearest Neighbours (KNN)

In [None]:
def knn_fn(X_train,y_train,n_neighbors=9,random_state=12) -> KNeighborsClassifier:
    """
    Fit a k-nearest-neighbours classifier on the given training data.

    `n_neighbors` is forwarded to the classifier. `random_state` is accepted
    but not used by this function.
    """
    return KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)


In [None]:
# Pass the target column name, the feature column names and the frame itself.
# (`y` and `X` do not exist at notebook level, so the old call raised NameError.)
result_dict['Revenue ~ KNN'] = \
    build_model(knn_fn,
                name_of_y_col='Revenue',
                name_of_x_cols=list(X_data.columns),
                dataset=df)

In [None]:
# build_model() keeps only metric summaries and a confusion matrix, not the
# raw predictions, so the 'test' summary has no 'y_pred'/'y_true' entries.
# Rebuild equivalent label vectors from the confusion-matrix counts instead:
# the rows of that table are predicted labels, the columns are true labels.
knn_confusion = result_dict['Revenue ~ KNN']['confusion_matrix']
cell_counts = knn_confusion.to_numpy()

# Label pair belonging to every cell of the table, in row-major order
pred_per_cell = np.repeat(knn_confusion.index.to_numpy(), cell_counts.shape[1])
true_per_cell = np.tile(knn_confusion.columns.to_numpy(), cell_counts.shape[0])

# Repeat each pair as often as it was observed
y_pred = np.repeat(pred_per_cell, cell_counts.ravel())
y_true = np.repeat(true_per_cell, cell_counts.ravel())

# classification_report takes the true and predicted labels as two arguments
print(classification_report(y_true, y_pred))


## Model 3: Supervised Learning - Random Forest Classifier

In [None]:
def random_forest_fn(x_train,y_train) -> RandomForestClassifier:
    """
    Fit a random forest classifier on the given training data.

    The forest has 50 trees, a maximum depth of 15 and a fixed seed (12).
    """
    forest_settings = dict(n_estimators=50, max_depth=15, random_state=12)
    return RandomForestClassifier(**forest_settings).fit(x_train, y_train)

In [None]:
# Pass the target column name, the feature column names and the frame itself.
# (`y` and `X` do not exist at notebook level, so the old call raised NameError.)
result_dict['Revenue ~ Random_Forest'] = \
    build_model(random_forest_fn,
                name_of_y_col='Revenue',
                name_of_x_cols=list(X_data.columns),
                dataset=df)

In [None]:
# compare_result() prints the summaries itself and returns None, so wrapping
# it in print() only appended a stray "None" to the output.
compare_result()