In [None]:
# Import the libraries used in this notebook: numpy and pandas for data handling, matplotlib and seaborn for visualization, scikit-learn for preprocessing, models and metrics, imbalanced-learn for under-sampling, and Keras for the neural network.
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from keras.layers import Dense, Dropout
from keras.models import Sequential
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw CSV once; `data` stays untouched and all later work happens on the copy `df`.
data = pd.read_csv(filepath_or_buffer="Resources/online_shoppers_intention.csv")
df = data.copy(deep=True)

# Show the dtype of every column.
print(df.dtypes)

In [None]:
# Preview the first five rows of the working copy.
df.head(n=5)


In [None]:
# Columns that are replaced by ordinal codes below.
non_numeric_columns = ('Month', 'VisitorType', 'Weekend', 'Revenue')

# A single encoder instance is re-fitted for each column in turn, so after the
# loop it only remembers the categories of the last column.
encoder = OrdinalEncoder()

for col in non_numeric_columns:
    # Fit on this one column and overwrite it with its codes.
    df[col] = encoder.fit_transform(df[[col]])

In [None]:
# Separate the target column from the features, then hold out 20% of the rows for testing.
X = df.drop(columns='Revenue')
y = df['Revenue']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a random forest on the training split; its feature importances are used
# below to rank the features and select the top 10.
# The forest is seeded so that this ranking (and therefore the selected
# features) is reproducible across kernel restarts.
rf = RandomForestRegressor(n_estimators=150, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Column labels of the feature matrix, in their original order.
labels = X.columns

# Indices that sort the importances in ascending order; the last 10 entries
# are therefore the 10 most important features.
sort = rf.feature_importances_.argsort()
labels_sorted = labels[sort[-10:]]

# Horizontal bar chart of the 10 largest importances against their labels.
plt.barh(labels_sorted, rf.feature_importances_[sort[-10:]])
plt.xlabel("Feature Importance")


In [None]:
# Names of the 10 features with the largest importances (ascending order of importance).
top10_features = X.columns[sort[-10:]]

# Feature matrix restricted to those 10 columns.
X_top10 = X.loc[:, top10_features]


## Linear Model with All Features

In [None]:
# Sanity check: shapes of the feature and target splits.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

### Number of sessions with revenue (customer made a purchase) and without revenue (no purchase) (Bar Chart).

In [None]:

# Bar chart of how many rows fall into each Revenue class.
# The column is passed by keyword: recent seaborn releases interpret the first
# positional argument as `data`, so a positionally passed Series is no longer
# treated as the x variable.
sns.countplot(data=df, x='Revenue')

### Distribution of revenue over months.

In [None]:

# Order the rows by the (encoded) month before tabulating.
revenue_df = df.sort_values(by='Month')

# One line per Revenue class: number of rows for each month code.
pd.crosstab(revenue_df['Month'], revenue_df['Revenue']).plot.line(
    figsize=(9, 8),
    title="Distribution of Revenue(Target Variable) over Months",
)

plt.show()

### Distribution of revenue over traffic type.

In [None]:
# One line per Revenue class: number of rows for each TrafficType value.
pd.crosstab(df['TrafficType'], df['Revenue']).plot.line(
    figsize=(9, 8),
    title="Distribution of Revenue over TrafficType",
)

plt.show()

### Distribution of revenue over special day.

In [None]:
# One line per Revenue class: number of rows for each SpecialDay value.
pd.crosstab(df['SpecialDay'], df['Revenue']).plot.line(
    figsize=(9, 8),
    title="Distribution of Revenue over SpecialDay",
)

plt.show()

In [None]:
# Collected results of every model run, keyed by a descriptive model name.
result_dict = {}


# Helper function to summarize
def summarize_classification(y_test, y_pred):
    """
    Score predicted labels against the true labels.

    Parameters
    ----------
    y_test : array-like
        True target labels.
    y_pred : array-like
        Predicted target labels.

    Returns
    -------
    dict
        Accuracy (as a fraction and as a count of correct predictions),
        precision, recall, F1 and ROC-AUC, each computed from the two
        label vectors.
    """
    return {
        'Accuracy:': accuracy_score(y_test, y_pred, normalize=True),
        'Accuracy_count:': accuracy_score(y_test, y_pred, normalize=False),
        'Precision:': precision_score(y_test, y_pred),
        'Recall:': recall_score(y_test, y_pred),
        'F1_score:': f1_score(y_test, y_pred),
        'AUC_ROC:': roc_auc_score(y_test, y_pred),
    }

## Building a Base Model

In [None]:

# Helper functions to build and evaluate a model
def _as_feature_frame(name_of_x_cols, dataset):
    """Return the feature DataFrame from either a DataFrame or column label(s) of ``dataset``."""
    if isinstance(name_of_x_cols, pd.DataFrame):
        return name_of_x_cols
    if isinstance(name_of_x_cols, str):
        return dataset[[name_of_x_cols]]
    return dataset[list(name_of_x_cols)]


def _as_target(name_of_y_col, dataset):
    """Return the target vector from either the vector itself or a column label of ``dataset``."""
    if isinstance(name_of_y_col, str):
        return dataset[name_of_y_col]
    return name_of_y_col


def _positive_class_scores(model, features):
    """Return one continuous score per row for the positive class (used for ROC curves)."""
    if hasattr(model, "predict_proba"):
        proba = np.asarray(model.predict_proba(features))
        # Two-column output: column 1 is the positive class; otherwise flatten.
        if proba.ndim == 2 and proba.shape[1] > 1:
            return proba[:, 1]
        return proba.ravel()
    # Models without predict_proba are expected to return scores from predict().
    return np.asarray(model.predict(features)).ravel()


def _predicted_labels(model, features):
    """Return hard 0/1 labels, thresholding at 0.5 when ``predict`` yields continuous scores."""
    raw = np.asarray(model.predict(features)).ravel()
    if np.isin(raw, (0, 1)).all():
        return raw.astype(int)
    return (raw >= 0.5).astype(int)


def build_model(classifier_fn,
                name_of_y_col,
                name_of_x_cols,
                dataset,test_frac=0.2,
                show_plot_auc=None):

    """
    Train a classifier end to end and return its train/test summary.

    Steps: split into train/test, under-sample the training split only,
    standardize the features (scaler fitted on the resampled training data),
    fit the model via ``classifier_fn`` and score it with
    ``summarize_classification``.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(X_train_scaled, y_train_resampled)``; must
        return a fitted model exposing ``predict`` (and optionally
        ``predict_proba``).
    name_of_y_col : str or array-like
        Target column label in ``dataset``, or the target vector itself.
    name_of_x_cols : str, sequence of str, or pandas.DataFrame
        Feature column label(s) in ``dataset``, or the feature frame itself.
    dataset : pandas.DataFrame
        Source frame; only consulted when column labels are given.
    test_frac : float, default 0.2
        Fraction of rows held out for testing.
    show_plot_auc : bool, optional
        If truthy, plot ROC curves for the (resampled) training set and the
        test set.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` metric summaries and the
        ``'confusion_matrix'`` crosstab (rows: predicted, columns: actual).
    """
    features = _as_feature_frame(name_of_x_cols, dataset)
    target = _as_target(name_of_y_col, dataset)

    # Split your data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_frac, random_state=0)

    # Apply under-sampling to the training data only
    undersampler = RandomUnderSampler(random_state=0)
    X_train_resampled, y_train_resampled = undersampler.fit_resample(
        X_train, y_train)

    # scale numeric features using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    model = classifier_fn(X_train_scaled, y_train_resampled)

    y_pred = _predicted_labels(model, X_test_scaled)
    y_pred_train = _predicted_labels(model, X_train_scaled)

    # Training metrics compare against the labels the model was fitted on.
    train_summary = summarize_classification(y_train_resampled, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_result = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_result.y_pred, pred_result.y_test)

    if show_plot_auc:
        plt.figure(figsize=(8, 6))

        # Use continuous scores for both the curve and the AUC in the legend,
        # so the reported number matches the plotted curve.
        train_scores = _positive_class_scores(model, X_train_scaled)
        logit_roc_auc1 = roc_auc_score(y_train_resampled, train_scores)
        fpr1, tpr1, thresholds1 = roc_curve(y_train_resampled, train_scores)
        plt.plot(fpr1, tpr1, label='Class_Train (AUC = %0.2f)' % logit_roc_auc1)

        test_scores = _positive_class_scores(model, X_test_scaled)
        logit_roc_auc2 = roc_auc_score(y_test, test_scores)
        fpr2, tpr2, thresholds2 = roc_curve(y_test, test_scores)
        plt.plot(fpr2, tpr2, label='Class_Test (AUC = %0.2f)' % logit_roc_auc2)

        # Diagonal reference line of a random classifier.
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic(ROC-AUC)')
        plt.legend(loc="lower right")
        plt.show()

    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab
            }

## Compare Results

In [None]:
# Helper function to compare the scores of the different models.
def compare_result():
    """
    Print the training and test metric summaries of every model stored in
    ``result_dict``, one model after another.
    """
    sections = (('Training data:-', 'training'), ('Test Data:-', 'test'))

    for model_name in result_dict:
        print('Classification: ', model_name)

        for heading, section in sections:
            print()
            print(heading)
            for metric, value in result_dict[model_name][section].items():
                print(metric, value)

        print()
## Model 1: Neural Networks

In [None]:
def dnn_fn(X_train_scaled, y_train_scaled, input_dim=None, output_dim=1) -> Sequential:
    """
    Build and fit a small feed-forward network for binary classification.

    Architecture: two hidden layers of 32 ReLU units, each followed by 20%
    dropout, and a sigmoid output layer trained with binary cross-entropy.

    Parameters
    ----------
    X_train_scaled : array-like of shape (n_samples, n_features)
        Training features.
    y_train_scaled : array-like
        Training labels (0/1).
    input_dim : int, optional
        Number of input features. When omitted it is taken from the second
        dimension of ``X_train_scaled``, so the network matches whatever
        feature set it is given.
    output_dim : int, default 1
        Number of output units.

    Returns
    -------
    Sequential
        The fitted model. Its ``predict`` returns the sigmoid outputs
        (values between 0 and 1), not hard class labels.
    """
    if input_dim is None:
        input_dim = np.asarray(X_train_scaled).shape[1]

    model = Sequential()
    model.add(Dense(units=32, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dropout(0.2))
    # Sigmoid keeps the output in [0, 1] so it can be read as a class probability.
    model.add(Dense(units=output_dim, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam')
    model.fit(X_train_scaled, y_train_scaled, epochs=100, batch_size=32, verbose=0)

    return model

In [None]:
# Train and evaluate the neural network; store its summary under a descriptive key.
result_dict['Revenue ~ DNN'] = build_model(
    classifier_fn=dnn_fn,
    name_of_y_col=y,
    name_of_x_cols=X,
    dataset=df,
    test_frac=0.2,
    show_plot_auc=True,
)


## Model 2: Supervised Learning - K-Nearest Neighbours (KNN)

In [None]:
def knn_fn(X_train_scaled,y_train_scaled,n_neighbors=9,random_state=12) -> KNeighborsClassifier:
    """
    Fit a K-Nearest-Neighbours classifier on the given training data.

    ``n_neighbors`` is forwarded to the classifier. ``random_state`` is
    accepted but not used by this function.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train_scaled, y_train_scaled)


In [None]:
# Train and evaluate KNN on the top-10 feature subset; store its summary.
result_dict['Revenue ~ KNN'] = build_model(
    classifier_fn=knn_fn,
    name_of_y_col=y,
    name_of_x_cols=X_top10,
    dataset=df,
    show_plot_auc=True,
)

In [None]:
# The 'test' summary only holds aggregate metrics, not the individual
# predictions, so the label vectors are rebuilt from the stored confusion
# matrix (rows: predicted label, columns: actual label). The classification
# report depends only on these counts, so the result is the same as with the
# original per-row predictions.
knn_confusion = result_dict['Revenue ~ KNN']['confusion_matrix']

y_true, y_pred = [], []
for predicted_label, row in knn_confusion.iterrows():
    for true_label, count in row.items():
        y_pred.extend([predicted_label] * int(count))
        y_true.extend([true_label] * int(count))

print(classification_report(y_true, y_pred))


## Model 3: Supervised Learning - Random Forest Classifier

In [None]:
def random_forest_fn(x_train,y_train) -> RandomForestClassifier:
    """
    Fit a random forest classifier (50 trees, maximum depth 15, fixed seed)
    on the given training data and return the fitted model.
    """
    forest_params = {'n_estimators': 50, 'max_depth': 15, 'random_state': 12}

    # fit() returns the estimator itself.
    return RandomForestClassifier(**forest_params).fit(x_train, y_train)

In [None]:
# Train and evaluate the random forest on all features; store its summary.
result_dict['Revenue ~ Random_Forest'] = build_model(
    classifier_fn=random_forest_fn,
    name_of_y_col=y,
    name_of_x_cols=X,
    dataset=df,
    show_plot_auc=True,
)

In [None]:
# compare_result() prints the summaries itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None".
compare_result()