In [None]:
import pandas as pd
import numpy as np 
import os 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest

In [None]:
# Relative location of the raw CSV, built with os.path.join so the separator fits the OS.
path = os.path.join("Resources", "online_shoppers_intention.csv")

In [None]:
# Load the raw data set; OSHIdf is the frame every later cell reads from.
OSHIdf = pd.read_csv(path)

In [None]:
# Preview the first 20 rows.
OSHIdf.head(20)

In [None]:
# List of attributes (column names)
OSHIdf.columns

In [None]:
# Dimensions of the data frame as (rows, columns)
OSHIdf.shape

In [None]:
# Data type of every column.
OSHIdf.dtypes

In [None]:
# Per-column data type and non-null count, plus the memory footprint of the frame
OSHIdf.info()

In [None]:
# Summary statistics of the numeric attributes drawn as an annotated heatmap.
# The first row of describe() ("count") is left out; attributes end up on the rows.
summary_stats = OSHIdf.describe().iloc[1:].T
plt.figure(figsize=(12, 8))
sns.heatmap(
    summary_stats,
    annot=True,
    linecolor="w",
    linewidth=2,
    cmap=sns.color_palette("muted"),
)
plt.title("Data Summary")
plt.show()

In [None]:
# Detecting missing values: number of NaN entries per column
OSHIdf.isna().sum()

In [None]:
# Detecting missing values again via isnull(), which pandas documents as an alias of isna()
OSHIdf.isnull().sum()

In [None]:
 # Work on an independent copy: a plain assignment would only bind a second name to the
 # same DataFrame, so every in-place cleaning step would also change the raw OSHIdf.
 cleanOSHIdf = OSHIdf.copy()

In [None]:
# Sessions that opened at least one Administrative page but recorded an
# Administrative_Duration of 0; such a zero is treated as a missing duration.
visited_admin = OSHIdf["Administrative"] >= 1
no_admin_time = OSHIdf["Administrative_Duration"] == 0
missing_Administartive_duration = OSHIdf[visited_admin & no_admin_time]
len(missing_Administartive_duration)

In [None]:
# Break the zero-duration Administrative sessions down by the number of Administrative pages visited.
# Filling every zero with one column-wide statistic would shrink the variance and could distort
# correlations, so each zero is to be replaced by the mean duration of the sessions that share the
# same Administrative page count (mean taken over the non-zero durations only).
zero_admin_duration = (OSHIdf["Administrative"] >= 1) & (OSHIdf["Administrative_Duration"] == 0)
# One table instead of several bare expressions: a cell only displays its last expression,
# so the earlier per-count results were computed and silently thrown away.
OSHIdf.loc[zero_admin_duration, "Administrative"].value_counts().sort_index()
# Original notes for this data set: 135 zero durations in total, 131 on sessions with
# 1 Administrative page and 4 on sessions with 2 Administrative pages.

In [None]:
# Impute Administrative_Duration: a session with 1 or 2 Administrative page views and a duration
# of 0 receives the mean duration of the sessions with the same page count and a non-zero duration.
# The means are computed from the data instead of being pasted in as literals.
# Note: if cleanOSHIdf is the same object as OSHIdf (plain assignment, no copy) the raw frame changes too.
admin_duration_means = {}
for page_count in (1, 2):
    same_page_count = OSHIdf["Administrative"] == page_count
    group_mean = OSHIdf.loc[
        same_page_count & (OSHIdf["Administrative_Duration"] != 0), "Administrative_Duration"
    ].mean()
    admin_duration_means[page_count] = group_mean
    if pd.isna(group_mean):
        # No non-zero duration to learn from; leave the zeros untouched rather than writing NaN.
        continue
    zero_duration = (cleanOSHIdf["Administrative"] == page_count) & (cleanOSHIdf["Administrative_Duration"] == 0)
    # .loc with a boolean mask writes into cleanOSHIdf itself. The former
    # cleanOSHIdf[mask].replace({0: mean})[col] only built a temporary frame (and replaced
    # zeros in every column of it) without ever storing the result.
    cleanOSHIdf.loc[zero_duration, "Administrative_Duration"] = group_mean

# Show the group means that were used, indexed by Administrative page count.
pd.Series(admin_duration_means, name="Administrative_Duration").rename_axis("Administrative")

In [None]:
# Re-count the zero Administrative durations in the cleaned frame.
still_zero_admin = (cleanOSHIdf["Administrative"] >= 1) & (cleanOSHIdf["Administrative_Duration"] == 0)
clean_missing_Administartive_duration = cleanOSHIdf[still_zero_admin]
# TODO: this count only changes once the replacements are actually assigned back to cleanOSHIdf.
len(clean_missing_Administartive_duration)

In [None]:
# Sessions that opened at least one Informational page but show an Informational_Duration of 0;
# such a zero is treated as a missing duration.
visited_info = OSHIdf["Informational"] >= 1
no_info_time = OSHIdf["Informational_Duration"] == 0
missing_Informational_duration = OSHIdf[visited_info & no_info_time]
len(missing_Informational_duration)

In [None]:
# Break the zero-duration Informational sessions down by the number of Informational pages visited.
zero_info_duration = (OSHIdf["Informational"] >= 1) & (OSHIdf["Informational_Duration"] == 0)
# One table instead of several bare expressions: a cell only displays its last expression,
# so the earlier per-count results were computed and silently thrown away.
OSHIdf.loc[zero_info_duration, "Informational"].value_counts().sort_index()
# Original notes for this data set: 226 zero durations in total, 217 on sessions with
# 1 Informational page and 9 on sessions with 2 Informational pages. Each zero is to be replaced
# by the mean non-zero duration of the sessions with the same Informational page count.

In [None]:
# Impute Informational_Duration: a session with 1 or 2 Informational page views and a duration
# of 0 receives the mean duration of the sessions with the same page count and a non-zero duration.
# The means are computed from the data instead of being pasted in as literals.
# Note: if cleanOSHIdf is the same object as OSHIdf (plain assignment, no copy) the raw frame changes too.
info_duration_means = {}
for page_count in (1, 2):
    same_page_count = OSHIdf["Informational"] == page_count
    group_mean = OSHIdf.loc[
        same_page_count & (OSHIdf["Informational_Duration"] != 0), "Informational_Duration"
    ].mean()
    info_duration_means[page_count] = group_mean
    if pd.isna(group_mean):
        # No non-zero duration to learn from; leave the zeros untouched rather than writing NaN.
        continue
    zero_duration = (cleanOSHIdf["Informational"] == page_count) & (cleanOSHIdf["Informational_Duration"] == 0)
    # .loc with a boolean mask writes into cleanOSHIdf itself; the former
    # cleanOSHIdf[mask].replace({0: mean})[col] never stored its result.
    cleanOSHIdf.loc[zero_duration, "Informational_Duration"] = group_mean

# Show the group means that were used, indexed by Informational page count.
pd.Series(info_duration_means, name="Informational_Duration").rename_axis("Informational")

In [None]:
# Re-count the zero Informational durations in the cleaned frame.
still_zero_info = (cleanOSHIdf["Informational"] >= 1) & (cleanOSHIdf["Informational_Duration"] == 0)
clean_missing_Informational_duration = cleanOSHIdf[still_zero_info]
# TODO: this count only changes once the replacements are actually assigned back to cleanOSHIdf.
len(clean_missing_Informational_duration)

In [None]:
# Sessions that opened at least one ProductRelated page but show a ProductRelated_Duration of 0;
# such a zero is treated as a missing duration.
visited_product = OSHIdf["ProductRelated"] >= 1
no_product_time = OSHIdf["ProductRelated_Duration"] == 0
missing_ProductRelated_Duration = OSHIdf[visited_product & no_product_time]
len(missing_ProductRelated_Duration)

In [None]:
# Break the zero-duration ProductRelated sessions down by the number of ProductRelated pages visited.
zero_product_duration = (OSHIdf["ProductRelated"] >= 1) & (OSHIdf["ProductRelated_Duration"] == 0)
missing_by_page_count = OSHIdf.loc[zero_product_duration, "ProductRelated"].value_counts().sort_index()
# Show the per-count table explicitly; a cell only displays its last expression on its own.
display(missing_by_page_count)
# Total derived from the data instead of a hand-typed sum (553 + 103 + 35 + ...).
total_missing_ProductRelated_duration = int(missing_by_page_count.sum())
total_missing_ProductRelated_duration
# Original notes for this data set: 717 zero durations: 553 with 1 page, 103 with 2, 35 with 3,
# 11 with 4, 7 with 5, 2 with 6, 1 with 7, 2 with 9 and 1 each for 10, 11 and 13 pages.
# Each zero is to be replaced by the mean non-zero duration of the sessions with the same page count.

In [None]:
# Impute ProductRelated_Duration: for every ProductRelated page count that has zero-duration
# sessions, replace the zero by the mean duration of the sessions with the same page count and a
# non-zero duration. One loop replaces the former pair of copy-pasted cells per page count, and
# the means are computed from the data instead of being pasted in as literals.
# Note: if cleanOSHIdf is the same object as OSHIdf (plain assignment, no copy) the raw frame changes too.
product_page_counts = (1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 13)
product_duration_means = {}
for page_count in product_page_counts:
    same_page_count = OSHIdf["ProductRelated"] == page_count
    group_mean = OSHIdf.loc[
        same_page_count & (OSHIdf["ProductRelated_Duration"] != 0), "ProductRelated_Duration"
    ].mean()
    product_duration_means[page_count] = group_mean
    if pd.isna(group_mean):
        # No non-zero duration to learn from; leave the zeros untouched rather than writing NaN.
        continue
    zero_duration = (cleanOSHIdf["ProductRelated"] == page_count) & (cleanOSHIdf["ProductRelated_Duration"] == 0)
    # .loc with a boolean mask writes into cleanOSHIdf itself; the former
    # cleanOSHIdf[mask].replace({0: mean})[col] never stored its result.
    cleanOSHIdf.loc[zero_duration, "ProductRelated_Duration"] = group_mean

# Show the group means that were used, indexed by ProductRelated page count.
pd.Series(product_duration_means, name="ProductRelated_Duration").rename_axis("ProductRelated")

In [None]:
# Re-count the zero ProductRelated durations in the cleaned frame.
still_zero_product = (cleanOSHIdf["ProductRelated"] >= 1) & (cleanOSHIdf["ProductRelated_Duration"] == 0)
clean_missing_ProductRelated_duration = cleanOSHIdf[still_zero_product]
# TODO: this count only changes once the replacements are actually assigned back to cleanOSHIdf.
len(clean_missing_ProductRelated_duration)

In [None]:
# Sessions that visited all three page types yet carry a PageValues of 0
visited_every_page_type = (
    (OSHIdf["Administrative"] >= 1)
    & (OSHIdf["Informational"] >= 1)
    & (OSHIdf["ProductRelated"] >= 1)
)
missing_PageValues = OSHIdf[visited_every_page_type & (OSHIdf["PageValues"] == 0)]
len(missing_PageValues)

In [None]:
# Histogram of the number of Administrative pages visited per session.
# The rendered title/label texts had typos ("Adminstartive", "Vistited"); they are corrected here.
hist_plot_Administartive = OSHIdf['Administrative'].hist(bins=25, grid=False, color="pink")
hist_plot_Administartive.set_title('Administrative Pages Visited By The User in a Session')
hist_plot_Administartive.set_xlabel('Number of Times an Administrative Page Was Visited')
# Trailing semicolon keeps the Text(...) repr of the last call out of the cell output.
hist_plot_Administartive.set_ylabel('Count of Users');

In [None]:
# Histogram of the number of Informational pages visited per session.
# The rendered title had a typo ("Vistited"); it is corrected here.
hist_plot_Informational = OSHIdf['Informational'].hist(bins=20, grid=False, color="green")
hist_plot_Informational.set_title('Informational Pages Visited By The User in a Session')
hist_plot_Informational.set_xlabel('Number of Times an Informational Page Was Visited')
# Trailing semicolon keeps the Text(...) repr of the last call out of the cell output.
hist_plot_Informational.set_ylabel('Count of Users');

In [None]:
# Histogram of the number of ProductRelated pages visited per session.
# The rendered title had a typo ("Vistited"); it is corrected here.
hist_plot_ProductRelated = OSHIdf['ProductRelated'].hist(bins=20, grid=False, color="y")
hist_plot_ProductRelated.set_title('ProductRelated Pages Visited By The User in a Session')
hist_plot_ProductRelated.set_xlabel('Number of Times a ProductRelated Page Was Visited')
# Trailing semicolon keeps the Text(...) repr of the last call out of the cell output.
hist_plot_ProductRelated.set_ylabel('Count of Users');

In [None]:
# Distribution of Administrative_Duration (quantitative variable), 25 bins, no grid.
admin_duration_hist_style = dict(bins=25, grid=False, color="orange")
hist_plot_Administrative_Duration = OSHIdf['Administrative_Duration'].hist(**admin_duration_hist_style)
hist_plot_Administrative_Duration.set_title('Time Spent on Administrative pages')
hist_plot_Administrative_Duration.set_xlabel('Time Spent In Second')
hist_plot_Administrative_Duration.set_ylabel('Count of Users')

In [None]:
# Distribution of Informational_Duration (quantitative variable), 25 bins, no grid.
info_duration_hist_style = dict(bins=25, grid=False, color="lightblue")
hist_plot_Informational_Duration = OSHIdf['Informational_Duration'].hist(**info_duration_hist_style)
hist_plot_Informational_Duration.set_title('Time Spent on Informational pages')
hist_plot_Informational_Duration.set_xlabel('Time Spent In Second')
hist_plot_Informational_Duration.set_ylabel('Count of Users')

In [None]:
# Distribution of ProductRelated_Duration (quantitative variable), 25 bins, no grid.
product_duration_hist_style = dict(bins=25, grid=False, color="r")
hist_plot_ProductRelated_Duration = OSHIdf['ProductRelated_Duration'].hist(**product_duration_hist_style)
hist_plot_ProductRelated_Duration.set_title('Time Spent on ProductRelated Pages')
hist_plot_ProductRelated_Duration.set_xlabel('Time Spent In Second')
hist_plot_ProductRelated_Duration.set_ylabel('Count of Users')

In [None]:
# Distribution of BounceRates (quantitative variable), 25 bins, no grid.
bounce_hist_style = dict(bins=25, grid=False, color="mediumaquamarine")
hist_plot_BounceRates = OSHIdf['BounceRates'].hist(**bounce_hist_style)
hist_plot_BounceRates.set_title('Bounce Rates, Single Request Triggered')
hist_plot_BounceRates.set_xlabel('Bounce Rate In Percentage')
hist_plot_BounceRates.set_ylabel('Count of Users')

In [None]:
# Distribution of ExitRates (quantitative variable), 25 bins, no grid.
exit_hist_style = dict(bins=25, grid=False, color="orangered")
hist_plot_ExitRates = OSHIdf['ExitRates'].hist(**exit_hist_style)
hist_plot_ExitRates.set_title('Exit Rates, Percentage of People who Left the Site from that Page')
hist_plot_ExitRates.set_xlabel('Exit Rates In Percentage')
hist_plot_ExitRates.set_ylabel('Count of Users')

In [None]:
# Distribution of PageValues (quantitative variable), 25 bins, no grid.
hist_plot_PageValues = OSHIdf['PageValues'].hist(bins=25, grid=False, color="m")
hist_plot_PageValues.set_title('Page Values')
# The x axis shows the PageValues column itself; the previous label
# ("Average Number Of Page Visited") described a page count, which this column is not.
# NOTE(review): per the data set description PageValues is the average value of the pages
# visited before a transaction - confirm the wording you want on the axis.
hist_plot_PageValues.set_xlabel('Page Value')
# Trailing semicolon keeps the Text(...) repr of the last call out of the cell output.
hist_plot_PageValues.set_ylabel('Count of Users');

In [None]:
# Checking the levels and frequency tables for categorical variables
# Frequency table for Month: one row per level, a single column labelled "Count"
pd.crosstab(index = OSHIdf["Month"], columns = "Count")

In [None]:
# Percentage of the different visitor types in the data set
plt.rcParams["figure.figsize"] = (20, 10)
# Slice sizes come from the data instead of hand-copied totals (previously [10551, 1694, 85]).
# value_counts() sorts by descending frequency.
# NOTE(review): the labels below assume that order is Returning, New, Other - it matches the
# previously hard-coded totals; confirm if the data changes.
visitor_type_counts = OSHIdf["VisitorType"].value_counts()
size = visitor_type_counts.tolist()
colors = ["g", "y", "r"]
labels = "Returning Visitors", "New Visitors", "Others"
explode = [0, 0, 0.1]
# Fail early with a clear message rather than inside plt.pie if the number of levels changes.
assert len(size) == len(labels), "unexpected number of VisitorType levels"
plt.subplot(1, 2, 1)
plt.pie(size, colors = colors, labels = labels, explode = explode, shadow = True, autopct = '%.2f%%')
plt.title("Visitors Type", fontsize = 22)
plt.axis("off")
plt.legend()
plt.show()

In [None]:
# Frequency table for Weekend: one row per level, a single column labelled "Count"
pd.crosstab(index = OSHIdf["Weekend"], columns = "Count")

In [None]:
# Bar chart of sessions without and with a purchase (Revenue compared against 0 / 1).
plt.figure(figsize=(10, 6))
# Count each outcome once and reuse it for both the printout and the bars.
did_not_buy = int((OSHIdf["Revenue"] == 0).sum())
bought = int((OSHIdf["Revenue"] == 1).sum())
print("Did not buy:", did_not_buy)
print("Bought:", bought)
y = did_not_buy, bought
x = ["Did not buy", "Bought"]
plt.bar(x, y, color="lightgreen")
plt.title("Sessions by Purchase Outcome")
plt.ylabel("Number of Sessions")
# plt.show must be called; the bare attribute `plt.show` only displayed the function object.
plt.show()

In [None]:
# Frequency table for Revenue: one row per level, a single column labelled "Count"
pd.crosstab(index = OSHIdf["Revenue"], columns = "Count")

In [None]:
# Dependent variable as a one-column DataFrame.
# The former slice `:17:18` means start=0, stop=17, step=18, which selects only the FIRST
# column of the frame, not the target; selecting Revenue by name is unambiguous.
OSHIy = OSHIdf[["Revenue"]]
OSHIy.shape

In [None]:
# Feature matrix: every column except the last one.
# OSHIX is built here because this cell runs before the later cell that assigns it; on a
# fresh kernel (Restart & Run All) the bare `cleanOSHIX = OSHIX` raised a NameError.
OSHIX = OSHIdf.iloc[:, :-1]
# Independent copy so that cleaning steps on cleanOSHIX never touch OSHIX / OSHIdf.
cleanOSHIX = OSHIX.copy()

In [None]:
# Turn the non-numeric features into dummy/indicator columns.
# drop_first=True keeps k-1 indicators for a feature with k levels (the first level is dropped).
dummy_columns = ['Month', 'VisitorType', 'Weekend']
cleanOSHIX = pd.get_dummies(
    cleanOSHIX,
    columns=dummy_columns,
    drop_first=True,
)

In [None]:
# Dimensions of the feature matrix after dummy encoding.
cleanOSHIX.shape

In [None]:
# Preview the first 20 rows of the encoded feature matrix.
cleanOSHIX.head(20)

In [None]:
# Separate the matrix of features (independent variables) from the dependent variable vector:
# the features are all columns of OSHIdf except the last one.
feature_columns = OSHIdf.columns[:-1]
OSHIX = OSHIdf[feature_columns]
OSHIX.shape

In [None]:
# How many sessions there are for each number of Administrative pages visited.
cleanOSHIX["Administrative"].value_counts()

In [None]:
# True  = sessions with at least one Administrative page view,
# False = sessions with no Administrative page view at all.
# Original note: the two counts are 6562 and 5768 (6562 sessions visited an Administrative page).
(cleanOSHIX["Administrative"] >= 1).value_counts()

In [None]:
# True  = sessions with at least one Informational page view,
# False = sessions with no Informational page view at all.
# Original note quotes the counts 9699 and 2631.
# NOTE(review): confirm against the output which of the two counts belongs to True.
(cleanOSHIX["Informational"] >= 1).value_counts()

In [None]:
# True  = sessions with at least one ProductRelated page view,
# False = sessions with no ProductRelated page view at all.
# Original note quotes the counts 12292 and 38 (the note named Administrative pages, but the
# column counted here is ProductRelated).
(cleanOSHIX["ProductRelated"] >= 1).value_counts()

In [None]:
# Column dtypes and non-null counts of the raw frame (same call as earlier in the notebook).
OSHIdf.info()

In [None]:
# Names of the columns treated as boolean.
# NOTE(review): this rebinds the built-in name `bool` for the rest of the kernel session, so a
# later bool(...) call would fail; consider a name such as `boolean_columns` once all uses are known.
bool = ['Weekend', 'Revenue']

In [None]:
# Names of the columns treated as categorical.
categorical = ['Month', 'VisitorType']

In [None]:
# `numerical` was never assigned anywhere above, so this cell raised a NameError on a fresh kernel.
# NOTE(review): assumed to mean the numerically typed columns (everything that is neither object
# nor bool) - confirm this matches the intended list.
numerical = OSHIdf.select_dtypes(include="number").columns.tolist()
type(numerical)

In [None]:
# Number of distinct values in Month (count of the entries in its value_counts table).
OSHIdf['Month'].value_counts().count()

In [None]:
# Number of distinct values in PageValues (count of the entries in its value_counts table).
OSHIdf['PageValues'].value_counts().count()

In [None]:
# True = sessions with a non-zero Administrative_Duration, False = sessions where it is 0.
(OSHIdf['Administrative_Duration']!= 0).value_counts()

In [None]:
# True = sessions with a non-zero Administrative page count, False = sessions where it is 0.
(OSHIdf['Administrative']!= 0).value_counts()

In [None]:
# Sessions with a non-zero Administrative page count minus sessions with a non-zero
# Administrative_Duration, taken from the frame instead of hand-copied totals (was 6562 - 6427).
int((OSHIdf['Administrative'] != 0).sum() - (OSHIdf['Administrative_Duration'] != 0).sum())

In [None]:
# Dimensions of the raw frame as (rows, columns).
OSHIdf.shape

In [None]:
# True = sessions with a non-zero ProductRelated_Duration, False = sessions where it is 0.
(OSHIdf['ProductRelated_Duration']!= 0).value_counts()

In [None]:
# True = sessions with a non-zero PageValues, False = sessions where it is 0.
# The stray `.groupby` attribute access is removed: it made the cell display a bound method
# object instead of the counts.
(OSHIdf['PageValues'] != 0).value_counts()

In [None]:
# Sessions with a non-zero ProductRelated page count minus sessions with a non-zero
# ProductRelated_Duration, taken from the frame instead of hand-copied totals (was 12292 - 11575).
int((OSHIdf['ProductRelated'] != 0).sum() - (OSHIdf['ProductRelated_Duration'] != 0).sum())

In [None]:
# Number of sessions with exactly one Administrative page view and an Administrative_Duration of 0.
OSHIdf.loc[(OSHIdf["Administrative"] == 1) & (OSHIdf["Administrative_Duration"] == 0),"Administrative_Duration"].count()

In [None]:
# Mean Administrative_Duration over the sessions with exactly one Administrative page view
# and a non-zero duration.
OSHIdf.loc[(OSHIdf["Administrative"] == 1) & (OSHIdf["Administrative_Duration"] != 0),"Administrative_Duration"].mean()

In [None]:
# Preview only (nothing is written back to OSHIdf): the zero Administrative_Duration values of
# the sessions with exactly one Administrative page view, replaced by the mean non-zero duration
# of that group. The mean is computed here instead of being pasted in as a literal, and the
# replacement is applied to the duration column alone rather than to every column of the slice.
admin_one_page = OSHIdf["Administrative"] == 1
admin_one_page_mean = OSHIdf.loc[
    admin_one_page & (OSHIdf["Administrative_Duration"] != 0), "Administrative_Duration"
].mean()
OSHIdf.loc[
    admin_one_page & (OSHIdf["Administrative_Duration"] == 0), "Administrative_Duration"
].replace(0, admin_one_page_mean)