In [6]:
# imports
import numpy as np
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sklearn.cluster as cluster
import math
# Load the sessions table; the listed tokens are parsed as missing values.
df = pd.read_csv(
    'online_shoppers_intention.csv',
    na_values=['NA', 'null', '', 'NULL'],
)

In [2]:
# 1.1 Getting our first look at the dataset using the describe function
# Independent copy of the loaded frame (not used further in this cell).
df_cpy = df.copy()

# A bare expression is only rendered when it is the last statement of a cell.
# This one sits mid-cell, so display() is required for the summary to show up.
display(df.describe())

# Feature groups that have similar representation:
# page-visit counts and the matching total-duration columns, in the same page-type order.
num_pages_visited_features = ['Administrative', 'Informational', 'ProductRelated']
total_duration_page_features = ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']

# Calculate means for number of visits for every page type
mean_num_pages_visited = df[num_pages_visited_features].mean()

# Show above calculations in a bar plot.
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
fig = go.Figure()
fig.add_trace(go.Bar(x=num_pages_visited_features, y=mean_num_pages_visited, name="mean per page category"))
fig.update_layout(
    title='Mean Number of Pages Visited per Page Type',
    xaxis_title='Page Type',
    yaxis_title='Mean Pages Visited',
)
fig.show()

# Calculate average duration per page for every page type:
# total time spent on a page type divided by the total number of pages of that type.
total_duration = df[total_duration_page_features].sum()
total_pages_visited = df[num_pages_visited_features].sum().astype(float)
# Divide the raw arrays, not the Series: the two Series carry different index labels
# (duration column names vs. count column names), so a Series / Series division would
# align on labels instead of dividing position by position.
avg_duration_per_page = total_duration.values / total_pages_visited.values

# Show above calculations in a barplot.
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
fig2 = go.Figure()
fig2.add_trace(go.Bar(x=num_pages_visited_features, y=avg_duration_per_page, name="average duration per page"))
fig2.update_layout(
    title='Average Duration per Page by Page Type',
    xaxis_title='Page Type',
    yaxis_title='Average Duration per Page',
)
fig2.show()

In [3]:
# 1.2 Comparing characteristics of the dataset for the two different types of users

# Filter dataset on browser == 13 and != 13.
# Both subsets keep every column: the statements that follow index them with the
# page-count and duration feature lists, so restricting the Browser == 13 subset
# to a single column (e.g. ['Revenue']) would make those lookups raise KeyError.
loc_df_browsers_13 = df.loc[df['Browser'] == 13]
loc_df_browsers_not_13 = df.loc[df['Browser'] != 13]

# Mean number of visited pages per page type, computed separately for each browser group
mean_browsers_13_ints = loc_df_browsers_13[num_pages_visited_features].mean()
mean_browsers_not_13_ints = loc_df_browsers_not_13[num_pages_visited_features].mean()

# Grouped bar chart comparing the two groups side by side for every page type
fig = go.Figure(
    data=[
        go.Bar(
            x=num_pages_visited_features,
            y=mean_browsers_13_ints,
            name='Browser == 13',
            marker_color='blue',
        ),
        go.Bar(
            x=num_pages_visited_features,
            y=mean_browsers_not_13_ints,
            name='Browser != 13',
            marker_color='orange',
        ),
    ]
)
fig.update_layout(
    title='Comparison of Feature Means: Browser == 13 vs Browser != 13',
    xaxis_title='Features',
    yaxis_title='Mean Value',
    barmode='group',
)
fig.show()

# Calculate average duration per page for every page type for browser==13.
# The raw arrays are divided (not the Series) because the duration and count Series
# carry different index labels and would otherwise be aligned by label.
total_duration_13 = loc_df_browsers_13[total_duration_page_features].sum()
total_pages_visited_13 = loc_df_browsers_13[num_pages_visited_features].sum().astype(float)
avg_duration_per_page_13 = total_duration_13.values / total_pages_visited_13.values

# Calculate average duration per page for every page type for browser!=13
total_duration_not_13 = loc_df_browsers_not_13[total_duration_page_features].sum()
total_pages_visited_not_13 = loc_df_browsers_not_13[num_pages_visited_features].sum().astype(float)
avg_duration_per_page_not_13 = total_duration_not_13.values / total_pages_visited_not_13.values

# Show above two calculations in a barplot in which we can compare browser ==13 and !=13.
# Layout mirrors the feature-means comparison figure: title, axis labels, grouped bars.
fig2 = go.Figure()
fig2.add_trace(go.Bar(x=num_pages_visited_features, y=avg_duration_per_page_13, name="average duration per page for browser 13"))
fig2.add_trace(go.Bar(x=num_pages_visited_features, y=avg_duration_per_page_not_13, name="average duration per page for browser NOT 13"))
fig2.update_layout(
    title='Average Duration per Page: Browser == 13 vs Browser != 13',
    xaxis_title='Page Type',
    yaxis_title='Average Duration per Page',
    barmode='group',
)
fig2.show()


# Summary statistics of both browser groups, rendered one after the other
display(
    loc_df_browsers_13.describe(),
    loc_df_browsers_not_13.describe(),
)



Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,1.754098,73.71653,0.229508,4.717486,14.885246,699.388395,0.034373,0.052842,26.268249,0.0,7.196721,13.0,9.0,19.754098
std,2.742601,222.460019,0.80402,29.131613,29.325244,1401.28515,0.074365,0.068493,61.644757,0.0,2.096184,0.0,0.0,1.920553
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.00125,0.0,0.0,1.0,13.0,9.0,5.0
25%,0.0,0.0,0.0,0.0,2.0,67.666667,0.0,0.008696,0.0,0.0,8.0,13.0,9.0,20.0
50%,0.0,0.0,0.0,0.0,7.0,302.0,0.0,0.025,0.0,0.0,8.0,13.0,9.0,20.0
75%,3.0,77.333333,0.0,0.0,18.0,784.883333,0.0,0.05,8.671344,0.0,8.0,13.0,9.0,20.0
max,14.0,1652.0,4.0,225.766667,222.0,9630.209524,0.2,0.2,360.953384,0.0,8.0,13.0,9.0,20.0


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0,12269.0
mean,2.317956,80.853921,0.504931,34.620336,31.815225,1197.20908,0.022131,0.043024,5.787936,0.061733,2.098786,2.304181,3.118266,3.991605
std,3.324267,176.533147,1.271921,141.068395,44.522824,1915.596413,0.048322,0.048476,18.051056,0.199364,0.827386,1.548446,2.371741,3.877528
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,185.0,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,8.0,0.0,0.0,18.0,600.75,0.003165,0.025181,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.5,0.0,0.0,38.0,1470.416667,0.016901,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,12.0,9.0,20.0


In [17]:
# 5.1
# Two sample 2-D points used to try out the distance functions
x = (1, 2)
y = (3, 4)


def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two equal-length vectors.

    Args:
        x: Sequence of numbers (must support ``len()`` and iteration).
        y: Sequence of numbers with the same length as ``x``.

    Returns:
        float: Square root of the sum of squared coordinate differences
        (``0.0`` when both sequences are empty).

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths. Without this
            check the extra coordinates of a longer ``y`` would be silently
            ignored and the result would be a distance in the wrong space.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )
    # Built-in sum() over paired coordinates; no local name shadows the builtin.
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))


# Euclidean distance between the two sample points
print(euclidean_distance(x, y))

# 5.2
def manhatten_distance(x, y):
    """Return the Manhattan (L1) distance between two equal-length vectors.

    Args:
        x: Sequence of numbers (must support ``len()`` and iteration).
        y: Sequence of numbers with the same length as ``x``.

    Returns:
        Sum of the absolute coordinate differences (``0`` when both
        sequences are empty). The result is an ``int`` for integer inputs.

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths. Without this
            check the extra coordinates of a longer ``y`` would be silently
            ignored.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )
    # Built-in sum() over paired coordinates; no local name shadows the builtin.
    return sum(abs(a - b) for a, b in zip(x, y))


# Manhattan distance between the two sample points
print(manhatten_distance(x, y))

# 5.3
def cosine_similarity(x, y):
    """Return the cosine similarity between two equal-length vectors.

    Computed as the dot product of ``x`` and ``y`` divided by the product of
    their Euclidean norms.

    Args:
        x: Sequence of numbers (must support ``len()`` and iteration).
        y: Sequence of numbers with the same length as ``x``.

    Returns:
        The similarity value; ``1.0`` (up to rounding) for vectors pointing in
        the same direction and ``0.0`` for orthogonal vectors.

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths. Without this
            check the extra coordinates of a longer ``y`` would be silently
            ignored.
        ZeroDivisionError: If either vector has zero norm (all coordinates
            zero, or both sequences empty) and the inputs are plain Python
            numbers; the similarity is undefined in that case.
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )
    dot_product = sum(a * b for a, b in zip(x, y))
    x_squared_norm = sum(a ** 2 for a in x)
    y_squared_norm = sum(b ** 2 for b in y)
    return dot_product / (math.sqrt(x_squared_norm) * math.sqrt(y_squared_norm))

# Cosine similarity between the two sample points
print(cosine_similarity(x, y))

# 5.4
# Store each measure for the sample points under its own name
euclidean_output = euclidean_distance(x, y)
manhatten_output = manhatten_distance(x, y)
cosine_output = cosine_similarity(x, y)

# Disabled DBSCAN experiment, kept for reference.
# NOTE(review): presumably the source of the recorded
# "ValueError: could not convert string to float: 'June'" - the frame passed to
# fit() still holds string-valued columns. TODO: restrict it to numeric columns
# before re-enabling.
# cluster1 = cluster.DBSCAN(eps=(euclidean_distance(x,y)), min_samples=2).fit(loc_df_browsers_13)
# print(cluster1.labels_)





2.8284271247461903
4
0.9838699100999074


ValueError: could not convert string to float: 'June'