In [1]:
# imports
import numpy as np
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math
# Load the sessions; the listed tokens are parsed as missing values.
df = pd.read_csv('online_shoppers_intention.csv', na_values=['NA', 'null', '', 'NULL'])

# Store every categorical feature with the generic 'object' dtype.
categorical_features = (
    'Month', 'OperatingSystems', 'Browser', 'Region',
    'TrafficType', 'VisitorType', 'Weekend', 'Revenue',
)
for feature in categorical_features:
    df[feature] = df[feature].astype('object')

In [2]:
# 1.1 Getting our first look at the Dataset using the describe function on our dataset
df_cpy = df.copy()

# A notebook cell only renders its last expression automatically, so the
# previews are displayed explicitly (`display` is provided by the IPython kernel).
display(df.head())
display(df.describe())

# Feature groups that have similar representation
num_pages_visited_features = ['Administrative', 'Informational', 'ProductRelated']
total_duration_page_features = ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']

# Calculate means for number of visits for every page type
mean_num_pages_visited = df[num_pages_visited_features].mean()

# Show above calculations in a bar plot
fig = go.Figure()
fig.add_trace(go.Bar(x=num_pages_visited_features, y=mean_num_pages_visited, name="mean per page category"))
fig.show()

# Calculate average duration per page for every page type.
# The two feature lists are in the same page-type order, so the element-wise
# division pairs every duration total with the matching page-count total.
total_duration = df[total_duration_page_features].sum()
total_pages_visited = df[num_pages_visited_features].sum().astype(float)
avg_duration_per_page = total_duration.values / total_pages_visited.values

# Show above calculations in a barplot
fig2 = go.Figure()
fig2.add_trace(go.Bar(x=num_pages_visited_features, y=avg_duration_per_page, name="average duration per page"))
fig2.show()

In [3]:
# 1.2 Comparing characteristics of the dataset for the two different types of users

# Split the sessions into Browser == 13 and all other browsers
loc_df_browsers_13 = df[df['Browser'] == 13]
loc_df_browsers_not_13 = df[df['Browser'] != 13]

# Mean number of visited pages per page type, for each group
mean_browsers_13_ints = loc_df_browsers_13[num_pages_visited_features].mean()
mean_browsers_not_13_ints = loc_df_browsers_not_13[num_pages_visited_features].mean()

# Bar plot comparing the two groups on those means
fig = go.Figure(data=[
    go.Bar(x=num_pages_visited_features, y=mean_browsers_13_ints, name='Browser == 13'),
    go.Bar(x=num_pages_visited_features, y=mean_browsers_not_13_ints, name='Browser != 13'),
])
fig.update_layout(
    title='Comparison of Feature Means: Browser == 13 vs Browser != 13',
    xaxis_title='Page types',
    yaxis_title='Mean Value',
)
fig.show()

# Average duration per visited page (total duration / total pages), Browser == 13
total_duration_13 = loc_df_browsers_13[total_duration_page_features].sum()
total_pages_visited_13 = loc_df_browsers_13[num_pages_visited_features].sum().astype(float)
avg_duration_per_page_13 = total_duration_13.values / total_pages_visited_13.values

# Same ratio for Browser != 13
total_duration_not_13 = loc_df_browsers_not_13[total_duration_page_features].sum()
total_pages_visited_not_13 = loc_df_browsers_not_13[num_pages_visited_features].sum().astype(float)
avg_duration_per_page_not_13 = total_duration_not_13.values / total_pages_visited_not_13.values

# Bar plot comparing the two groups on average duration per page
fig2 = go.Figure(data=[
    go.Bar(x=num_pages_visited_features, y=avg_duration_per_page_13, name="Browser == 13"),
    go.Bar(x=num_pages_visited_features, y=avg_duration_per_page_not_13, name="Browser != 13"),
])
fig2.update_layout(
    title='Average duration per page visit: Browser == 13 vs Browser != 13',
    xaxis_title='Page types',
    yaxis_title='Average duration per page visit',
    yaxis={'range': [0, 100]},
)
fig2.show()

# Share of sessions (in %) with Revenue == True, per group
sum_revenue_13 = loc_df_browsers_13.loc[loc_df_browsers_13['Revenue']]
percentage_revenue_13 = len(sum_revenue_13) / len(loc_df_browsers_13) * 100

sum_revenue_not_13 = loc_df_browsers_not_13.loc[loc_df_browsers_not_13['Revenue']]
percentage_revenue_not_13 = len(sum_revenue_not_13) / len(loc_df_browsers_not_13) * 100

# Bar plot comparing the two conversion rates
fig3 = go.Figure(data=[
    go.Bar(x=['Browser == 13'], y=[percentage_revenue_13], name='Browser == 13'),
    go.Bar(x=['Browser != 13'], y=[percentage_revenue_not_13], name='Browser != 13'),
])
fig3.update_layout(
    title='Revenue Conversion Rate: Browser == 13 vs Browser != 13',
    xaxis_title='Browser Group',
    yaxis_title='% of Users with Revenue == True',
    yaxis={'range': [0, 100]},
)
fig3.show()



In [4]:
# Inspect the dtype of every column (the categorical features were cast to
# 'object' right after the data was loaded).
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems            object
Browser                     object
Region                      object
TrafficType                 object
VisitorType                 object
Weekend                     object
Revenue                     object
dtype: object

In [5]:
# Histograms (frequencies)
# Build one frequency histogram per categorical feature first ...
(month_h, os_h, browser_h, region_h,
 traffic_type_h, visitor_type_h, weekend_h, revenue_h) = [
    px.histogram(df, x=column)
    for column in ('Month', 'OperatingSystems', 'Browser', 'Region',
                   'TrafficType', 'VisitorType', 'Weekend', 'Revenue')
]

# ... then render them in the same order.
for histogram in (month_h, os_h, browser_h, region_h,
                  traffic_type_h, visitor_type_h, weekend_h, revenue_h):
    histogram.show()

There is no January and no April in the data.
What does the category "Other" mean?
On weekdays there are 1892.4 visitors per day on average, versus 1434 visitors per day on the weekend: a difference of 458.4 visitors per day on average.


Let's show percentages for the revenue, and let's also show the number of visitors per day.

os_h = px.histogram(df, x='OperatingSystems')
browser_h = px.histogram(df, x='Browser')
region_h = px.histogram(df, x='Region')
traffic_type_h = px.histogram(df, x='TrafficType') 
What do the different categories mean?

In [6]:
# Numerical features, one box plot each
variables = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
]
for var in variables:
    fig = px.box(data_frame=df, y=var, title=f'Boxplot of {var}')
    fig.show()

In [7]:
# Working copy of the data without rows that contain missing values
df_cp = df.copy().dropna()

# 3.2
# Single-column normalization
def lin_norm(val, col):
    """Min-max scale one column and return it as a one-column DataFrame.

    Args:
        val: DataFrame that contains the column.
        col: label of the column to scale.

    Returns:
        A DataFrame with the single column ``col`` holding
        ``(value - min) / (max - min)``.

    Note:
        A later cell defines ``lin_norm`` again with a Series-returning
        version, which replaces this one once that cell has run.
    """
    column_frame = val[[col]]
    lowest = column_frame.min()
    spread = column_frame.max() - lowest
    return (column_frame - lowest) / spread

In [8]:
# List of variables to normalize
variables = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
]

# Work on a copy of the dataframe without rows that contain missing values
df_cp = df.copy().dropna()

# Single-column normalization
def lin_norm(df, col):
    """Min-max scale one column of ``df`` to the range [0, 1].

    Args:
        df: DataFrame that contains the column.
        col: label of the column to scale.

    Returns:
        A Series with ``(value - min) / (max - min)`` for every row. When all
        values are equal (max == min) the result is 0.0 for every row instead
        of the NaN that a 0/0 division would produce.
    """
    column = df[col]
    min_val = column.min()
    value_range = column.max() - min_val
    if value_range == 0:
        # Constant column: there is no spread to scale by. Multiplying by 0.0
        # yields float zeros and keeps missing entries missing.
        return (column - min_val) * 0.0
    return (column - min_val) / value_range


# Normalize all specified variables (each column is scaled independently)
df_cp = df_cp.assign(**{var: lin_norm(df_cp, var) for var in variables})

# Display the first few rows of the normalized dataframe
df_cp.head()


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0.0,0.0,0.0,0.0,0.001418,0.0,1.0,1.0,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0.0,0.0,0.0,0.0,0.002837,0.001,0.0,0.5,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0.0,0.0,0.0,0.0,0.001418,0.0,1.0,1.0,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0.0,0.0,0.0,0.0,0.002837,4.2e-05,0.25,0.7,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0.0,0.0,0.0,0.0,0.014184,0.009809,0.1,0.25,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [9]:



# Verifying the changes


KeyboardInterrupt: 

In [13]:

# Data Cleaning, taking away outliers


import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt

def removal_box_plot(df, column, upper_threshold):
    """Drop the rows whose ``column`` value lies above ``upper_threshold``.

    Args:
        df: DataFrame to filter.
        column: label of the column the threshold applies to.
        upper_threshold: largest value that is still kept (inclusive).

    Returns:
        ``df`` itself when no row is removed, otherwise a filtered DataFrame
        that keeps the original row labels.
    """
    values = df[column]
    # The lower bound is the column minimum, so no value is cut off from below.
    lower_threshold = values.min()
    in_range = (values >= lower_threshold) & (values <= upper_threshold)
    kept_rows = df[in_range]

    # Nothing was filtered out: hand back the input frame unchanged.
    if kept_rows.shape[0] == df.shape[0]:
        return df
    return kept_rows

# Start from a copy of the original DataFrame without rows that contain missing values
df_cp = df.copy().dropna()

# Upper threshold per variable; rows with a larger value are treated as outliers
thresholds = {
    'Administrative': 20,
    'Administrative_Duration': 1500,
    'Informational': 10,
    'Informational_Duration': 1500,
    'ProductRelated': 400,
    'ProductRelated_Duration': 15000,
    'BounceRates': 0.12,
    'ExitRates': 0.15,
    'PageValues': 100,
    'SpecialDay': 0.4
}

# Start with the complete data and filter it one column at a time
cleaned_data = df_cp
for column, upper_threshold in thresholds.items():
    cleaned_data = removal_box_plot(cleaned_data, column, upper_threshold)

# Renumber the rows 0..n-1. This is deliberately not done in place:
# removal_box_plot returns its input unchanged when nothing is removed, so
# cleaned_data can still be the very same object as df_cp.
cleaned_data = cleaned_data.reset_index(drop=True)

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch

# One-hot encode the categorical columns to prepare the dataset for clustering
def one_hot_encode(df):
    """Replace every object-dtype column of ``df`` by one-hot indicator columns.

    Args:
        df: DataFrame whose object-dtype columns are the categorical features.

    Returns:
        A new DataFrame with the non-categorical columns of ``df`` followed by
        the indicator columns. The row labels of ``df`` are preserved.
    """
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    if not categorical_columns:
        # Nothing to encode; still return a new frame like the normal path does.
        return df.copy()

    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[categorical_columns])

    # get_feature_names_out() supplies the column names of the encoded data.
    # The encoded block must carry the row labels of `df`: with a default
    # 0..n-1 index, concat(axis=1) aligns on labels and pads mismatches with
    # NaN whenever the index of `df` has gaps (e.g. after dropna or filtering).
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,
    )

    # Keep the remaining original columns and append the indicator columns.
    df_encoded = pd.concat([df.drop(columns=categorical_columns), one_hot_df], axis=1)
    return df_encoded



# 3.1 Apply Sklearn's Affinity Propagation clustering. Visualize the created clusters
# The index is reset after dropna so the rows are labelled 0..n-1 before encoding.
df_cp_affinity = cleaned_data.dropna().reset_index(drop=True)
df_cp_affinity = one_hot_encode(df_cp_affinity)


# Normalize the dataset
for var in variables:
    df_cp_affinity[var] = lin_norm(df_cp_affinity, var)

# Running the Affinity Propagation Algorithm (fixed seed so that a re-run
# reproduces the same clusters)
affinity_propagation = AffinityPropagation(random_state=0)
df_cp_affinity['cluster'] = affinity_propagation.fit_predict(df_cp_affinity)

# Plotting the results on the first two columns of the clustered frame itself
fig = px.scatter(
    df_cp_affinity,
    x=df_cp_affinity.columns[0],
    y=df_cp_affinity.columns[1],
    color='cluster',
    title='Affinity Propagation Clustering',
)
fig.show()



# 3.2  Apply Sklearn's DBSCAN clustering. Visualize the created clusters
# NOTE(review): this step starts from the raw `df`, while 3.1 and 3.3 start
# from `cleaned_data` (outliers removed) - confirm that this is intended.
# The index is reset after dropna so the rows are labelled 0..n-1 before encoding.
df_cp_DB = df.dropna().reset_index(drop=True)
df_cp_DB = one_hot_encode(df_cp_DB)

# Normalize the dataset
for var in variables:
    df_cp_DB[var] = lin_norm(df_cp_DB, var)

# EPS dist, min_samples = min points in a cluster
dbscan = DBSCAN(eps=0.3, min_samples=10)
df_cp_DB['cluster'] = dbscan.fit_predict(df_cp_DB)

# Plotting the results on the first two columns of the clustered frame itself
fig = px.scatter(
    df_cp_DB,
    x=df_cp_DB.columns[0],
    y=df_cp_DB.columns[1],
    color='cluster',
    title='DBSCAN Clustering',
)
fig.show()



# 3.3   Apply Sklearn's Birch clustering. Visualize the created clusters
df_cp_Birch = one_hot_encode(cleaned_data.copy().dropna())

# Normalize the dataset (each column is scaled independently)
df_cp_Birch = df_cp_Birch.assign(
    **{var: lin_norm(df_cp_Birch, var) for var in variables}
)

# Running the Birch Algorithm
birch = Birch(threshold=0.5, n_clusters=5)
df_cp_Birch['cluster'] = birch.fit_predict(df_cp_Birch)

# Plotting the results on the first two columns
fig = px.scatter(
    df_cp_Birch,
    x=df_cp_Birch.columns[0],
    y=df_cp_Birch.columns[1],
    color='cluster',
    title='Birch Clustering',
)
fig.show()




In [None]:
# Summary statistics of the raw data. A cell only renders its last expression,
# so this result is not shown while another expression follows it.
df.describe()

# Preview of the frame produced by the Birch clustering step
df_cp_Birch.head()

In [None]:
# Summary statistics of the working copy `df_cp`
df_cp.describe()

In [None]:
# 5.1 

def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two indexable vectors.

    The first ``len(x)`` positions of ``x`` and ``y`` are compared.
    """
    squared_total = 0
    for idx in range(len(x)):
        difference = x[idx] - y[idx]
        squared_total += difference ** 2
    return math.sqrt(squared_total)


# 5.2
def manhatten_distance(x, y):
    """Return the Manhattan (L1) distance between two indexable vectors.

    The first ``len(x)`` positions of ``x`` and ``y`` are compared.
    """
    total = 0
    for idx in range(len(x)):
        difference = x[idx] - y[idx]
        total += abs(difference)
    return total


# 5.3
def cosine_similarity(x, y):
    """Return the cosine similarity between two indexable vectors.

    Args:
        x: first vector; its length decides how many positions are compared.
        y: second vector, indexable at the same positions as ``x``.

    Returns:
        ``dot(x, y) / (|x| * |y|)``. When either vector has zero length (all
        zeros or empty) the angle is undefined and 0.0 is returned instead of
        raising ZeroDivisionError.
    """
    dot_product = 0
    x_squared = 0
    y_squared = 0
    for i in range(len(x)):
        dot_product += x[i] * y[i]
        x_squared += x[i] ** 2
        y_squared += y[i] ** 2

    norm_product = math.sqrt(x_squared) * math.sqrt(y_squared)
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

# 5.4
# Build the input from `df` directly and encode THIS frame, so the cell does not
# depend on whatever `df_cp` happens to hold from an earlier cell. The index is
# reset after dropna so the rows are labelled 0..n-1 before encoding.
df_cp_euclidean_DB = df.dropna().reset_index(drop=True)
df_cp_euclidean_DB = one_hot_encode(df_cp_euclidean_DB)

# Normalize the dataset
for var in variables:
    df_cp_euclidean_DB[var] = lin_norm(df_cp_euclidean_DB, var)

# DBSCAN with the hand-written distance function as metric
dbscan = DBSCAN(eps=0.5, min_samples=5, metric=euclidean_distance)
df_cp_euclidean_DB['cluster'] = dbscan.fit_predict(df_cp_euclidean_DB)

fig = px.scatter(
    df_cp_euclidean_DB,
    x=df_cp_euclidean_DB.columns[0],
    y=df_cp_euclidean_DB.columns[5],
    color='cluster',
    title='DBSCAN Clustering with Euclidean Distance',
)
fig.show()


In [17]:
#4
# To score a real clustering, use these two lines instead of the sample below:
#labels = dbscan.labels_
#numerical_data = df.iloc[:, :10]

# Try-out sample, just randomly manually assigned clusters
labels = [1,0,-1,2,0,2,1,-1,-1,1]
numerical_data = df.iloc[:10, :10]

# !QUESTION! is it okay to use the pandas library here?
# One label per row, aligned on the row labels of the sample
labels_series = pd.Series(labels, index=numerical_data.index)

# Group the points per cluster; label -1 is left out
unique_clusters = set(labels).difference({-1})
k_clusters = len(unique_clusters)
cluster_datasets = {
  cluster: numerical_data[labels_series == cluster]
  for cluster in unique_clusters
}

# average distance from one point to every other point of its own cluster
def calculate_a(point_i, cluster, cluster_size):
  """Return the mean distance from row ``point_i`` to the other rows of ``cluster``.

  ``point_i`` is a row position in ``cluster``; ``cluster_size`` is the number
  of points the average is taken over (plus the point itself). A cluster of
  size 1 yields 0.
  """
  if cluster_size == 1:
    return 0
  distance_sum = 0
  for other in range(len(cluster)):
    if other == point_i:
      continue
    distance_sum += math.dist(cluster.iloc[point_i], cluster.iloc[other])
  return distance_sum / (cluster_size - 1)

# for every other cluster, take the average distance from the point to all of
# its members; the result is the smallest of those averages
def calculate_b(point_i, cluster_i, cluster):
  """Return the smallest mean distance from a point to any other cluster.

  ``point_i`` is a row position in ``cluster`` and ``cluster_i`` is the label of
  that cluster. The other clusters are read from the global
  ``cluster_datasets``. Returns infinity when there is no other cluster.
  """
  nearest = float('inf')
  for label, members in cluster_datasets.items():
    if label != cluster_i:
      distance_sum = 0
      for j in range(len(members)):
        distance_sum += math.dist(cluster.iloc[point_i], members.iloc[j])
      nearest = min(nearest, distance_sum / len(members))
  return nearest

# calculate silhouette_scores for each point that is in a cluster
silhouette_scores = []
for cluster, cluster_points in cluster_datasets.items():
  cluster_size = len(cluster_points)
  if cluster_size == 1:
    # a point that is alone in its cluster gets a score of 0
    silhouette_scores.append(0)
    continue
  for point in range(cluster_size):
    a = calculate_a(point, cluster_points, cluster_size)
    b = calculate_b(point, cluster, cluster_points)
    denominator = max(a, b)
    # a == b == 0 (the point coincides with all points it is compared to)
    # would divide by zero; such a point is scored 0
    s = (b - a) / denominator if denominator > 0 else 0
    silhouette_scores.append(s)

# calculate silhouette coefficient by just taking the average of the silhouette score list;
# without any clustered point there is nothing to average, so the result is NaN
if silhouette_scores:
  silhouette_coefficient = sum(silhouette_scores) / len(silhouette_scores)
else:
  silhouette_coefficient = float('nan')
print(silhouette_coefficient)

-0.32190200329369717
