Read CSV into the notebook and print a summary of the dataset, along with the first five rows to better understand the data structure.

In [19]:
import pandas as pd

online_shopper_intentions_dataset = pd.read_csv('online_shoppers_intention.csv')

# Look at first five rows of the data to get an understanding of it
print(online_shopper_intentions_dataset.head())

# Get a basic summary of the data
print(online_shopper_intentions_dataset.info())

   Administrative  Administrative_Duration  Informational  \
0               0                      0.0              0   
1               0                      0.0              0   
2               0                      0.0              0   
3               0                      0.0              0   
4               0                      0.0              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                     0.0               1                 0.000000   
1                     0.0               2                64.000000   
2                     0.0               1                 0.000000   
3                     0.0               2                 2.666667   
4                     0.0              10               627.500000   

   BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
0         0.20       0.20         0.0         0.0   Feb                 1   
1         0.00       0.10         0.0         0.0   Feb   

Perform data preprocessing to get the data in a form in which K-means and Complete-Linkage Agglomerative nesting can be performed.

In [20]:
# Reset preprocessed dataset to a copy of the original dataset everytime this code is run, this prevents prevents applying mean encoding more than once
online_shopper_intentions_dataset_preprocessed = online_shopper_intentions_dataset.copy()

# Convert weekend and revenue to store 1 and 0s for True and False respectively.
online_shopper_intentions_dataset_preprocessed['Weekend'] = online_shopper_intentions_dataset_preprocessed['Weekend'].replace({True: 1, False: 0})

online_shopper_intentions_dataset_preprocessed['Revenue'] = online_shopper_intentions_dataset_preprocessed['Revenue'].replace({True: 1, False: 0})

# Perform mean encoding for Month and VisitorType
# Calculate mean Revenue for each Month and store mapping in a dictionary
month_to_mean_dictionary = online_shopper_intentions_dataset_preprocessed.groupby(['Month'])['Revenue'].mean().to_dict()

# Ensure mapping matches expectations
print(month_to_mean_dictionary)

# Each month is replaced with its mean revenue using the month to mean dictionary mappping
online_shopper_intentions_dataset_preprocessed['Month'] = online_shopper_intentions_dataset_preprocessed['Month'].replace(month_to_mean_dictionary)

# Calculate mean revenue for each visitor type
visitor_type_to_mean_dictionary = online_shopper_intentions_dataset_preprocessed.groupby(['VisitorType'])['Revenue'].mean().to_dict()

# Ensure mapping matches expectations
print(visitor_type_to_mean_dictionary)

# Each visitor type is replaced with its mean revenue using the visitor type to mean dictionary
online_shopper_intentions_dataset_preprocessed['VisitorType'] = online_shopper_intentions_dataset_preprocessed['VisitorType'].replace(visitor_type_to_mean_dictionary)

# Verify changes were successful
print(online_shopper_intentions_dataset_preprocessed.head()) 
print(online_shopper_intentions_dataset_preprocessed.info())  



{'Aug': 0.17551963048498845, 'Dec': 0.1250723798494499, 'Feb': 0.016304347826086956, 'Jul': 0.1527777777777778, 'June': 0.10069444444444445, 'Mar': 0.10068169900367069, 'May': 0.10850178359096314, 'Nov': 0.25350233488992663, 'Oct': 0.20947176684881602, 'Sep': 0.19196428571428573}
{'New_Visitor': 0.24911452184179456, 'Other': 0.18823529411764706, 'Returning_Visitor': 0.1393232868922377}
   Administrative  Administrative_Duration  Informational  \
0               0                      0.0              0   
1               0                      0.0              0   
2               0                      0.0              0   
3               0                      0.0              0   
4               0                      0.0              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                     0.0               1                 0.000000   
1                     0.0               2                64.000000   
2                     0.0         

Create ground truth clusters and prepare data for training models

In [21]:
# Extract the Revenue column to serve as our ground truth clusters, yes (1) or no (0) respectively.
# Basically, did this shopper lead to a profit.
ground_truth_cluster_labels = online_shopper_intentions_dataset_preprocessed['Revenue'].copy()

# Drop the revenue column from our preprocessed dataset so we do not use it during clustering
online_shopper_intentions_dataset_preprocessed_features = online_shopper_intentions_dataset_preprocessed.drop('Revenue', axis=1)

# Verify changes were successful
print(online_shopper_intentions_dataset_preprocessed_features.head()) 
print(online_shopper_intentions_dataset_preprocessed_features.info())  


   Administrative  Administrative_Duration  Informational  \
0               0                      0.0              0   
1               0                      0.0              0   
2               0                      0.0              0   
3               0                      0.0              0   
4               0                      0.0              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                     0.0               1                 0.000000   
1                     0.0               2                64.000000   
2                     0.0               1                 0.000000   
3                     0.0               2                 2.666667   
4                     0.0              10               627.500000   

   BounceRates  ExitRates  PageValues  SpecialDay     Month  OperatingSystems  \
0         0.20       0.20         0.0         0.0  0.016304                 1   
1         0.00       0.10         0.0         0.0 

Train the K means model and predict clusters

In [22]:
from sklearn.cluster import KMeans

# Initialize a k means model with 4 clusters, rest of the parameters are left to sklearn defaults
k_means_model = KMeans(n_clusters=4)

# Fit the model to the dataset features and get the predicted clusters
k_means_cluster_labels = k_means_model.fit_predict(online_shopper_intentions_dataset_preprocessed_features)

print(f"Clusters predicted by KMeans: \n{k_means_cluster_labels}")

# Store unique cluster values
kmeans_unique_clusters = set(k_means_cluster_labels)

# Display the unique clusters
print(kmeans_unique_clusters)

  super()._check_params_vs_input(X, default_n_init=10)


Clusters predicted by KMeans: 
[0 0 0 ... 0 0 0]
{0, 1, 2, 3}


Train the Complete Linkage Agglomerative Clustering model and predict clusters

In [23]:
from sklearn.cluster import AgglomerativeClustering

# Initialize an agglomerative clustering model using 4 clusters, and complete linkage criterion. 
complete_linkage_agglomerative_clustering_model = AgglomerativeClustering(n_clusters=4, linkage='complete')

# Fit the model to the dataset features and get the predicted clusters
complete_linkage_agglomerative_clustering_model_cluster_labels = complete_linkage_agglomerative_clustering_model.fit_predict(online_shopper_intentions_dataset_preprocessed_features)

# Preview structure of the labels
print(f"Clusters predicted by Complete Linkage Agglomerative Clustering: \n{complete_linkage_agglomerative_clustering_model_cluster_labels}")

# Store unique cluster values
complete_linkage_agglomerative_clustering_unique_clusters = set(complete_linkage_agglomerative_clustering_model_cluster_labels)

# Display the unique clusters
print(complete_linkage_agglomerative_clustering_unique_clusters)


Clusters predicted by Complete Linkage Agglomerative Clustering: 
[0 0 0 ... 0 0 0]
{0, 1, 2, 3}


Using the Rand Index metric, measure the performance of both KMeans and Complete Linkage Agglomerative Clustering by comparing their clusterings to the ground truth clustering given by Revenue.

In [24]:
from sklearn.metrics import rand_score

# Get the KMeans Rand Index
k_means_model_r_index = rand_score(ground_truth_cluster_labels, k_means_cluster_labels)

print(f"K Means Rand Index: {k_means_model_r_index}")

# Get the Complete Linkage Agglomerative Clustering Rand index
complete_linkage_agglomerative_clustering_model_r_index = rand_score(ground_truth_cluster_labels, complete_linkage_agglomerative_clustering_model_cluster_labels)

print(f"Complete Linkage Agglomerative Clustering Rand Index: {complete_linkage_agglomerative_clustering_model_r_index}")

K Means Rand Index: 0.614881469829243
Complete Linkage Agglomerative Clustering Rand Index: 0.7377097378266066
