In [1]:
import pandas as pd
import numpy as np
import folium
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import OrdinalEncoder

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Target Upcurve/Project/DC_Planning.csv')

In [3]:
df['latitude'] = df['geographic_loc'].str.extract(r'\((.*?),')[0].astype(float)
df['longitude'] = df['geographic_loc'].str.extract(r',(.*)\)', expand=False).astype(float)
df.head(5)

Unnamed: 0,guest_id,geographic_loc,order_frequency,avg_order_value,Electronics,Clothing,Home Appliances,Books,Beauty,returns_rate,guest_segmentation,shipping_preferences,promotional_response,state,country,latitude,longitude
0,G50029,"(41.9345228822601,-114.793454289252)",12.528133,9.092696,1,0,0,0,1,0.091774,Lapsed,Express,Medium,Nevada,United States,41.934523,-114.793454
1,G39532,"(28.4114060271366,-98.0530295882411)",17.571661,18.851202,1,1,0,0,0,0.300096,Repeat,Express,Low,Texas,United States,28.411406,-98.05303
2,G93168,"(37.8619825697659,-79.3247968858362)",9.699606,47.468598,1,1,0,0,1,0.58067,Lapsed,Express,Low,Virginia,United States,37.861983,-79.324797
3,G34201,"(33.1011843427186,-115.570115912097)",3.407187,10.747374,0,0,1,0,1,0.526972,New,Same-day,Medium,California,United States,33.101184,-115.570116
4,G69749,"(49.0223887139561,-123.155574774425)",14.276042,3.44174,1,1,1,0,0,0.562618,Repeat,Express,Low,British Columbia,Canada,49.022389,-123.155575


In [9]:
X = df[['latitude', 'longitude']].values

In [10]:
silhouette_scores = []

# Explore different DBSCAN parameter values
eps_values = [0.1, 0.5, 1.0]
min_samples_values = [5, 10, 20]

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X)

        valid_labels = labels[labels != -1]
        if len(valid_labels) > 1:
        # Calculate silhouette score
          silhouette_avg = silhouette_score(X, labels)
          silhouette_scores.append((eps, min_samples, silhouette_avg))

# Find the best parameter combination with the highest silhouette score
best_params = max(silhouette_scores, key=lambda x: x[2])
print("Best Parameters (eps, min_samples, silhouette_score):", best_params)

Best Parameters (eps, min_samples, silhouette_score): (0.5, 20, 0.22447041771090426)


In [11]:
best_eps, best_min_samples, best_silhouette_score = best_params

print("Best Parameters:")
print(f"- eps: {best_eps}")
print(f"- min_samples: {best_min_samples}")
print(f"- silhouette_score: {best_silhouette_score}")

Best Parameters:
- eps: 0.5
- min_samples: 20
- silhouette_score: 0.22447041771090426


In [12]:
eps = 0.5
min_samples = 10
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
df['cluster'] = dbscan.fit_predict(X)

In [23]:
# Create a map to visualize the clusters
m = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()], zoom_start=5)

# Add markers for DBSCAN clusters
for cluster in df['cluster'].unique():
    cluster_data = df[df['cluster'] == cluster]
    cluster_center = [cluster_data['latitude'].mean(), cluster_data['longitude'].mean()]

    marker = folium.Marker(
        location=cluster_center,
        popup=f'DBSCAN Cluster {cluster}',
        icon=folium.Icon(color='blue')
    )
    marker.add_to(m)

# Save the map as an HTML file
m.save('dbscan_clusters_map.html')

In [24]:
m

In [14]:
#  Find cluster labels
unique_labels = np.unique(labels)

# Calculate cluster centroids
cluster_centroids = []
for label in unique_labels:
    if label == -1:
        continue  # Skip noise points
    cluster_points = X[labels == label]
    centroid = np.mean(cluster_points, axis=0)
    cluster_centroids.append(centroid)

# cluster_centroids now contains the representative points for each cluster
print(cluster_centroids)

[array([ 39.29060095, -97.96659629])]


In [19]:
cols = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
        '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
        '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
        '#000075', '#808080']*10
def create_map(df, cluster_column):
  m = folium.Map(location=[df.latitude.mean(), df.longitude.mean()], zoom_start=9, tiles='OpenStreet Map')
  for _, row in df.iterrows():
    if row[cluster_column] == -1:
      cluster_colour = '#000000'
    else:
      cluster_colour = cols[row[cluster_column]]
    folium.CircleMarker(
              location= [row['latitude'], row['longitude']],
              radius=5,
              popup= row[cluster_column],
              color=cluster_colour,
              fill=True,
              fill_color=cluster_colour
          ).add_to(m)

  return m

In [21]:
m = create_map(df,'cluster')


In [22]:
m