In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import os

def cluster_vg_sales_dbscan(file_path, eps_values=np.linspace(0.1, 5, 20), min_samples=5):
    """
    Perform clustering on video game sales data using DBSCAN.

    The CSV is loaded, restricted to rows with Year <= 2016, and the five
    sales columns are standardized. DBSCAN is run once per candidate epsilon,
    the epsilon with the largest Sessa ratio is kept, and the final clustering
    is attached to the returned frame together with a sales-tier label.

    Parameters:
    - file_path (str): Path to the CSV file. It must contain the columns
      'Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales' and
      'Global_Sales'.
    - eps_values (iterable of float): Epsilon values to test for DBSCAN.
    - min_samples (int): Minimum number of samples required for a core point.

    Returns:
    - df (pd.DataFrame): Filtered data with 'Cluster' and 'Cluster_Label' columns.
    - optimal_eps (float): Epsilon with the highest Sessa ratio.
    - silhouette_avg (float or None): Silhouette score over non-noise points,
      or None when fewer than two non-noise clusters exist.
    - cluster_means (pd.DataFrame): Mean sales per cluster (noise is row -1).

    Raises:
    - ValueError: If no epsilon yields a usable Sessa ratio (every candidate
      produced fewer than two non-noise clusters, or eps_values is empty).
    """
    sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

    # Load dataset
    df = pd.read_csv(file_path)

    # Filter data up to 2016. Rows with a missing Year compare False and are
    # dropped too. The explicit copy makes the filtered frame independent, so
    # adding columns below does not trigger SettingWithCopyWarning.
    df = df[df['Year'] <= 2016].copy()

    # Standardize the sales columns so no single region dominates distances.
    sales_scaled = StandardScaler().fit_transform(df[sales_columns])

    # Materialize the candidates once so they can be indexed positionally
    # regardless of the container type the caller passed in.
    eps_values = list(eps_values)

    # Determine optimal epsilon using the Sessa ratio
    sessa_ratios = np.asarray(
        _sessa_ratios_dbscan(sales_scaled, eps_values, min_samples), dtype=float
    )
    if sessa_ratios.size == 0 or np.isnan(sessa_ratios).all():
        # np.nanargmax would raise an opaque ValueError here; keep the
        # exception type but explain what actually went wrong.
        raise ValueError(
            "No epsilon in eps_values produced at least two non-noise clusters; "
            "try a different eps range or a smaller min_samples."
        )
    optimal_eps = eps_values[int(np.nanargmax(sessa_ratios))]

    # Apply DBSCAN with optimal epsilon
    df['Cluster'] = DBSCAN(eps=optimal_eps, min_samples=min_samples).fit_predict(sales_scaled)

    # Validate clusters (excluding noise). The mask is converted to a plain
    # NumPy array so it selects rows of sales_scaled by position, independent
    # of the DataFrame's (filtered, non-contiguous) index.
    is_clustered = (df['Cluster'] != -1).to_numpy()
    clustered_labels = df['Cluster'].to_numpy()[is_clustered]
    silhouette_avg = None
    if len(set(clustered_labels)) > 1:
        silhouette_avg = silhouette_score(sales_scaled[is_clustered], clustered_labels)

    # Analyze cluster characteristics
    cluster_means = df.groupby('Cluster')[sales_columns].mean()

    # Assign human-readable tier labels based on mean global sales
    df['Cluster_Label'] = df['Cluster'].map(_label_clusters_by_sales(cluster_means))

    return df, optimal_eps, silhouette_avg, cluster_means


def _sessa_ratios_dbscan(data, eps_values, min_samples):
    """Return one Sessa ratio per epsilon; NaN where fewer than two clusters form."""
    overall_mean = np.mean(data, axis=0)  # loop-invariant
    ratios = []
    for eps in eps_values:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)
        cluster_ids = sorted(set(labels) - {-1})  # -1 is DBSCAN's noise label
        if len(cluster_ids) < 2:
            ratios.append(np.nan)
            continue

        # Sum of the per-cluster variances (each taken over all features at once).
        within_cluster_var = np.sum([np.var(data[labels == cid]) for cid in cluster_ids])
        # NOTE(review): this term is the squared distance between the overall
        # centroid and the centroid of all non-noise points, not a spread
        # between individual cluster centroids. It is zero whenever DBSCAN
        # reports no noise. Kept as-is; confirm it matches the intended estimator.
        between_cluster_var = np.sum((overall_mean - np.mean(data[labels != -1], axis=0)) ** 2)
        ratios.append(between_cluster_var / within_cluster_var if within_cluster_var != 0 else np.nan)
    return ratios


def _label_clusters_by_sales(cluster_means):
    """Map cluster ids to tier names, ranked by ascending mean Global_Sales."""
    tier_names = ["Low Sales", "Moderate Sales", "High Sales"]
    labels = {-1: "Noise"}
    # Drop the noise row before ranking: if it stays in the ordering it
    # consumes a rank and shifts every real cluster to the wrong tier.
    ranked = cluster_means['Global_Sales'].drop(labels=[-1], errors='ignore').sort_values()
    for rank, cluster_id in enumerate(ranked.index):
        labels[cluster_id] = tier_names[rank] if rank < len(tier_names) else "Blockbuster Hits"
    return labels

In [None]:
# The dataset lives in a "dataset" folder next to the notebook's working directory.
parent_dir = os.path.dirname(os.getcwd())
file_path = os.path.join(parent_dir, "dataset", "vgsales.csv")

# Run the full DBSCAN pipeline and unpack its four results.
df_clustered, optimal_eps, silhouette, cluster_means = cluster_vg_sales_dbscan(file_path)

print(f"Optimal epsilon: {optimal_eps}")
if silhouette is None:
    # The pipeline returns None when it did not compute a silhouette score.
    print("Silhouette Score not available (only one valid cluster).")
else:
    print(f"Silhouette Score: {silhouette:.2f}")

# Mean sales of the first few clusters (row -1 is DBSCAN noise).
print(cluster_means.head())

# Final expression of the cell: preview of the clustered rows.
df_clustered.head()


Optimal epsilon: 0.1
Silhouette Score: 0.29
         NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales
Cluster                                                         
-1       1.187143  0.743151  0.350615     0.246759      2.527717
 0       0.819286  0.639286  0.001429     0.212143      1.672857
 1       0.113318  0.050938  0.034193     0.016006      0.214773
 2       1.421429  0.050000  0.000000     0.035714      1.507143
 3       0.824000  0.518000  0.004000     0.138000      1.482000


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Cluster,Cluster_Label
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,-1,Noise
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,-1,Noise
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,-1,Noise
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,-1,Noise
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,-1,Noise
