In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import os

def _sessa_ratios_kmeans(data, max_k):
    """
    Score K-Means solutions for k = 2..max_k with a between/within dispersion ratio.

    Parameters:
    - data (np.ndarray): 2-D array of scaled features, one row per sample.
    - max_k (int): Largest number of clusters to evaluate (must be >= 2).

    Returns:
    - ratios (list of float): One ratio per k, in order k = 2, 3, ..., max_k.
      An entry is NaN when the within-cluster dispersion (inertia) is zero.
    """
    overall_mean = data.mean(axis=0)
    ratios = []
    # k = 1 is skipped: its single centroid coincides with the overall mean,
    # so the ratio is ~0 and it can never be selected over k >= 2.
    for k in range(2, max_k + 1):
        model = KMeans(n_clusters=k, random_state=42, n_init=10).fit(data)
        within = model.inertia_
        between = np.sum((model.cluster_centers_ - overall_mean) ** 2)
        # NOTE(review): `between` is an unweighted sum over k centroids while
        # inertia shrinks as k grows, so this ratio tends to favour max_k.
        ratios.append(between / within if within > 0 else np.nan)
    return ratios


def _sessa_ratios_dbscan(data, eps_values, min_samples):
    """
    Score DBSCAN solutions for each eps with a between/within dispersion ratio.

    The ratio uses the same definitions as the K-Means helper so that both
    scores are comparable: "within" is the sum of squared distances of every
    clustered point to its own cluster centroid, and "between" is the sum of
    squared distances of the cluster centroids to the overall mean. Noise
    points (label -1) do not belong to any cluster and are excluded.

    Parameters:
    - data (np.ndarray): 2-D array of scaled features, one row per sample.
    - eps_values (iterable of float): Epsilon values to evaluate.
    - min_samples (int): DBSCAN `min_samples` parameter.

    Returns:
    - ratios (list of float): One ratio per eps value. An entry is NaN when
      DBSCAN finds fewer than two clusters or the within dispersion is zero.
    """
    overall_mean = data.mean(axis=0)
    ratios = []
    for eps in eps_values:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)
        cluster_ids = [label for label in np.unique(labels) if label != -1]
        if len(cluster_ids) < 2:
            # A ratio between clusters needs at least two real clusters.
            ratios.append(np.nan)
            continue

        within = 0.0
        between = 0.0
        for label in cluster_ids:
            members = data[labels == label]
            centroid = members.mean(axis=0)
            within += np.sum((members - centroid) ** 2)
            between += np.sum((centroid - overall_mean) ** 2)
        ratios.append(between / within if within > 0 else np.nan)
    return ratios


def cluster_vg_sales(file_path, max_k=10, eps_values=np.linspace(0.1, 5, 20), min_samples=5):
    """
    Perform clustering on video game sales data using K-Means and DBSCAN.

    The CSV is filtered to rows with Year <= 2016, the five sales columns are
    standardized, and the K-Means `k` and DBSCAN `eps` with the highest
    between/within dispersion ratio are selected and fitted.

    Parameters:
    - file_path (str): Path to the CSV file.
    - max_k (int): Maximum number of clusters to evaluate for K-Means (>= 2).
    - eps_values (array-like): Range of epsilon values to test for DBSCAN.
    - min_samples (int): Minimum number of samples required for a core point in DBSCAN.

    Returns:
    - df (pd.DataFrame): DataFrame with assigned clusters from both algorithms
      (columns 'KMeans_Cluster' and 'DBSCAN_Cluster'; -1 marks DBSCAN noise).
    - optimal_k (int): Optimal number of clusters for K-Means based on Sessa Estimator.
    - optimal_eps (float): Optimal epsilon for DBSCAN based on Sessa Estimator.
    - silhouette_kmeans (float): Silhouette score for K-Means.
    - silhouette_dbscan (float or None): Silhouette score for DBSCAN over the
      non-noise points, or None when fewer than two clusters remain.

    Raises:
    - ValueError: If max_k < 2, if eps_values is empty, or if no candidate
      k / eps produces a usable ratio.
    """
    if max_k < 2:
        raise ValueError("max_k must be at least 2: the silhouette score is undefined for a single cluster.")

    eps_values = np.atleast_1d(np.asarray(eps_values, dtype=float))
    if eps_values.size == 0:
        raise ValueError("eps_values must contain at least one value.")

    # Load dataset
    df = pd.read_csv(file_path)

    # Filter data up to 2016. The explicit copy makes the later column
    # assignments write to an independent frame instead of a slice of the
    # original (avoids pandas' SettingWithCopyWarning).
    df = df[df['Year'] <= 2016].copy()

    # Selecting relevant columns for clustering
    sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
    sales_scaled = StandardScaler().fit_transform(df[sales_columns])

    # Determine optimal clustering parameters
    kmeans_ratios = _sessa_ratios_kmeans(sales_scaled, max_k)
    if np.all(np.isnan(kmeans_ratios)):
        raise ValueError("K-Means produced zero within-cluster dispersion for every k; cannot select optimal_k.")
    optimal_k = int(np.nanargmax(kmeans_ratios)) + 2  # Add 2 because k starts at 2

    dbscan_ratios = _sessa_ratios_dbscan(sales_scaled, eps_values, min_samples)
    if np.all(np.isnan(dbscan_ratios)):
        raise ValueError(
            "DBSCAN found fewer than two clusters for every value in eps_values; "
            "try a different eps range or a smaller min_samples."
        )
    optimal_eps = float(eps_values[np.nanargmax(dbscan_ratios)])

    # Apply K-Means clustering
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    df['KMeans_Cluster'] = kmeans.fit_predict(sales_scaled)

    # Apply DBSCAN clustering
    dbscan = DBSCAN(eps=optimal_eps, min_samples=min_samples)
    df['DBSCAN_Cluster'] = dbscan.fit_predict(sales_scaled)

    # Validate clusters using Silhouette Score
    silhouette_kmeans = silhouette_score(sales_scaled, df['KMeans_Cluster'])

    # Noise points are not a cluster, so they are left out of the DBSCAN score.
    clustered = (df['DBSCAN_Cluster'] != -1).to_numpy()
    dbscan_labels = df.loc[clustered, 'DBSCAN_Cluster']
    silhouette_dbscan = None
    if dbscan_labels.nunique() > 1:
        silhouette_dbscan = silhouette_score(sales_scaled[clustered], dbscan_labels)

    return df, optimal_k, optimal_eps, silhouette_kmeans, silhouette_dbscan

In [None]:
# The dataset is expected in a "dataset" folder next to the current working directory
# (i.e. under the parent of the directory the notebook runs from).
file_path = os.path.join(os.path.dirname(os.getcwd()), "dataset", "vgsales.csv")

# Run the clustering pipeline and unpack its five results.
(
    df_clustered,
    optimal_k,
    optimal_eps,
    silhouette_kmeans,
    silhouette_dbscan,
) = cluster_vg_sales(file_path)

# Report the selected parameters and the K-Means quality score.
print(
    f"Optimal K (K-Means): {optimal_k}",
    f"Optimal Epsilon (DBSCAN): {optimal_eps}",
    f"Silhouette Score (K-Means): {silhouette_kmeans:.2f}",
    sep="\n",
)

# The DBSCAN score is None when the function could not compute it.
if silhouette_dbscan is None:
    print("Silhouette Score for DBSCAN not available (only one valid cluster).")
else:
    print(f"Silhouette Score (DBSCAN): {silhouette_dbscan:.2f}")

# Display the first few rows of the clustered data
df_clustered.head()


Optimal K (K-Means): 10
Optimal Epsilon (DBSCAN): 0.1
Silhouette Score (K-Means): 0.65
Silhouette Score (DBSCAN): 0.29


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,KMeans_Cluster,DBSCAN_Cluster
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,4,-1
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,8,-1
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,2,-1
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,2,-1
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,2,-1
