In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import os

def cluster_vg_sales(file_path, max_k=10):
    """
    Cluster video game sales records with K-Means and label the clusters.

    The CSV is filtered to rows with ``Year <= 2016`` and complete sales
    figures, the five sales columns are standardized, and K-Means is fitted
    for every k from 1 up to ``max_k``. The k with the largest
    between/within ratio (see ``sessa_estimator`` below) is used for the
    final clustering.

    Parameters:
    - file_path (str): Path to the CSV file. It must contain a 'Year' column
      and the columns 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales' and
      'Global_Sales'.
    - max_k (int): Maximum number of clusters to test in Sessa Estimator.
      Values larger than the number of usable rows are capped at that number.

    Returns:
    - df (pd.DataFrame): Filtered DataFrame with two added columns:
      'Cluster' (K-Means label) and 'Cluster_Label' (descriptive name).
    - optimal_k_sessa (int): Optimal number of clusters found.
    - silhouette_avg (float): Silhouette score for the clustering, or NaN
      when the score is undefined (fewer than 2 clusters, or as many
      clusters as rows).
    - cluster_means (pd.DataFrame): Mean sales per cluster.

    Raises:
    - ValueError: If ``max_k`` is smaller than 1 or no rows remain after
      filtering.

    Notes:
    - Clusters are ranked by mean 'Global_Sales'. The three lowest are named
      "Low Sales", "Moderate Sales" and "High Sales"; every remaining
      cluster is named "Blockbuster Hits".
    """
    sales_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    # Load dataset
    df = pd.read_csv(file_path)

    # Keep rows up to 2016 (a missing Year fails the comparison, so those
    # rows are dropped as well) and discard rows lacking any sales figure.
    # The explicit copy makes the later column assignments act on an
    # independent frame rather than on a slice of the loaded data.
    df = df[df['Year'] <= 2016].dropna(subset=sales_cols).copy()
    if df.empty:
        raise ValueError("No rows left to cluster after filtering the dataset")

    # Standardize the sales columns so each contributes on the same scale
    sales_scaled = StandardScaler().fit_transform(df[sales_cols])

    # K-Means cannot fit more clusters than there are samples
    upper_k = min(max_k, len(df))

    # Function to calculate Sessa Empirical Estimator
    def sessa_estimator(data, max_k):
        """Return the between/within ratio for each k in 1..max_k."""
        # NOTE(review): inertia shrinks as k grows and the between term is an
        # unweighted sum over the k centers, so this ratio tends to favour
        # the largest k tested -- confirm that this is the intended criterion.
        overall_mean = np.mean(data, axis=0)  # loop-invariant
        ratios = []
        for k in range(1, max_k + 1):
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(data)
            within_cluster_var = kmeans.inertia_
            between_cluster_var = np.sum((kmeans.cluster_centers_ - overall_mean) ** 2)
            if within_cluster_var > 0:
                ratios.append(between_cluster_var / within_cluster_var)
            elif between_cluster_var > 0:
                # Every point sits on its center: an infinite ratio, spelled
                # out to avoid a divide-by-zero warning.
                ratios.append(np.inf)
            else:
                # 0 / 0 would yield NaN, which argmax would then select.
                ratios.append(0.0)
        return ratios

    # Determine optimal number of clusters using Sessa Estimator
    sessa_ratios = sessa_estimator(sales_scaled, upper_k)
    optimal_k_sessa = int(np.argmax(sessa_ratios)) + 1  # Add 1 because k starts at 1

    # Apply K-Means Clustering with optimal K
    kmeans = KMeans(n_clusters=optimal_k_sessa, random_state=42, n_init=10)
    df['Cluster'] = kmeans.fit_predict(sales_scaled)

    # Validate clusters using Silhouette Score, which is only defined for
    # 2 <= number of labels <= n_samples - 1.
    n_labels = df['Cluster'].nunique()
    if 2 <= n_labels <= len(df) - 1:
        silhouette_avg = silhouette_score(sales_scaled, df['Cluster'])
    else:
        silhouette_avg = float('nan')

    # Analyze cluster characteristics
    cluster_means = df.groupby('Cluster')[sales_cols].mean()

    # Assign meaningful labels based on sales trends: rank clusters by mean
    # global sales, name the three lowest ranks, and treat the rest as hits.
    tier_names = ["Low Sales", "Moderate Sales", "High Sales"]
    ranked_clusters = cluster_means['Global_Sales'].sort_values().index
    cluster_labels = {
        cluster_id: tier_names[rank] if rank < len(tier_names) else "Blockbuster Hits"
        for rank, cluster_id in enumerate(ranked_clusters)
    }

    df['Cluster_Label'] = df['Cluster'].map(cluster_labels)

    return df, optimal_k_sessa, silhouette_avg, cluster_means


In [2]:
# The dataset lives in a "dataset" folder next to the current working directory.
dataset_dir = os.path.join(os.path.dirname(os.getcwd()), "dataset")
file_path = os.path.join(dataset_dir, "vgsales.csv")

# Run the clustering pipeline with the default upper bound on k.
df_clustered, optimal_k, silhouette, cluster_means = cluster_vg_sales(file_path)

# Summarize the run: chosen k, clustering quality, and the first cluster means.
print(
    f"Optimal number of clusters: {optimal_k}",
    f"Silhouette Score: {silhouette}",
    sep="\n",
)
print(cluster_means.head())

# Display the first few rows of the clustered data
df_clustered.head()


Optimal number of clusters: 10
Silhouette Score: 0.6536844044838875
          NA_Sales   EU_Sales  JP_Sales  Other_Sales  Global_Sales
Cluster                                                           
0         0.744338   0.419730  0.054052     0.135185      1.353310
1         0.098466   0.043045  0.037479     0.014173      0.193490
2        10.810769   9.090000  4.303077     2.368462     26.573846
3         4.790000   3.713269  0.315385     1.302115     10.119615
4        41.490000  29.020000  3.770000     8.460000     82.740000


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Cluster,Cluster_Label
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74,4,Blockbuster Hits
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,8,Blockbuster Hits
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82,2,Blockbuster Hits
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0,2,Blockbuster Hits
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,2,Blockbuster Hits
