# Model Clustering

### Model: Sklearn Latent Dirichlet Allocation (LDA)

In [None]:
# I will use Latent Dirichlet Allocation (LDA) for clustering.
# The goal is to reduce the 23 game categories (genres) to meaningful clusters.
# LDA will analyze the textual data (features, description, genre) to identify hidden topics,
# which will serve as the main clusters for these game genres ('category').

In [None]:
# Imports

import re
import spacy
# import nltk
import pyLDAvis.lda_model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from matplotlib.colors import ListedColormap

from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Restore pandas' default 'display.max_rows' setting, undoing any earlier override
# (note: this brings back the default row limit; it does not make pandas show all rows)

pd.reset_option('display.max_rows')

### Load and Set up Game Data

In [None]:
# Load the cleaned game metadata from a relative path
lda_df = pd.read_csv('../data/meta_cleaned.csv')

# Preview one randomly chosen row
lda_df.sample()

In [None]:
# Combine columns into a single text field for LDA

text_columns = ['features', 'description', 'category']

# .copy() makes lda_df an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning
lda_df = lda_df[['parent_asin'] + text_columns].copy()

# A missing value in any source column would turn the whole concatenation into
# NaN, which the regex-based cleaning applied later cannot handle; treat missing
# fields as empty text instead
lda_df[text_columns] = lda_df[text_columns].fillna('')

lda_df['lda_text'] = lda_df['features'] + ' ' + lda_df['description'] + ' ' + lda_df['category']

lda_df.sample()

In [None]:
# Inspect one example row: each source field, then the combined LDA string
example_row = lda_df.iloc[2351]

for source_column in ('features', 'description', 'category'):
    print(example_row[source_column], '\n')

print('LDA STRING >>>', example_row['lda_text'])

In [None]:
# Keep only the concatenated column for LDA input

# .copy() detaches the result from the wider frame, so the column updates in
# the data-preparation cells below do not raise SettingWithCopyWarning
lda_df = lda_df[['parent_asin', 'lda_text']].copy()

# Print the combined text of one random row as a spot check
print(lda_df.sample()['lda_text'].iloc[0])

lda_df.sample()

### Data Preparation

In [None]:
# Lowercase Conversion
# (the cleaning regex applied afterwards keeps only a-z, so text must be lowercased first)

lda_df['lda_text'] = lda_df['lda_text'].str.lower()

# Print the text of one random row as a spot check
print(lda_df.sample()['lda_text'].iloc[0])

In [None]:
# Remove punctuation, numbers, special characters, symbols, etc.

def keep_only_text(text):
    """Strip everything except lowercase letters and whitespace from `text`.

    Characters outside a-z and whitespace are deleted outright (not replaced
    by a space), so tokens joined by punctuation or digits merge, e.g.
    'action-packed' becomes 'actionpacked'. Uppercase letters are deleted too,
    so the text should be lowercased beforehand. Every run of whitespace is
    then collapsed to a single space; leading/trailing spaces are not trimmed.
    """
    letters_and_whitespace = re.sub(r'[^a-z\s]', '', text)
    return re.sub(r'\s+', ' ', letters_and_whitespace)  # Spaces

# Clean every row's text with keep_only_text
lda_df['lda_text'] = lda_df['lda_text'].apply(keep_only_text)

# Spot-check the example row at position 2351
print(lda_df.iloc[2351]['lda_text'])

In [None]:
# Apply Lemmatization using SpaCy

# Load english model
# Named-entity recognition is never used below, so it is switched off to avoid
# running it on every document. The parser is deliberately left enabled: per the
# spaCy pipeline-design docs, English pipelines can use dependency parses when
# mapping tags to POS, and the lemmatizer relies on POS, so disabling the parser
# could change the resulting lemmas.
nlp = spacy.load('en_core_web_sm', disable = ['ner'])

def lemmatize_text(text):
    """Return `text` with every spaCy token replaced by its lemma, joined by single spaces."""
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

lda_df['lda_text'] = lda_df['lda_text'].apply(lemmatize_text)

# Spot-check the example row at position 2351
print(lda_df.iloc[2351]['lda_text'])

In [None]:
# Remove Stopwords using NLTK

# nltk.download('stopwords')

# Add custom stopwords
# (corpus-specific additions on top of the NLTK list; 'und', 'die', 'sie', 'der'
# are presumably there to catch German-language text -- TODO confirm)
custom_stopwords = [
    'game', 'new', 'player', 'play', 'include', 'world', 'one', 'take',
    'feature', 'mode', 'use', 'und', 'die', 'sie', 'der', 'experience',
    'original', 'nintendo', 'switch', 'lego', 'character', 'gameplay',
    'get', 'edition', 'set', 'unique', 'nba', 'make', 'super', 'time',
    'ultimate', 'epic', 'system', 'version', 'call', 'good', 'friend',
    'like', 'create', 'way', 'content', 'year', 'fun', 'series', 'first',
    'creed', 'wwe', 'duty', 'resident', 'assassin', 'dragon', 'pack',
    'street', 'fighter', 'classic', 'three', 'gb', 'move',
]

# Load stopword list and merge in the custom additions
stop_words = set(stopwords.words('english')).union(custom_stopwords)

def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` that is in `stop_words`.

    The surviving tokens are returned joined by single spaces, in their
    original order.
    """
    return ' '.join(token for token in text.split() if token not in stop_words)

# Strip stopwords from every row's text
lda_df['lda_text'] = lda_df['lda_text'].apply(remove_stopwords)

# Spot-check the example row at position 2351
print(lda_df.iloc[2351]['lda_text'])

### Create Bag-of-Words

In [None]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

corpus = lda_df['lda_text']

# Fit and transform the text data
bow_matrix = vectorizer.fit_transform(corpus)

print('Vocabulary:', vectorizer.get_feature_names_out())
print('Sparse Matrix Shape:', bow_matrix.shape)
# Densify only the first document's row: calling .toarray() on the whole matrix
# would allocate a full (documents x vocabulary) dense array just to print one row
print('Sparse Matrix Sample:', bow_matrix[:1].toarray()[0])

### Set up LDA model

In [None]:
# Number of topics
n_topics = 4

# Build the LDA model and fit it to the BoW matrix in one step
# (fit() returns the fitted estimator itself)
lda_model = LatentDirichletAllocation(n_components = n_topics, random_state = 42).fit(bow_matrix)

print('LDA model fitted with >>>', n_topics, '<<< topics!')

### Analyze the Topics

In [None]:
# Extract the word distributions for each topic
# (one row per topic; these are raw weights -- the pyLDAvis cell normalizes each row to sum to 1)
topic_word_distributions = lda_model.components_ 

# Get the word list from the vectorizer
# (used to translate column indices of the topic rows back into words)
vocabulary = vectorizer.get_feature_names_out()

# Get top words for each topic
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic of a fitted topic model.

    Parameters
    ----------
    model : object exposing `components_`, an iterable of 1-D weight arrays
        (one array per topic).
    feature_names : sequence mapping a weight's column index to its word.
    n_top_words : int
        Number of words to print per topic; 0 prints no words.
    """
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx + 1}:")
        # Rank all words by descending weight first, then truncate. Slicing with
        # [-n_top_words:] before reversing would return *every* word when
        # n_top_words is 0, because [-0:] selects the whole array.
        top_word_indices = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in top_word_indices]
        # Print top words
        print(' '.join(top_words), '\n')

# Show the 10 highest-weighted words of each fitted topic
print_top_words(lda_model, vocabulary, n_top_words = 10)

### LDA: 4 Topics proven best

In [None]:
# Topic #1: Combat-Focused Gameplay
# Represents games centered on battles, warfare, and multiplayer combat.
# Includes FPS, tactical shooters, and MOBAs, with a focus on defeating enemies and teamwork.

# Topic #2: Engaging Simulated Worlds
# Encompasses games involving sports, racing, and team challenges, 
# as well as simulation games like life management or vehicle-based gameplay.

# Topic #3: Action and Tactical Strategy
# Covers games with action-packed combat, exploration, and abilities, 
# while also including strategic games where planning and tactical execution are essential.

# Topic #4: Open Worlds and Discovery
# Reflects games with a focus on story-driven adventures, open-world exploration, 
# life simulation, survival, or sandbox-style gameplay.

### Visualize LDA with t-SNE

In [None]:
# Generate the document-topic distribution matrix
doc_topic_matrix = lda_model.transform(bow_matrix)

# Reduce dimensionality using t-SNE
tsne = TSNE(n_components = 2, random_state = 42, perplexity = 30)
tsne_results = tsne.fit_transform(doc_topic_matrix)

# Identify the dominant topic for each document
dominant_topics = np.argmax(doc_topic_matrix, axis = 1)

# Get the unique topics present in the data
unique_topics = np.unique(dominant_topics)

# One fixed color per topic index (index 0 -> Topic 1, ...)
topic_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

# Color every point by direct lookup of its topic index. Feeding the raw topic
# indices through a colormap truncated to len(unique_topics) lets matplotlib
# rescale the values onto the shortened palette, so whenever some topic is never
# dominant the point colors stop matching the legend entries built below.
point_colors = [topic_colors[topic] for topic in dominant_topics]

# Scatter plot
plt.figure(figsize = (7, 5))

scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1],
                      c = point_colors, alpha = 0.7)

# Legend: one marker per topic that actually occurs, in the same color as its points
handles = [plt.Line2D([0], [0], marker='o', color = topic_colors[i], markersize = 10, linestyle = '')
           for i in unique_topics]

labels = [f"Topic {i + 1}" for i in unique_topics]

plt.legend(handles, labels, title = 'Topics', loc = 'best')
plt.title('Clustered Video Game Genres')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')

plt.show()

### Visualize LDA with pyLDAvis

In [None]:
# Convert LDA model outputs for pyLDAvis
# (summing the BoW matrix along an axis gives a 2-D result, so flatten it to 1-D)
doc_lengths = np.asarray(bow_matrix.sum(axis = 1)).ravel()      # tokens per document
term_frequency = np.asarray(bow_matrix.sum(axis = 0)).ravel()   # corpus-wide count per term

# Normalize each topic's term weights so the row sums to 1;
# the document-topic distributions are taken from transform() as-is
topic_weights = lda_model.components_
topic_term_dists = topic_weights / topic_weights.sum(axis = 1, keepdims = True)
doc_topic_dists = lda_model.transform(bow_matrix)

# Prepare visualization data
lda_vis_data = pyLDAvis.prepare(
    topic_term_dists = topic_term_dists,
    doc_topic_dists = doc_topic_dists,
    doc_lengths = doc_lengths,
    vocab = vocabulary,
    term_frequency = term_frequency,
    n_jobs = 1  # Disable parallelism
)

# Display the visualization
pyLDAvis.display(lda_vis_data)

### Add Topic Information to Dataset

In [None]:
# Topic Dictionary
# Keys are the model's 0-based topic indices. The labels are hard-coded, so they
# must be reviewed whenever the model is refit with different data or settings.
topics = {0: 'Combat-Focused Gameplay', 
          1: 'Engaging Simulated Worlds', 
          2: 'Action and Tactical Strategy', 
          3: 'Open Worlds and Discovery'}

# Get the topic probabilities for each game
topic_probabilities = lda_model.transform(bow_matrix)

# Assign each game to the most likely topic
assigned_topics = topic_probabilities.argmax(axis = 1)

# Add the topic assignment and drop the working text column.
# A new frame is built instead of mutating in place, so re-running this cell
# does not fail: errors = 'ignore' covers the case where 'lda_text' is already
# gone, and assign() avoids writing a column onto a possible slice.
lda_df = (
    lda_df
    .assign(topic = [topics[num] for num in assigned_topics])
    .drop(columns = 'lda_text', errors = 'ignore')
)

lda_df.head()

### Save LDA Dataset

In [None]:
# Reset index

lda_df = lda_df.reset_index(drop = True)

# Save the dataset with LDA results
# NOTE(review): index = False is not passed, so the fresh 0..n-1 index is written
# as an unnamed first column -- confirm whether downstream readers expect it

lda_df.to_csv('../data/meta_clustered.csv')