## Aspect-Based Sentiment Analysis

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import torch.nn.functional as F
import gc

In [None]:
# This cell runs the custom Steam review scraper and saves the reviews to a CSV file
#from WebScraping import scrape_steam_game

#scraper = scrape_steam_game(game_id = 2358720)
#reviews_df = scraper.scrape_review_info(is_json = False)
#reviews_df.to_csv("Black_Myth_data.csv")

In [None]:
# Code chunk used to import file in Google Colab

# from google.colab import files
# uploaded = files.upload()

In [None]:
# Load the scraped reviews from disk and preview the raw review text
csv_path = "Black_Myth_data.csv"
reviews_df = pd.read_csv(csv_path)
reviews_df["review"].head()

In [None]:
# Create the TF-IDF document-term matrix (one row per review, one column per term).
# pandas reads an empty CSV cell as NaN, which TfidfVectorizer rejects as an
# invalid document, so missing reviews are replaced with empty strings. Rows are
# not dropped, which keeps the matrix aligned row-for-row with reviews_df.
vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.95, min_df = 2)
review_texts = reviews_df['review'].fillna('').astype(str)
tfidf_matrix = vectorizer.fit_transform(review_texts)
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Create seed words for words relevant to aspects
# Seed vocabulary for each aspect: keys are the aspect names, values are the
# words used to guide the matching NMF component towards that aspect.
# NOTE(review): some seeds are shared between aspects (e.g. 'design', 'flow',
# 'quests', 'crashes', 'stability'); confirm that overlap is intended.
aspect_seed_words = {
    'gameplay': [
        'gameplay', 'controls', 'mechanics', 'playing', 'combat', 'puzzle', 'challenge',
        'actions', 'movement', 'abilities', 'skills', 'weapons', 'armor', 'crafting',
        'progression', 'exploration', 'loot', 'flow', 'pace', 'loop',
        'strategy', 'tactics', 'difficulty', 'grind', 'replayability', 'fun', 'satisfying',
        'grinding', 'perks', 'upgrades', 'economy', 'resources', 'interaction',
        'decision', 'choice', 'sandbox', 'survival'],
    'graphics': [
        'graphics', 'visuals', 'art', 'stunning', 'beautiful', 'fidelity', 'textures',
        'models', 'lighting', 'animations', 'style', 'aesthetic', 'design', 'look',
        'appearance', 'vibrant', 'detailed', 'environments', 'shader', 'particles'],
    'story': [
        'story', 'narrative', 'plot', 'characters', 'lore', 'dialogue', 'writing', 'twist',
        'ending', 'protagonist', 'antagonist', 'cutscenes', 'meaningful',
        'emotional', 'depth', 'background', 'quests', 'backstory', 'theme'],
    'audio': [
        'sound', 'music', 'audio', 'voice', 'immersive', 'soundtrack', 'effects',
        'score', 'ambience', 'voiceovers', 'ear', 'mix'],
    'performance': [
        'performance', 'fps', 'lag', 'runs', 'smooth', 'crashes', 'stutter', 'frames',
        'optimization', 'stable', 'glitchy', 'loading', 'buggy', 'issues',
        'technical', 'stability', 'hardware', 'specifications', 'overheating', 'slowdown'],
    'multiplayer': [
        'multiplayer', 'online', 'pvp', 'community', 'versus', 'competitive', 'team', 'social', 'friends',
        'clan', 'guild', 'latency', 'ping', 'leaderboards'],
    'level_design': [
        'level', 'design', 'maps', 'environments', 'layout', 'boss', 'unlock', 'achievement',
        'explore', 'area', 'zone', 'dungeon', 'world', 'pacing', 'structure', 'flow',
        'secrets', 'paths', 'traps', 'obstacles', 'variety', 'verticality', 'landmarks',
        'sprawling', 'linear', 'hub', 'missions', 'objectives', 'quests',
        'progression', 'scale', 'architecture', 'space', 'respawn', 'balance', 'fairness'],
    'bugs': [
        'bugs', 'flaw', 'buggy', 'glitches', 'error', 'crashes', 'broken', 'unplayable',
        'issues', 'problems', 'fixes', 'patch', 'update', 'defects', 'stability', 'corrupt']
}
# Number of NMF components: one per aspect
n_aspects = len(aspect_seed_words)

In [None]:
# Create the Guiding Matrix (initial H: one row per aspect, one column per term)
vocab = {word: i for i, word in enumerate(feature_names)}

# Strength here encourages the NMF to associate seed words strongly with their respective topics
seed_strength = 10.0
default_strength = 0.1

# Start every entry at default_strength instead of 0. The multiplicative-update
# solver only rescales existing entries, so an exact zero in the initial H can
# never become non-zero and non-seed terms could never gain weight in any aspect.
H_init = np.full((n_aspects, len(feature_names)), default_strength)

for i, (aspect, seeds) in enumerate(aspect_seed_words.items()):
    for seed_word in seeds:
        if seed_word in vocab:
            H_init[i, vocab[seed_word]] = seed_strength
        else:
            # Seed words missing from the vocabulary cannot guide the aspect
            print(f"Warning: Seed word '{seed_word}' for aspect '{aspect}' not in TF-IDF vocabulary.")

In [None]:
# Initialize NMF with the Guiding Matrix
# NMF with one component per aspect. init='custom' means the starting W and H
# are supplied by the caller at fit time; no regularisation is applied to
# either factor (alpha_W and alpha_H are 0).
nmf_model = NMF(
    n_components = n_aspects,
    init = 'custom',
    solver = 'mu',
    beta_loss = 'frobenius',
    alpha_W = 0.0,
    alpha_H = 0.0,
    max_iter = 10000,
    random_state = 42,
)

In [None]:
# Create a small random W_init (one row per review, one column per aspect).
# W_init is generated outside the model, so the model's random_state does not
# make it reproducible; a seeded generator keeps the factorization (and the
# resulting top terms) the same from run to run.
rng = np.random.default_rng(42)
W_init = rng.random((tfidf_matrix.shape[0], n_aspects)) * 0.1

# Fit from the custom starting point: W holds the per-review aspect weights,
# H the per-aspect term weights.
W = nmf_model.fit_transform(tfidf_matrix, W = W_init, H = H_init)
H = nmf_model.components_
# Row i of H is labelled with the i-th aspect of aspect_seed_words
learned_aspect_names = list(aspect_seed_words.keys())

In [None]:
# Extract top aspects for the set categories
def get_top_terms_for_aspects(seed_words_for_category,
                              H_matrix,
                              feature_names_list,
                              all_learned_component_names,
                              n_top = 5):
    """
    Collect the highest-weighted terms of every row of H_matrix.

    Args:
        seed_words_for_category: Not used by this function; kept so existing
                                 callers keep working.
        H_matrix: 2-D array whose rows are components and whose columns are terms.
        feature_names_list: Term for each column of H_matrix.
        all_learned_component_names: Name for each row of H_matrix.
        n_top: How many terms to keep per component.

    Returns:
        dict: Component name -> list of terms, strongest weight first.
    """
    top_terms = {}
    for row_number, weights in enumerate(H_matrix):
        ascending_order = weights.argsort()
        # Take the last n_top column indices, then flip so the largest weight comes first
        strongest_first = ascending_order[-n_top:][::-1]
        component_name = all_learned_component_names[row_number]
        top_terms[component_name] = [feature_names_list[column] for column in strongest_first]
    return top_terms

# Map each aspect name to the top terms of its learned NMF component
aspect_top_terms = get_top_terms_for_aspects(
    seed_words_for_category = aspect_seed_words,
    H_matrix = H,
    feature_names_list = feature_names,
    all_learned_component_names = learned_aspect_names,
)

In [None]:
# Show the learned top terms, one line per aspect
for aspect in aspect_top_terms:
    terms = aspect_top_terms[aspect]
    print(f"Top terms for {aspect}: {terms}")

In [None]:
# Load the tokenizer and classifier used for Aspect-Based Sentiment Analysis
absa_checkpoint = "yangheng/deberta-v3-base-absa-v1.1"
absa_tokenizer = AutoTokenizer.from_pretrained(absa_checkpoint)
absa_model = AutoModelForSequenceClassification.from_pretrained(absa_checkpoint)

# Load a traditional (whole-text) Sentiment Analysis pipeline
sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = pipeline(
    "sentiment-analysis",
    model = sentiment_model_path,
    tokenizer = sentiment_model_path,
)

In [None]:
# Pick the compute device: MPS is checked first, then CUDA, with CPU as the fallback
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")
# Move the ABSA model onto the selected device
absa_model.to(device)

In [None]:
def analyze_review_by_category(review_text, aspect_top_terms_map, tokenizer, model, current_device, internal_batch_size = 16, tokenizer_max_length = 512):
    """
    Analyzes a single review to determine sentiment for predefined aspect categories
    based on the guided NMF top terms.

    Every top term that occurs in the review (case-insensitive substring match)
    is scored by the model as a (review, term) pair; the class probabilities of
    all matched terms in a category are averaged to give that category's label.

    Args:
        review_text: The Steam game review. Anything that is not a string
                     (for example the NaN pandas uses for a missing review)
                     is treated as a review with no matching terms.
        aspect_top_terms_map: Dictionary where keys are aspect categories and values
                                     are lists of top NMF terms for that category.
                                     Example: {'gameplay': ['mechanics', 'controls'], ...}
        tokenizer: The ABSA tokenizer.
        model: The ABSA model.
        current_device: The device to run the model on ('mps', 'cuda', or 'cpu').
        internal_batch_size: How many (sentence, term) pairs to process at once by the model.
        tokenizer_max_length: max_length passed to the tokenizer; longer
                              (review, term) pairs are truncated.

    Returns:
        pd.Series: A Series with aspect categories as index and predicted sentiment
                   ('positive', 'negative', or 'neutral' if terms are found, else None).
    """
    categories = list(aspect_top_terms_map.keys())

    # Guard against missing reviews: calling .lower() on a float NaN would raise
    if not isinstance(review_text, str):
        return pd.Series({category: None for category in categories}, dtype=object)

    # Lower-case the review once instead of once per candidate term.
    # NOTE(review): this is a substring test, so a short term such as 'art'
    # also matches inside 'start' - confirm whether whole-word matching is wanted.
    review_lower = review_text.lower()
    matched_pairs = [
        (term, category)
        for category, terms in aspect_top_terms_map.items()
        for term in terms
        if term.lower() in review_lower
    ]

    if not matched_pairs:
        return pd.Series({category: None for category in categories}, dtype=object)

    # category -> list of per-term probability vectors
    probs_by_category = {}

    for start in range(0, len(matched_pairs), internal_batch_size):
        batch = matched_pairs[start:start + internal_batch_size]
        batch_terms = [term for term, _ in batch]

        # Each model input is the full review paired with one matched term
        encoded_batch = tokenizer(
            [review_text] * len(batch),
            batch_terms,
            return_tensors="pt",
            padding = True,
            truncation = True,
            max_length = tokenizer_max_length
        )
        encoded_batch = {k: v.to(current_device) for k, v in encoded_batch.items()}

        with torch.no_grad():
            outputs = model(**encoded_batch)
            # Probabilities are read as [negative, neutral, positive]
            # (assumes this is the model's label order - verify against its config)
            probabilities_batch = F.softmax(outputs.logits, dim=1).cpu().numpy()

        for (_, category), term_probs in zip(batch, probabilities_batch):
            probs_by_category.setdefault(category, []).append(term_probs)

    # Aggregate sentiments for each original aspect category
    class_labels = ["negative", "neutral", "positive"]
    final_category_sentiments = {}
    for category in categories:
        probs_for_this_category = probs_by_category.get(category)

        if not probs_for_this_category:
            final_category_sentiments[category] = None # No terms for this category found/analyzed
            continue

        # Average the probabilities across all found terms for this category
        avg_probs_for_category = np.mean(np.array(probs_for_this_category), axis=0)
        neg_score = avg_probs_for_category[0]
        pos_score = avg_probs_for_category[2]

        # The decision is positive vs. negative; only an exact tie falls back to
        # the most probable of the three classes (which may then be neutral)
        if pos_score > neg_score:
            final_sentiment = "positive"
        elif neg_score > pos_score:
            final_sentiment = "negative"
        else:
            final_sentiment = class_labels[int(np.argmax(avg_probs_for_category))]

        final_category_sentiments[category] = final_sentiment

    return pd.Series(final_category_sentiments, dtype = object)

In [None]:
# Parameters that can be changed depending on computer capacity
chunk_size = 5
internal_model_batch_size = 4
tokenizer_max_len = 256

all_results_list = []

print(f"Using device: {device}")
print(f"DataFrame chunk_size: {chunk_size}")
print(f"Internal model batch_size: {internal_model_batch_size}")
print(f"Tokenizer max_length: {tokenizer_max_len}")

total_chunks = (len(reviews_df) - 1) // chunk_size + 1

# For loop to process sentiment in chunks
for i in range(0, len(reviews_df), chunk_size):
    # Positional slice of the next chunk_size reviews
    review_chunk = reviews_df['review'].iloc[i:i + chunk_size]

    current_chunk_results = []
    for review_text in review_chunk: # Process one review at a time from the small chunk
        sentiments = analyze_review_by_category(
            review_text,
            aspect_top_terms,
            absa_tokenizer,
            absa_model,
            device, # Pass the torch.device object
            internal_batch_size = internal_model_batch_size,
            # Apply the configured limit; without it the function default (512) is used
            tokenizer_max_length = tokenizer_max_len
        )
        current_chunk_results.append(sentiments)

    # Convert list of Series to DataFrame for this chunk
    if current_chunk_results:
        chunk_sentiments_df = pd.DataFrame(current_chunk_results)
        all_results_list.append(chunk_sentiments_df)

    # Explicitly try to free memory after each major chunk
    if device.type == 'mps':
        torch.mps.empty_cache()
    elif device.type == 'cuda':
        torch.cuda.empty_cache()
    gc.collect()

    print(f"Processed chunk {i//chunk_size + 1} of {total_chunks}")

if all_results_list:
    # Every chunk frame is indexed from 0, so renumber the rows while stacking
    df_final_aspect_sentiments = pd.concat(all_results_list, ignore_index = True)
else:
    # No reviews processed: keep an empty frame with one column per aspect
    empty_cols = list(aspect_top_terms.keys())
    df_final_aspect_sentiments = pd.DataFrame(columns = empty_cols)

df_final_aspect_sentiments.head()

In [None]:
# Give both frames a fresh 0..n-1 index so rows line up by position, then put
# each review text next to its per-aspect sentiments
reviews_df = reviews_df.reset_index(drop = True)
df_final_aspect_sentiments = df_final_aspect_sentiments.reset_index(drop = True)
df = pd.concat(
    [reviews_df['review'], df_final_aspect_sentiments],
    axis = 'columns',
)
df.head(100)

In [None]:
# --- Overall sentiment per aspect category across all reviews ---
aggregated_category_sentiments = {}

aspect_categories = df_final_aspect_sentiments.columns

for category in aspect_categories:
    # Tally each sentiment label, ignoring reviews with no value for this category
    sentiment_counts = df_final_aspect_sentiments[category].value_counts(dropna = True)
    total_valid_reviews = sentiment_counts.sum()

    # Nothing to summarise when no review mentioned this category
    if total_valid_reviews == 0:
        aggregated_category_sentiments[category] = 'No data'
        continue

    # Share of each label among the reviews that have a value
    label_shares = {
        label: sentiment_counts.get(label, 0) / total_valid_reviews
        for label in ('positive', 'negative', 'neutral')
    }

    # A label held by more than half of the reviews wins outright
    majority_label = next(
        (label for label, share in label_shares.items() if share > 0.5),
        None,
    )

    if majority_label is not None:
        aggregated_category_sentiments[category] = majority_label
    else:
        # No majority: fall back to the most frequent label (value_counts sorts
        # by count, largest first, so it is the first index entry)
        aggregated_category_sentiments[category] = sentiment_counts.index[0]

# Wrap the result in a Series so it prints as a table
overall_sentiments_summary = pd.Series(aggregated_category_sentiments)

print("\nOverall Sentiment Summary for Each Category:")
print(overall_sentiments_summary)