<a href="https://colab.research.google.com/github/Trexroy1010/research/blob/main/steam_analysis_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**LOAD THE DATASET**

In [None]:
import pandas as pd

In [None]:
# Location of the review export; the path presumably sits on a mounted Google Drive (verify before running).
reviews_csv_path = "/content/drive/MyDrive/updated_latest_reviews_fornextresearch.csv"
df = pd.read_csv(reviews_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,appid,review,voted_up,votes_up,votes_funny,timestamp_created,author_playtime_forever,name,price,release_date,sentiment
0,1938090,very fun,True,0,0,1756917710,551,Call of Duty: Modern Warfare II,6999,,positive
1,1938090,asdfghjk,True,0,0,1756915719,707,Call of Duty: Modern Warfare II,6999,,neutral
2,1938090,game has been dead since cold war and is more ...,False,0,0,1756913382,4109,Call of Duty: Modern Warfare II,6999,,negative
3,1938090,Awesome gameplay,True,0,0,1756912147,5307,Call of Duty: Modern Warfare II,6999,,positive
4,1938090,Shit Game,True,0,0,1756909952,469,Call of Duty: Modern Warfare II,6999,,negative


In [None]:
!pip install bertopic umap hdbscan sentence-transformers


Collecting bertopic
  Downloading bertopic-0.17.4-py3-none-any.whl.metadata (24 kB)
[31mERROR: Could not find a version that satisfies the requirement umap (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for umap[0m[31m
[0m

In [None]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116039 entries, 0 to 116038
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   appid                    116039 non-null  int64  
 1   review                   115560 non-null  object 
 2   voted_up                 116039 non-null  bool   
 3   votes_up                 116039 non-null  int64  
 4   votes_funny              116039 non-null  int64  
 5   timestamp_created        116039 non-null  int64  
 6   author_playtime_forever  116039 non-null  int64  
 7   name                     116039 non-null  object 
 8   price                    116039 non-null  int64  
 9   release_date             0 non-null       float64
 10  sentiment                116039 non-null  object 
dtypes: bool(1), float64(1), int64(6), object(3)
memory usage: 9.0+ MB


**DATA MAPPING AND PREPROCESSING**

In [None]:
def price_bin(price):
    """Bucket a numeric price into "low", "mid" or "high".

    price < 1000          -> "low"
    1000 <= price < 4000  -> "mid"
    anything else         -> "high"

    The lower bound of "mid" is inclusive, so a price of exactly 1000 is
    "mid" rather than falling through to "high". Values that fail both
    comparisons (for example NaN) end up in "high".
    """
    if price < 1000:
        return "low"
    if price < 4000:
        return "mid"
    return "high"
# Label every row with its price tier; prices are cast to float before bucketing.
prices_as_float = df['price'].astype(float)
df['price_bin'] = prices_as_float.apply(price_bin)

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch NLTK's English stop-word list and hold it as a set for membership tests.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


def clean_text(text):
    """Normalise one review string for topic modelling.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, and the remaining whitespace-separated tokens are
    kept only if they are longer than two characters and are not stop words.
    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)   # remove non-letters
    kept_tokens = []
    for token in letters_only.split():
        if len(token) > 2 and token not in stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)


# Only rows that actually have review text are cleaned; on assignment the
# result is aligned by index, so rows without a review hold NaN in clean_review.
reviews_present = df["review"].dropna()
df["clean_review"] = reviews_present.apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**TOPIC DETECTION USING BERTopic**

In [None]:
# Upper bound on how many cleaned reviews are drawn from each price tier.
MAX_REVIEWS_PER_BIN = 50000


def sample_clean_reviews(price_label, n=MAX_REVIEWS_PER_BIN, seed=42):
    """Draw up to ``n`` non-missing cleaned reviews from one price tier.

    Series.sample raises ValueError when asked for more rows than exist, or
    for any rows from an empty tier, so the request is capped at the number
    of reviews actually available. A tier with at least ``n`` reviews yields
    exactly ``n`` of them; an empty tier yields an empty Series.
    """
    pool = df.loc[df['price_bin'] == price_label, 'clean_review'].dropna()
    return pool.sample(min(n, len(pool)), random_state=seed)


reviews_low  = sample_clean_reviews("low")
reviews_mid  = sample_clean_reviews("mid")
reviews_high = sample_clean_reviews("high")


ValueError: a must be greater than 0 unless no samples are taken

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer


In [None]:
# Sentence-embedding checkpoint shared by every BERTopic model trained below.
embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedding_model = SentenceTransformer(embedding_model_name)

In [None]:
def train_topic_model(reviews, label):
    """Fit a BERTopic model on one group of reviews and print its leading topics.

    Parameters
    ----------
    reviews : object exposing ``.tolist()``
        The documents to model; the calls below pass pandas Series of
        cleaned review strings.
    label : str
        Text inserted into the printed heading only.

    Returns
    -------
    tuple
        ``(topic_model, topics, probs)``: the fitted model and the two
        values returned by ``fit_transform``.
    """
    topic_model = BERTopic(embedding_model=embedding_model, min_topic_size=200, verbose=True)
    topics, probs = topic_model.fit_transform(reviews.tolist())
    print(f"Top topics for {label} games : ")
    print(topic_model.get_topic_info().head())
    return topic_model, topics, probs


# One model per price tier; the probabilities are not used afterwards.
model_low, topic_low, _ = train_topic_model(reviews_low, "LOW PRICE")
model_mid, topic_mid, _ = train_topic_model(reviews_mid, "MID PRICE")
model_high, topic_high, _ = train_topic_model(reviews_high, "HIGH PRICE")

**TOPIC DETECTION USING A MANUAL KEYWORD DICTIONARY**

In [None]:
# Topic name -> keywords/phrases whose presence in a review tags it with that topic.
categories = {
    "graphics": ["graphics", "visuals", "art style", "animation", "fps", "frame rate", "resolution", "texture", "lighting"],
    "gameplay": ["gameplay", "mechanics", "combat", "controls", "movement", "boss fight", "jumping", "exploration", "fighting"],
    "story": ["story", "plot", "narrative", "dialogue", "ending", "cutscene", "character", "writing", "lore"],
    "microtransactions": ["microtransaction", "pay to win", "p2w", "dlc", "lootbox", "mtx", "season pass", "cash grab"],
    "balance": ["balance", "nerf", "buff", "op", "broken", "difficulty", "challenging", "unfair", "grind"],
    "bugs_performance": ["bug", "glitch", "crash", "lag", "stutter", "fps drop", "optimize", "performance", "freeze"],
    "multiplayer": ["multiplayer", "coop", "co-op", "online", "server", "matchmaking", "team", "pvp"],
    "audio_music": ["soundtrack", "music", "audio", "voice acting", "sound effect", "voices"],
    "value_money": ["price", "worth", "value", "expensive", "cheap", "overpriced", "sale", "refund"],
    "progression": ["leveling", "xp", "progression", "skills", "abilities", "talents", "perk tree", "upgrade"],
    "community": ["community", "mods", "steam workshop", "player base", "toxic", "friendly"],
    "immersion": ["immersion", "atmosphere", "world", "environment", "realistic", "roleplay"],
}


import re
from collections import defaultdict

# One compiled pattern per category, built once when this cell runs: an
# alternation of the category's keywords wrapped in word boundaries. Keywords
# are escaped so they are always matched literally. Re-run this cell after
# editing `categories` so the patterns are rebuilt.
_CATEGORY_PATTERNS = {
    cat: re.compile(r"\b(?:" + "|".join(re.escape(w) for w in words) + r")\b")
    for cat, words in categories.items()
}


def categorize_review(review):
    """Return the topic names whose keywords occur in ``review``.

    Matching is case-insensitive (the review is lower-cased) and respects
    word boundaries, so "op" does not match inside "develop". Each topic
    appears at most once, in the order the topics are declared in
    ``categories``. Non-string input (for example a missing review) yields
    an empty list.
    """
    if not isinstance(review, str):
        return []
    lowered = review.lower()
    return [cat for cat, pattern in _CATEGORY_PATTERNS.items() if pattern.search(lowered)]

# Tag only rows that have review text; after index alignment the rows
# without a review hold NaN (not a list) in 'categories'.
reviews_with_text = df["review"].dropna()
df["categories"] = reviews_with_text.apply(categorize_review)



In [None]:
# One entry per (review, category) pair, then tally how often each category occurs.
category_labels = df.explode("categories")["categories"]
category_labels.value_counts()


In [None]:
# One row per (review, category) pair.
df_exploded = df.explode("categories")
# Pair counts per price tier; tier/category combinations that never occur
# come out of unstack() as NaN and are replaced by 0.
pair_sizes = df_exploded.groupby(["price_bin", "categories"]).size()
category_price = pair_sizes.unstack().fillna(0)


In [None]:
# Sentiment label -> score on a 1 (most negative) to 5 (most positive) scale.
# "neutral" is included because the dataset's sentiment column contains that
# label; a label without an entry here maps to NaN and drops out of every
# numeric analysis.
sentiment_map = {
    "positive": 5,
    "very positive": 5,
    "mostly positive": 4,
    "mixed": 3,
    "neutral": 3,
    "mostly negative": 2,
    "negative": 1,
    "very negative": 1
}

# Translate each sentiment label to its score; labels that are not keys of sentiment_map become NaN.
df["numeric_sentiment"] = df["sentiment"].map(sentiment_map)


**TOPIC FINDING RESULTS**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Re-explode df here: numeric_sentiment is added to df after the earlier
# df_exploded snapshot was taken, so that snapshot does not have the column.
df_exploded = df.explode("categories")
mean_sentiment_by_category = (
    df_exploded.groupby("categories")["numeric_sentiment"].mean().sort_values()
)
ax = mean_sentiment_by_category.plot(kind="barh")
ax.set_xlabel("Mean Numeric Sentiment")
ax.set_ylabel("Category")
ax.set_title("Mean Numeric Sentiment per Category")
plt.show()


In [None]:
# 'categories' holds a list per review, so expand it to one row per category first.
df_exploded = df.explode("categories")

# Tally reviews per category and draw the bars from smallest to largest.
reviews_per_category = df_exploded["categories"].value_counts().sort_values()
ax = reviews_per_category.plot(kind="barh")
ax.set_xlabel("Number of Reviews")
ax.set_ylabel("Category")
ax.set_title("Number of Reviews per Category")
plt.show()


In [None]:
# One row per (review, category) pair, discarding rows that carry no category.
exploded_all = df.explode("categories")
df_exploded = exploded_all.dropna(subset=['categories'])


In [None]:
# Rows: price tiers; columns: categories; cells: pair counts (0 where a combination never occurs).
pair_counts = df_exploded.groupby(["price_bin", "categories"]).size()
category_price_counts = pair_counts.unstack(fill_value=0)


In [None]:
# For each price tier, the category column holding the largest count.
most_talked_about_category = category_price_counts.idxmax(axis="columns")


In [None]:
# Headline category per tier, followed by the full count table.
print(
    "\nMost talked about topic in each price segment:",
    most_talked_about_category,
    "\n--- Detailed Counts ---",
    category_price_counts,
    sep="\n",
)

In [None]:
# Full tier-by-category count table.
print("\n--- Detailed Counts ---", category_price_counts, sep="\n")


ValueError: heatmanp is not a valid plot kind Valid plot kinds: ('line', 'bar', 'barh', 'kde', 'density', 'area', 'hist', 'box', 'pie', 'scatter', 'hexbin')


In [None]:
import pandas as pd

from collections import Counter

# Prerequisites: `df` must already have the 'price_bin' and 'categories'
# columns, and the `categories` keyword dictionary must be defined, as done
# in the preprocessing and manual-topic cells earlier in this notebook.

# Keep only rows whose 'categories' value is a list; any other value cannot
# be iterated in Step 2. .copy() makes the result an independent frame so
# later column assignments on `df` do not trigger SettingWithCopyWarning.
df = df[df['categories'].apply(lambda x: isinstance(x, list))].copy()

# Step 1: Calculate the total number of reviews for each price segment.
total_reviews_per_bin = df['price_bin'].value_counts().sort_index()

print("Total number of reviews per price segment:")
print(total_reviews_per_bin)
print("\n")

# Step 2: Calculate the number of reviews that mention a specific topic.
topic_names = list(categories.keys())
reviews_with_topic_by_bin = {}

for name, group in df.groupby('price_bin'):
    # set(cats) makes a review count once per topic even if its list repeats a label.
    mentions = Counter(cat for cats in group['categories'] for cat in set(cats))
    reviews_with_topic_by_bin[name] = [mentions[cat] for cat in topic_names]

# Rows: price segments; columns: topics in the order declared in `categories`.
df_reviews_with_topics = pd.DataFrame.from_dict(
    reviews_with_topic_by_bin, orient='index', columns=topic_names
)
df_reviews_with_topics.index.name = 'price_bin'

print("Number of reviews that contain each topic (not mentions):")
print(df_reviews_with_topics)
print("\n")

# Step 3: Calculate the final percentage.
# Division aligns on the price_bin index, so each row is divided by its own segment total.
df_percentages = df_reviews_with_topics.div(total_reviews_per_bin, axis=0) * 100

print("--- Percentage of Reviews per Price Segment that Mention a Topic ---")
print(df_percentages.round(2))

In [None]:
# Number of whitespace-separated tokens in each cleaned review. A missing
# cleaned review counts as 0 words; converting NaN with str() would instead
# count it as the single token "nan".
df["word_count"] = df["clean_review"].fillna("").str.split().str.len()


In [None]:
# Keep only reviews whose cleaned text has more than 100 words (a lower
# cut-off such as > 20 would be looser and keep more reviews).
# .copy() gives an independent frame so columns can be added to it later
# without SettingWithCopyWarning.
df_filtered = df[df["word_count"] > 100].copy()


In [None]:
import statsmodels.api as sm
import numpy as np

# Encode the price tiers as an ordinal variable.
price_map = {"low": 1, "mid": 2, "high": 3}
# assign() returns a new frame, so the column is never written into a slice of df.
df_filtered = df_filtered.assign(price_num=df_filtered["price_bin"].map(price_map))

# Rows missing either the outcome or the predictor cannot enter the regression.
data = df_filtered.dropna(subset=["numeric_sentiment", "price_num"])

# Independent variable (price tier) plus an intercept column.
X = sm.add_constant(data["price_num"])
y = data["numeric_sentiment"]

# Fit OLS of sentiment on price tier and report the results.
model = sm.OLS(y, X).fit()
print(model.summary())


In [None]:
# Column dtypes and non-null counts for the length-filtered frame.
df_filtered.info()

In [None]:
# Regression sample: length-filtered reviews that have a numeric sentiment score.
df_reg = df_filtered.dropna(subset=["numeric_sentiment"])


In [None]:
# Missing-value count for each column used as a regression predictor.
regression_columns = ["price_num", "author_playtime_forever", "votes_up", "votes_funny", "word_count"]
print(df_reg[regression_columns].isna().sum())


In [None]:
# df_reviews_with_topics is the per-price-segment counts table built above
# (rows: price segments, columns: topics); summing over rows gives the
# number of reviews per topic across all segments.
topic_counts = df_reviews_with_topics.sum(axis=0)  # sum over price_bin
topic_counts.sort_values().plot(kind='barh')
plt.xlabel("Number of Reviews")
plt.ylabel("Topic")
plt.title("Number of Reviews per Topic")
plt.show()


In [None]:
# df_percentages is the per-segment percentage table built above
# (rows: price segments, columns: topics).
df_percentages.plot(kind='bar', stacked=True, figsize=(10,6))
plt.ylabel("Percentage of Reviews")
plt.xlabel("Price Segment")
plt.title("Topic Distribution by Price Segment")
# Place the legend outside the axes so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
import seaborn as sns

# Heatmap of the per-segment percentage table built above
# (rows: price segments, columns: topics), annotated to one decimal place.
sns.heatmap(df_percentages, annot=True, fmt=".1f", cmap="YlGnBu")
plt.ylabel("Price Segment")
plt.xlabel("Topic")
plt.title("Percentage of Reviews by Topic and Price Segment")
plt.show()


In [None]:
import statsmodels.api as sm
import numpy as np

# Regression sample: rows that have a numeric sentiment score.
df_reg = df_filtered.dropna(subset=["numeric_sentiment"])

# Predictors entering the model.
features = ["price_num", "author_playtime_forever", "votes_up", "votes_funny", "word_count"]

# Treat infinities as missing, then drop every row with a missing predictor.
X = df_reg[features].replace([np.inf, -np.inf], np.nan).dropna()
# Keep the outcome aligned with the rows that survived.
y = df_reg.loc[X.index, "numeric_sentiment"]

# Intercept column for OLS.
X = sm.add_constant(X)

# Fit and report.
model = sm.OLS(y, X).fit()
print(model.summary())


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# VIF diagnoses collinearity in a specific design matrix, so it is computed
# on the same predictors and the same sample (df_reg) as the multivariate
# regression, not on the raw price column of the unfiltered frame.
vif_features = ["price_num", "author_playtime_forever", "votes_up", "votes_funny", "word_count"]
X = df_reg[vif_features].replace([np.inf, -np.inf], np.nan).dropna()

# Add constant for intercept
X = sm.add_constant(X)

# One VIF per column of the design matrix, including the constant.
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)


In [None]:
# Keep rows with numeric_sentiment >= 1.5; rows with NaN sentiment fail the
# comparison and are dropped as well. .copy() gives an independent frame so
# the column assignments that follow do not trigger SettingWithCopyWarning.
# NOTE(review): with the integer scores in sentiment_map this removes every
# score-1 review before the tier averages are computed; confirm that is intended.
df = df[df['numeric_sentiment'] >= 1.5].copy()
def price_group(price):
    """Map a numeric price to its tier name.

    Below 1000 is "low"; from 1000 up to but excluding 4000 is "mid";
    every other value is "high".
    """
    if price < 1000:
        return "low"
    return "mid" if 1000 <= price < 4000 else "high"

# Tier label per row (prices cast to int first), then mean sentiment per tier.
prices_as_int = df['price'].astype(int)
df['price_group'] = prices_as_int.apply(price_group)
avg_sentiment_tier = df.groupby('price_group')['numeric_sentiment'].mean()
print(avg_sentiment_tier)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Scale prices by 1/100 for display without overwriting df['price']:
# overwriting would rescale the column again on every re-run of this cell
# and change what the price-tier thresholds see.
# NOTE(review): prices look like minor currency units (e.g. 6999); confirm.
price_scaled = df['price'] / 100

# Line plot: mean sentiment at each distinct price.
avg_sentiment_price = df['numeric_sentiment'].groupby(price_scaled).mean()
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(avg_sentiment_price.index, avg_sentiment_price.values)
ax.set_xlabel("Price")
ax.set_ylabel("Average Numeric Sentiment")
ax.set_title("Average Sentiment vs Price")
plt.show()

# Violin plot: sentiment distribution per price tier, with the tiers drawn
# in ascending price order instead of the order they happen to appear in the data.
fig, ax = plt.subplots(figsize=(8,6))
sns.violinplot(
    x='price_group',
    y='numeric_sentiment',
    data=df,
    order=["low", "mid", "high"],
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Distribution of Numeric Sentiment by Price Tier")
ax.set_ylabel("Numeric Sentiment")
ax.set_xlabel("Price Tier")
plt.show()
