In [None]:
!pip install bertopic[visualization] sentence-transformers umap-learn hdbscan

In [None]:
# NOTE(review): bertopic and sentence-transformers are also requested by the
# install cell above, so this cell looks redundant - confirm before removing.
!pip install bertopic sentence-transformers

In [None]:
# streamlit runs the demo app and pyngrok opens the tunnel to it; both are used
# by the deployment cells near the end of this notebook.
!pip install streamlit pyngrok

In [None]:
#Loading Data 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import bertopic
import os
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sentence_transformers import SentenceTransformer
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import shap
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_selection import chi2
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation
import streamlit as st
import joblib
from pyngrok import ngrok, conf
import threading
import time
import subprocess
# NLTK corpora used later in this notebook: WordNet backs WordNetLemmatizer and
# the stopwords corpus backs stopwords.words('english') in the tokenizer cell.
nltk.download('wordnet')
nltk.download('stopwords')
csv_path = '/kaggle/input/steam-reviews/dataset.csv'

# Read only the first 10,000 rows to inspect how review_score is encoded
# before loading the large frame.
sample = pd.read_csv(csv_path, nrows=10000)

print("Unique review_score values:")
print(sample['review_score'].unique())

print("\nDescriptive stats:")
print(sample['review_score'].describe())

In [None]:
#Creating Dataframe for Sentiment analysis

# nrows loads the same leading 4,000,000 rows that the first chunk of a chunked
# reader would return, without leaving an open, half-consumed reader behind.
steam_df = pd.read_csv(csv_path, nrows=4000000, low_memory=False)

In [None]:
#Ultra-Expanded Stopword List
# Starts from scikit-learn's ENGLISH_STOP_WORDS and adds the hand-picked terms
# below. Several words appear in more than one group; that is harmless because
# the container is a set.

custom_stopwords = set(ENGLISH_STOP_WORDS)

#Extending with large filler, function, and gaming-related terms
custom_stopwords.update({
    #Core filler verbs and common speech words
    'got','get','gets','getting','say','says','said','know','knows','knew','think','thinks','thought',
    'see','seen','saw','seem','seems','seemed','look','looks','looked','looking','felt','feel','feels',
    'make','makes','made','doing','did','done','do','does','dont','didnt','cant','couldnt','wont','wouldnt',
    'shouldnt','isnt','arent','wasnt','werent','am','are','is','be','been','being','have','has','had',
    'having','become','became','becoming','try','tries','tried','trying','want','wants','wanted','think',
    'thought','believe','believes','believed','guess','guessed','guessing','say','saying','told','telling',

    #Pronouns, contractions, and small words
    'i','me','my','mine','myself','we','us','our','ours','ourselves','you','your','yours','yourself','yourselves',
    'he','him','his','himself','she','her','hers','herself','they','them','their','theirs','themselves','it','its','itself',
    'who','whom','whose','which','that','this','these','those','here','there','where','when','why','what','how',
    'any','anything','anyone','every','everything','everyone','something','someone','nothing','nobody',
    'one','ones','two','three','four','five','six','seven','eight','nine','ten','many','much','few','several','some','most','more','less',

    #Modal verbs, determiners, and connectives
    'will','shall','should','could','can','may','might','must','ought','would',
    'and','or','but','because','since','until','while','if','though','although','unless','whether',
    'however','therefore','thus','hence','besides','anyway','anyhow','either','neither','both','each','every',
    'all','another','again','ever','never','always','sometimes','often','usually','rarely','hardly',

    #Adverbs, intensifiers, and softeners
    'just','really','very','pretty','quite','maybe','perhaps','basically','literally','kind','sort','bit',
    'somewhat','somehow','almost','nearly','around','back','still','yet','already','actually','especially',
    'probably','definitely','certainly','clearly','obviously','apparently','surely','simply','mostly','mostly',

    #Interjections & conversational fluff
    'lol','omg','wow','haha','hehe','uh','um','hmm','huh','nah','yeah','yep','nope','okay','ok','alright',
    'right','yup','yo','pls','plz','thank','thanks','thx','ty','np','btw','bruh','bro','dude','guy','guys',

    #Steam/game-domain terms
    # NOTE(review): this group also removes evaluative words such as 'fun',
    # 'good', 'great', 'boring', which later sentiment features can then never see.
    'game','games','gaming','steam','player','players','play','played','playing','fun','good','great',
    'awesome','amazing','boring','nice','cool','experience','experiences',
    'enjoy','enjoyed','recommend','recommended','recommendation','review','reviews','positive','negative',
    'early','access','update','updated','version','release','released',
    'price','buy','bought','purchase','money','cheap','sale','discount','deal','content','extra',
    'fix','bug','bugs','buggy','issues','crash','crashes','fps','frame','frames','smooth',
    'lag','slow','fast','visual','visuals','sound','music','audio','voice','story','plot',
    'character','characters','mission','missions','quest','quests','level','levels','map','maps','campaign',

    #Temporal and frequency words
    'time','times','hour','hours','minute','minutes','day','days','week','weeks','month','months','year','years',
    'today','yesterday','tomorrow','recently','soon','now','then','before','after','later','ago','moment','long',
    'short','early','late','end','ended','start','started','begin','began','finish','finished','complete','completed',

    #Generic adjectives (neutral)
    'bad','good','better','best','worst','nice','cool','decent','fine','okay','perfect','greatest',
    'alright','solid','classic','modern','simple','complex','basic','standard','average',
    'new','old','same','different','original','weird','strange','random',
    'interesting','unique','generic','specific',

    #Extra redundant game phrases
    'team','dev','devs','developer','developers','studio','company','community','staff','people',
    'person','everyone','anyone','someone','personally','honestly','basically','literally','seriously',
    'overall','anyway','anyways','again','etc','etcetera','stuff','thing','things','everything','something'
})

# Freeze the stopword list as it stands here. The name `custom_stopwords` is
# rebound to a much smaller set in the tokenizer cell further down, which would
# silently change this function if the cleaning step were re-run afterwards.
_ULTRA_STOPWORDS = frozenset(custom_stopwords)

def clean_text_ultra(text):
    """Aggressive cleaning for Steam review data.

    Lowercases the input (after coercing it to ``str``), replaces every
    character that is not a-z or whitespace with a space, then drops tokens
    that are stopwords or have two characters or fewer.

    Returns the surviving tokens joined by single spaces ('' if none survive).
    """
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', ' ', text)  # keep letters only
    return ' '.join(w for w in text.split() if len(w) > 2 and w not in _ULTRA_STOPWORDS)

# Coerce to str first so missing reviews are cleaned like any other text.
review_text_as_str = steam_df['review_text'].astype(str)
steam_df['clean_text'] = review_text_as_str.apply(clean_text_ultra)

stopword_total = len(custom_stopwords)
print(f"✅ Cleaned with ultra stopword list ({stopword_total}+ terms)")
# Spot-check five random cleaned reviews.
print(steam_df['clean_text'].sample(5).tolist())


In [None]:
#Cleaning and dropping nulls
# Rows lacking either the review text or its score cannot be labeled.
required_columns = ['review_text', 'review_score']
steam_df = steam_df.dropna(subset=required_columns)

#Creating Sentiment labeling for Reviews
def make_sentiment_label(x):
    """Map a raw review score to a sentiment label.

    Accepted encodings and their mapping:
      * bool            -> True: 1, False: -1
      * value in [0, 1] -> 1 if >= 0.5 else -1
      * value in [-1, 0) -> -1
      * value in (1, 5] -> 1 if >= 3 else -1  (star-style rating)

    Numeric strings are converted with ``float`` before the range checks.

    Returns 1 or -1, or ``np.nan`` when the value is missing, cannot be
    converted to a number, or falls outside every supported range.
    """
    if isinstance(x, bool):
        return 1 if x else -1
    # Convert first, then test for NaN: np.isnan raises on strings, which
    # previously turned every numeric string into NaN.
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan
    if np.isnan(value):
        return np.nan
    if 0 <= value <= 1:
        return 1 if value >= 0.5 else -1
    if -1 <= value < 0:
        return -1
    if 1 < value <= 5:
        return 1 if value >= 3 else -1
    return np.nan

#Applying labeling
sentiment_labels = steam_df['review_score'].apply(make_sentiment_label)
steam_df['sentiment'] = sentiment_labels
# Scores that could not be mapped come back as NaN; discard those rows.
steam_df = steam_df.dropna(subset=['sentiment'])

label_counts = steam_df['sentiment'].value_counts()
print("\n✅ Sentiment label distribution:")
print(label_counts)


In [None]:
#Base TF-IDF Matrix
text_col = 'clean_text' if 'clean_text' in steam_df.columns else 'review_text'
df = steam_df.dropna(subset=[text_col, 'sentiment']).copy()

# Uni- and bigrams, ignoring terms in >90% of reviews or in fewer than 5,
# capped at 30,000 features.
base_tfidf_params = {
    'stop_words': 'english',
    'max_df': 0.9,
    'min_df': 5,
    'ngram_range': (1, 2),
    'max_features': 30000,
}
vectorizer = TfidfVectorizer(**base_tfidf_params)

y = steam_df['sentiment']
X = vectorizer.fit_transform(steam_df['clean_text'])

print("✅ Step 1: TF-IDF shape:", X.shape)


In [None]:

#Unique Sentiment Word Filtering(using Chi2)

# Chi-square statistic of every TF-IDF feature against the sentiment label
chi2_scores, p_values = chi2(X, y)
feature_names = np.array(vectorizer.get_feature_names_out())

chi2_table = {'feature': feature_names, 'chi2': chi2_scores, 'p': p_values}
chi2_df = pd.DataFrame(chi2_table).sort_values(by='chi2', ascending=False)

# Features in the bottom 20% of chi-square scores count as sentiment-neutral
weak_cutoff = chi2_df['chi2'].quantile(0.20)
is_weak = chi2_df['chi2'] < weak_cutoff
neutral_words = chi2_df[is_weak]['feature'].tolist()

# Hand-picked fillers that are removed regardless of their score
manual_fillers = {
    'like','don','lot','worth','free','way','thing','stuff','maybe','probably','seems','look','feels',
    'need','needed','needs','makes','get','got','good','bad','great','fun','nice','ok','okay','alright',
    'cool','little','time','going','play','experience','enjoy','content','review','recommend',
    'buy','worth','money','pay','paid','cheap','version','feature','features'
}

# Union of the statistical and the manual lists
filler_set = set(neutral_words) | manual_fillers

print(f"🧹 Removing {len(filler_set)} neutral/filler words from vocabulary.")


In [None]:
#Building first filtered TF-IDF with only unique words
# Keep the surviving terms in their original order.
filtered_vocab = [w for w in feature_names if w not in filler_set]

# The vocabulary comes from a (1, 2)-gram vectorizer, so it contains bigrams.
# Without a matching ngram_range this vectorizer would only emit unigrams and
# every bigram column would stay all-zero.
vectorizer_filtered = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    vocabulary=filtered_vocab
)
X_filtered = vectorizer_filtered.fit_transform(steam_df['clean_text'])
print("✅ Re-vectorized dataset shape:", X_filtered.shape)



In [None]:
#Deep Filler Removal
# Large filler list (includes fragments, contractions, and generic words).
# Every group ends with a trailing comma: adjacent string literals without one
# are silently concatenated by Python (e.g. 'windows' 'ing' -> 'windowsing').
extra_fillers = {
    #Contractions & stems
    'doesn','didn','isn','wasn','weren','shouldn','couldn','wouldn','cant','could','should',
    'didnt','doesnt','isnt','wasnt','arent','dont','havent','hasnt','aint','im','ive','youre',
    'theyre','were','sure','let','point','instead','reason','port','save','screen','run','win','wait','kill','windows',

    #Fragments
    'ing','ed','er','ly','nt','ve','re','ll','ing','real','idea','win','control','far','world','life','highly',

    #General fluff & vague words
    'far','lot','lots','thing','things','stuff','really','very','quite','enough','some','maybe',
    'probably','bit','pretty','kind','seems','seem','seemed','look','looks','looking','want',
    'wanted','needs','need','use','using','used','get','got','make','made','know','think','say',
    'says','said','time','work','good','bad','fun','free','great','okay','cool','awesome','nice',
    'better','worth','well','little','lot','play','played','playing','experience',
    'content','review','reviews','recommend','buy','money','patch','update','version','feature','yes','single',
    'features','online','multiplayer','friends','friend','rpg','work','ing','screen','lot','server',
}

# Combine scikit-learn's English stopwords with the extra filler terms
filler_set = ENGLISH_STOP_WORDS | extra_fillers

# Keep only the previously selected vocabulary terms that are not fillers
final_vocab = list(
    filter(lambda w: w not in filler_set, vectorizer_filtered.get_feature_names_out())
)

In [None]:
#Creating Tokenizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

custom_stopwords = {
    'friend', 'friends', 'lot', 'ing', 'rpg', 'screen', 'little', 'lot', 'window'
}

# Negation words stay in the token stream even though NLTK lists them as stopwords.
_NEGATIONS = {'not', 'no', 'never'}
# Built once here; rebuilding this set for every token is needless work when the
# tokenizer runs over millions of reviews.
_tokenizer_stop_words = (stop_words - _NEGATIONS) | custom_stopwords

def clean_tokenizer(text):
    """Tokenize one review for the TF-IDF vectorizer.

    Steps: lowercase; map the curly apostrophe to a straight one; replace
    everything except a-z, whitespace and apostrophes with a space; expand
    "n't" to " not"; split on whitespace; drop stopwords (negations excepted)
    and tokens of two characters or fewer; lemmatize what remains.

    Returns a list of lemmatized tokens.
    """
    # Normalize the apostrophe first: the character filter below would otherwise
    # blank out the curly form before the "n't" expansion could see it.
    text = text.lower().replace("’", "'")
    text = re.sub(r"[^a-z\s']", " ", text)
    text = re.sub(r"n't", " not", text)
    # The length filter also removes 'no', which is only two characters long.
    return [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if len(word) > 2 and word not in _tokenizer_stop_words
    ]



In [None]:
#Final TF_IDF Model

# Custom tokenizer, uni- and bigrams, terms seen in at least 10 reviews and in
# at most 70% of them, capped at 15,000 features.
final_tfidf_params = dict(
    tokenizer=clean_tokenizer,
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.7,
    stop_words='english',
    max_features=15000,
)
vectorizer_final = TfidfVectorizer(**final_tfidf_params)

X_final = vectorizer_final.fit_transform(steam_df['clean_text'])


In [None]:
#EDA: Top Words in positive vs negative reviews

# Row selectors for each class, as plain numpy arrays for sparse-matrix slicing
sentiment_values = steam_df['sentiment'].to_numpy()
pos_mask = sentiment_values == 1
neg_mask = sentiment_values == -1

# Average TF-IDF weight of every term within each class
pos_mean = np.asarray(X_final[pos_mask].mean(axis=0)).ravel()
neg_mean = np.asarray(X_final[neg_mask].mean(axis=0)).ravel()

feature_names_final = np.array(vectorizer_final.get_feature_names_out())

freq_df = pd.DataFrame({
    'term': feature_names_final,
    'pos_weight': pos_mean,
    'neg_weight': neg_mean
})

# Keep single lowercase words of three or more letters (drops bigrams and stray tokens)
is_plain_word = freq_df['term'].str.match(r'^[a-z]{3,}$')
freq_df = freq_df[is_plain_word]

# Positive values lean positive, negative values lean negative
freq_df['diff'] = freq_df['pos_weight'] - freq_df['neg_weight']

top_pos = freq_df.sort_values('diff', ascending=False).head(15)
top_neg = freq_df.sort_values('diff', ascending=True).head(15)

# One horizontal bar chart per class, largest difference at the top
fig, ax = plt.subplots(1, 2, figsize=(14,6))
panels = [
    (ax[0], top_pos, 'pos_weight', 'skyblue', "Top Positive Words"),
    (ax[1], top_neg, 'neg_weight', 'salmon', "Top Negative Words"),
]
for axis, frame, weight_col, bar_color, panel_title in panels:
    axis.barh(frame['term'], frame[weight_col], color=bar_color)
    axis.set_title(panel_title)
    axis.invert_yaxis()

plt.tight_layout()
plt.show()




In [None]:
#CHI-SQUARE WEIGHTED WORD CLOUDS

#Chi² on final features (labels recoded to 0/1)
y01 = (steam_df['sentiment'] == 1).astype(int).to_numpy()
chi2_scores, p_values = chi2(X_final, y01)
terms = np.array(vectorizer_final.get_feature_names_out())

chi = pd.DataFrame({"term": terms, "chi2": chi2_scores, "p": p_values})

#Class-conditional means (convert masks to numpy for sparse slicing)
pos_rows = (steam_df['sentiment'] == 1).to_numpy()
neg_rows = (steam_df['sentiment'] == -1).to_numpy()

pos_mean = np.asarray(X_final[pos_rows, :].mean(axis=0)).ravel()
neg_mean = np.asarray(X_final[neg_rows, :].mean(axis=0)).ravel()

dfm = pd.DataFrame({
    "term": terms,
    "pos_mean": pos_mean,
    "neg_mean": neg_mean,
    "chi2": chi2_scores
})

#Directional weights: a term only gets weight in the class where its mean is higher,
#scaled by chi² so highly discriminative words are drawn larger
pos_weight = np.maximum(dfm["pos_mean"] - dfm["neg_mean"], 0) * dfm["chi2"]
neg_weight = np.maximum(dfm["neg_mean"] - dfm["pos_mean"], 0) * dfm["chi2"]

#Dropping very small directional differences to prevent noise (keeps the top 30%)
diff = np.abs(dfm["pos_mean"] - dfm["neg_mean"])
keep = diff >= np.percentile(diff, 70)

pos_dict = {t: float(w) for t, w, k in zip(dfm["term"], pos_weight, keep) if k and w > 0}
neg_dict = {t: float(w) for t, w, k in zip(dfm["term"], neg_weight, keep) if k and w > 0}

#Generating the word clouds
wc_common = dict(width=900, height=550, background_color="white", max_words=250)

plt.figure(figsize=(16,6))
plt.subplot(1,2,1)
# Each cloud is generated exactly once; building the positive cloud a second
# time and discarding it only wasted time.
plt.imshow(WordCloud(**wc_common, colormap="Blues").generate_from_frequencies(pos_dict), interpolation="bilinear")
plt.axis("off")
plt.title("🌟 Directional Chi² — Positive", fontsize=16)

plt.subplot(1,2,2)
plt.imshow(WordCloud(**wc_common, colormap="Reds").generate_from_frequencies(neg_dict), interpolation="bilinear")
plt.axis("off")
plt.title("💢 Directional Chi² — Negative", fontsize=16)

plt.tight_layout()
plt.show()





In [None]:
#Creating Pie Chart
sentiment_counts = steam_df['sentiment'].value_counts()
labels = ['Positive (1)' if val == 1 else 'Negative (-1)' for val in sentiment_counts.index]
# value_counts orders slices by frequency, so colours are derived from the label
# value rather than from slice position; otherwise a negative majority would be
# drawn in the "positive" colour.
colors = ['#66b3ff' if val == 1 else '#ff9999' for val in sentiment_counts.index]

plt.figure(figsize=(6,6))
plt.pie(
    sentiment_counts,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors
)
plt.title('Sentiment Distribution of Steam Reviews', fontsize=14)
plt.show()


In [None]:
#Preparing Cleaned Data for BERTopic

# Balanced subsample: 25,000 reviews from each sentiment class
is_positive = steam_df["sentiment"] == 1
is_negative = steam_df["sentiment"] == -1
pos_sample = steam_df[is_positive].sample(n=25000, random_state=42)
neg_sample = steam_df[is_negative].sample(n=25000, random_state=42)

# Shuffle so the two classes are interleaved, then renumber the rows
balanced_reviews = pd.concat([pos_sample, neg_sample])
steam_sample = balanced_reviews.sample(frac=1, random_state=42).reset_index(drop=True)

print("✅ Sample shape:", steam_sample.shape)


In [None]:
#Training BERTopic Model
docs = steam_sample['clean_text'].astype(str).tolist()

# Sentence-embedding backbone and KeyBERT-inspired topic representation
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
representation_model = KeyBERTInspired()

# nr_topics=None leaves the number of topics unrestricted
bertopic_params = dict(
    embedding_model=embedding_model,
    representation_model=representation_model,
    language="english",
    verbose=True,
    nr_topics=None,
)
topic_model = BERTopic(**bertopic_params)

# Fit on the sampled reviews; returns one topic id (and probability) per document
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Build labels from the top four words of each topic, without the numeric prefix,
# and register them on the model.
label_options = {"nr_words": 4, "topic_prefix": False}
topic_labels = topic_model.generate_topic_labels(**label_options)
topic_model.set_topic_labels(topic_labels)


In [None]:
#Combining Topics and Sentiment
# One row per sampled review: assigned topic, sentiment label, cleaned text
topic_columns = {
    "topic": topics,
    "sentiment": steam_sample["sentiment"].values,
    "review": steam_sample["clean_text"].values,
}
topic_df = pd.DataFrame(topic_columns)

# Topic -1 is BERTopic's outlier bucket (no topic assigned); leave it out
has_topic = topic_df["topic"] != -1
topic_df = topic_df[has_topic]


In [None]:
#Adding Readable Labels
# NOTE(review): these names are tied to specific topic ids; the BERTopic fit
# above is not given a fixed seed, so confirm the ids still match after a refit.
topic_name_map = {
    0: "General Game Feedback",
    1: "Challenge & Art Design",
    2: "Game Modes & Animation",
    3: "Graphics & Visual Quality",
    4: "Security / Hacker Issues",
    5: "Technical & Service Problems",
    6: "Localization / Translation Issues",
    7: "Gameplay & Pacing",
    8: "Dungeon & Sound Design",
    9: "Player Enjoyment / Humor",
    10: "Modding / Customization",
    11: "Performance Optimization",
    12: "Online Connectivity",
    13: "Controller Support",
    14: "Replayability & Longevity"
}

# Only topic_df is labeled here. topic_summary does not exist yet on a fresh
# run (it is built, and given its Readable_Topic column, in the next cell), so
# touching it here raised a NameError.
topic_df["Readable_Topic"] = topic_df["topic"].map(topic_name_map)

In [None]:
#Summarizing Topic Sentiment
topic_info = topic_model.get_topic_info()[["Topic", "Name"]]

# Mean sentiment and review count per topic
per_topic_stats = (
    topic_df.groupby("topic", dropna=True)
    .agg(avg_sentiment=("sentiment", "mean"),
         count=("sentiment", "size"))
    .reset_index()
)

# Attach BERTopic's own topic name, then drop the duplicate key column
topic_summary = per_topic_stats.merge(
    topic_info, left_on="topic", right_on="Topic", how="left"
).drop(columns=["Topic"])

# Human-readable names; topics without an entry in the map become NaN
topic_summary["Readable_Topic"] = topic_summary["topic"].map(topic_name_map)

# Keep only topics that have both a readable name and a sentiment value
topic_summary = topic_summary.dropna(subset=["Readable_Topic", "avg_sentiment"])

summary_columns = ["Readable_Topic", "avg_sentiment", "count"]
print(topic_summary[summary_columns].head(15))

In [None]:
#Visualizing barchart

# The 15 topics with the highest average sentiment, best first
sorted_topics = topic_summary.sort_values("avg_sentiment", ascending=False).head(15)

plt.figure(figsize=(12, 6))
bar_palette = sns.diverging_palette(240, 10, n=len(sorted_topics))
ax = sns.barplot(
    data=sorted_topics,
    x="avg_sentiment",
    y="Readable_Topic",
    palette=bar_palette
)

# Print the value next to each bar: to the right of non-negative bars,
# to the left of negative ones
for i, sent in enumerate(sorted_topics["avg_sentiment"]):
    is_non_negative = sent >= 0
    label_x = sent + 0.02 if is_non_negative else sent - 0.05
    plt.text(
        label_x,
        i,
        f"{sent:+.2f}",
        va="center",
        ha="left" if is_non_negative else "right",
        fontsize=10,
        color="black",
        fontweight="bold"
    )

plt.title("🎮 Average Sentiment by Topic (Top 15)", fontsize=14, weight="bold")
plt.xlabel("Average Sentiment (−1 Negative → +1 Positive)")
plt.ylabel("Topic")
plt.grid(axis="x", linestyle="--", alpha=0.4)
plt.tight_layout()
plt.show()



In [None]:
#Creating Charts for Positive/Negative
# Ten topics from each end of the average-sentiment ranking
top_positive = topic_summary.sort_values("avg_sentiment", ascending=False).head(10)
top_negative = topic_summary.sort_values("avg_sentiment", ascending=True).head(10)

fig, axes = plt.subplots(1, 2, figsize=(16, 7), sharex=False)

# Per-panel settings: left panel = most positive, right panel = most negative.
# label_shift / label_align place the value text just beyond the bar end.
panel_specs = [
    dict(frame=top_positive, palette="Blues_r", title="💙 Top 10 Positive Topics",
         ylabel="Topic", label_shift=0.02, label_align="left"),
    dict(frame=top_negative, palette="Reds", title="❤️ Top 10 Negative Topics",
         ylabel="", label_shift=-0.02, label_align="right"),
]

for axis, spec in zip(axes, panel_specs):
    sns.barplot(
        data=spec["frame"],
        x="avg_sentiment",
        y="Readable_Topic",
        ax=axis,
        palette=spec["palette"]
    )
    axis.set_title(spec["title"], fontsize=14, weight="bold")
    axis.set_xlabel("Average Sentiment (−1 → +1)")
    axis.set_ylabel(spec["ylabel"])
    axis.grid(axis="x", linestyle="--", alpha=0.4)

    # Value label for every bar
    for row_pos, sent in enumerate(spec["frame"]["avg_sentiment"]):
        axis.text(
            sent + spec["label_shift"], row_pos,
            f"{sent:+.2f}",
            va="center", ha=spec["label_align"], fontsize=10, fontweight="bold", color="black"
        )

plt.tight_layout()
plt.show()


In [None]:
#Identifying Numeric Features
# Length of each cleaned review, in whitespace-separated words and in characters
clean_text_str = steam_df["clean_text"].astype(str)
steam_df["review_length"] = clean_text_str.str.split().str.len()
steam_df["char_length"] = clean_text_str.str.len()

# 0/1 copy of the -1/+1 sentiment label
sentiment_to_binary = {-1: 0, 1: 1}
steam_df["sentiment_num"] = steam_df["sentiment"].replace(sentiment_to_binary)


In [None]:
#Creating Correlation Matrix
numeric_cols = ["sentiment_num", "review_length", "char_length", "playtime_forever"]

# Never overwrite a real votes_up column. Random placeholder values are only
# generated when the dataset has none, and the plot title then says so.
if "votes_up" not in steam_df.columns:
    np.random.seed(42)
    steam_df["votes_up"] = np.random.randint(0, 1000, len(steam_df))
    votes_up_is_synthetic = True
elif "votes_up_is_synthetic" not in globals():
    # Column was present before this cell ever ran, so it is real data.
    votes_up_is_synthetic = False

votes_title = "Sentiment vs Helpful Votes"
if votes_up_is_synthetic:
    votes_title += " (synthetic placeholder data)"

fig, axes = plt.subplots(1, 2, figsize=(14,5))

sns.scatterplot(x="votes_up", y="sentiment_num", data=steam_df, alpha=0.3, ax=axes[0])
axes[0].set_title(votes_title)

sns.boxplot(x="sentiment", y="review_length", data=steam_df, ax=axes[1], palette="Set2")
axes[1].set_title("Review Length by Sentiment")

plt.tight_layout()
plt.show()


In [None]:
#Preparing ML-Ready Splits
# 80/20 split, stratified so both parts keep the overall class balance
sentiment_target = steam_df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    sentiment_target,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_target,
)

In [None]:
#Creating ML (Logistic Regression)
# fit() returns the estimator itself, so construction and training can be chained
lr_model = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

lr_report = classification_report(y_test, lr_preds)
print("📊 Logistic Regression:\n", lr_report)

In [None]:
#Confusion Matrix for Logistic Regression

y_pred = lr_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the raw counts (rows: actual, columns: predicted)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title("Confusion Matrix - Logistic Regression")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

In [None]:
#Extracting Coefficients and Feature names for Logistic Regression
# The model was trained on X_final, so its coefficients line up with
# vectorizer_final's vocabulary. Names from the base `vectorizer` belong to a
# different feature space and would label the bars with the wrong words.
feature_names = np.array(vectorizer_final.get_feature_names_out())
coefs = lr_model.coef_.ravel()
assert len(feature_names) == len(coefs), "vocabulary and coefficients are out of sync"

# argsort is ascending: the last 20 are the most positive, the first 20 the most negative
top_pos_idx = np.argsort(coefs)[-20:]
top_neg_idx = np.argsort(coefs)[:20]

plt.figure(figsize=(12,6))
plt.barh(feature_names[top_neg_idx], coefs[top_neg_idx], color='salmon', label='Negative')
plt.barh(feature_names[top_pos_idx], coefs[top_pos_idx], color='skyblue', label='Positive')
plt.title("Top Features Driving Sentiment (Logistic Regression, Cleaned TF-IDF)")
plt.legend()
plt.show()

In [None]:
#Predicting on New Reviews
def predict_sentiment(review_text):
    """Classify one review with the fitted TF-IDF vectorizer and logistic model.

    Returns a string of the form "<label> (Confidence: <pct>%)", where the
    confidence is the larger of the predicted class probabilities, as a
    percentage rounded to two decimals.
    """
    features = vectorizer_final.transform([review_text])
    predicted_class = lr_model.predict(features)[0]
    class_probs = lr_model.predict_proba(features)[0]
    confidence = round(max(class_probs) * 100, 2)
    if predicted_class == 1:
        label = "Positive 😀"
    else:
        label = "Negative 😞"
    return f"{label} (Confidence: {confidence}%)"

#Example usage:
example_reviews = (
    "The game is fun, smooth, and absolutely worth the price!",
    "Crashes constantly and support is terrible.",
)
for example in example_reviews:
    print(predict_sentiment(example))

In [None]:
#Saving my trained model and vectorizer for Streamlit app

# Artifact file name -> object to persist; the app loads both files by these names
artifacts = {
    "logistic_sentiment_model.pkl": lr_model,
    "tfidf_vectorizer.pkl": vectorizer_final,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print("✅ Model and vectorizer updated and saved.")


In [None]:
#Creating app.py

import streamlit as st
import re
import joblib
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Preprocessing setup
# Shared by clean_tokenizer below: lemmatizer, NLTK English stopwords, and a
# few extra tokens that are always discarded.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
custom_stopwords = {'friend', 'friends', 'lot', 'ing', 'rpg', 'screen', 'little', 'window'}

def clean_tokenizer(text):
    """Tokenize a review for the app's TF-IDF vectorizer.

    Lowercases the text, maps the curly apostrophe to a straight one, blanks
    out everything except a-z, whitespace and apostrophes, expands "n't" to
    " not", strips the contraction suffixes 're/'s/'ll/'ve/'d/'m, then drops
    stopwords and tokens of two characters or fewer and lemmatizes the rest.

    Returns a list of lemmatized tokens.
    """
    # Normalize the apostrophe first: the character filter below removes the
    # curly form, so the contraction rules could never match it otherwise.
    text = text.lower().replace("’", "'")
    text = re.sub(r"[^a-z\s']", " ", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'(re|s|ll|ve|d|m)", "", text)
    return [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if len(word) > 2
        and word not in stop_words
        and word not in custom_stopwords
    ]


#Cache model loading
@st.cache_resource
def load_model():
    """Load the saved classifier and TF-IDF vectorizer from the working directory.

    Returns a ``(model, vectorizer)`` tuple. Decorated with
    ``st.cache_resource``, so Streamlit caches the loaded objects.
    """
    return (
        joblib.load("logistic_sentiment_model.pkl"),
        joblib.load("tfidf_vectorizer.pkl"),
    )


#Streamlit UI setup
# Page configuration goes before the cached model load: Streamlit expects
# set_page_config to be the first Streamlit command executed on a page.
st.set_page_config(page_title="Steam Review Sentiment", page_icon="🎮", layout="centered")


#Loading model and vectorizer
lr_model, vectorizer = load_model()

# Global page styling: dark background, green progress bar, and the card style
# used for the per-word explanations.
PAGE_CSS = """
    <style>
    body {
        background-color: #0E1117;
        color: #FAFAFA;
    }
    .stProgress > div > div > div > div {
        background-color: #4CAF50;
    }
    .word-card {
        background-color: #1E1E1E;
        border-radius: 10px;
        padding: 10px 15px;
        margin: 5px 0;
        font-size: 16px;
        color: #FAFAFA;
    }
    </style>
    """
st.markdown(PAGE_CSS, unsafe_allow_html=True)

st.title("🎮 Steam Game Review Sentiment Predictor")
st.markdown("Enter a game review below and click **Predict** to see its sentiment!")

#Input field
user_input = st.text_area("📝 Your Review:", height=150)

#Prediction logic
if st.button("Predict Sentiment"):
    if not user_input.strip():
        st.warning("⚠️ Please enter a review first.")
    else:
        review_vec = vectorizer.transform([user_input])
        prediction = lr_model.predict(review_vec)[0]
        class_probs = lr_model.predict_proba(review_vec)[0]
        confidence = round(max(class_probs) * 100, 2)

        # Result banner: green for positive, red for negative
        label, color = (
            ("Positive 😀", "#00C853") if prediction == 1 else ("Negative 😞", "#D32F2F")
        )

        st.markdown(
            f"""
            <div style='background-color:{color};padding:15px;border-radius:10px;margin-top:20px;text-align:center;'>
                <h3 style='color:white;'>{label}</h3>
                <p style='color:white;'>Confidence: {confidence}%</p>
            </div>
            """,
            unsafe_allow_html=True,
        )

        #Confidence bar
        st.write("**Confidence Level**")
        st.progress(int(confidence))

        #Word contribution explainer
        st.markdown("### 🔍 Top 5 Words Influencing Prediction")

        # Contribution of a term = its TF-IDF value in this review times its
        # model coefficient; only terms actually present are considered.
        vocabulary_terms = np.array(vectorizer.get_feature_names_out())
        weights = lr_model.coef_[0]
        tfidf_row = review_vec.toarray()[0]
        contributions = tfidf_row * weights

        present_idx = np.where(tfidf_row != 0)[0]
        words = vocabulary_terms[present_idx]
        scores = contributions[present_idx]

        # Rank towards the predicted class: largest scores first for a positive
        # prediction, smallest first for a negative one.
        ranking = np.argsort(scores)
        if prediction == 1:
            ranking = ranking[::-1]
        top_five = ranking[:5]
        top_words = list(zip(words[top_five], scores[top_five]))

        if not top_words:
            st.info("No strong individual words detected in this short review.")
        else:
            for word, score in top_words:
                word_color = "#4CAF50" if score > 0 else "#F44336"
                st.markdown(
                    f"""
                    <div class='word-card' style='border-left:5px solid {word_color};'>
                        <b>{word}</b> → <span style='color:{word_color};'>{'+' if score > 0 else ''}{score:.4f}</span>
                    </div>
                    """,
                    unsafe_allow_html=True,
                )


In [None]:
#Running to Stop old background apps
# pkill exits non-zero when no process matches; the `|| echo` fallback prints
# a short note in that case.
!pkill streamlit || echo "no streamlit running"
!pkill ngrok || echo "no ngrok running"

In [None]:
#Running to have the Streamlit code Saved

# The generated source contains emoji and a curly apostrophe, so the encoding is
# set explicitly instead of relying on the platform default.
with open("app.py", "w", encoding="utf-8") as f:
    f.write('''import streamlit as st
import joblib
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Page config comes first: it has to precede every other Streamlit command,
#including the cached model load further down
st.set_page_config(page_title="Steam Game Review Sentiment Predictor", page_icon="🎮", layout="centered")

#Downloading NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

#Defining Tokenizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
custom_stopwords = {'friend','friends','lot','ing','rpg','screen','little','window'}

def clean_tokenizer(text):
    text = text.lower()
    text = re.sub(r"[^a-z\\s']", " ", text)
    text = re.sub(r"n['’]t", " not", text)
    text = re.sub(r"['’](re|s|ll|ve|d|m)", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and w not in custom_stopwords and len(w) > 2]
    return tokens

#Loading Model
@st.cache_resource
def load_model():
    lr_model = joblib.load("logistic_sentiment_model.pkl")
    vectorizer = joblib.load("tfidf_vectorizer.pkl")
    return lr_model, vectorizer

lr_model, vectorizer = load_model()

#Streamlit UI
st.markdown(\"\"\"<style>
body {background-color:#0e1117;color:white;}
.stTextArea textarea {background-color:#262730;color:white;font-size:1.1em;}
.result-card {padding:1rem;border-radius:0.5rem;font-size:1.2em;text-align:center;}
.positive {background-color:#025c30;color:white;}
.negative {background-color:#7f1d1d;color:white;}
</style>\"\"\", unsafe_allow_html=True)

st.title("🎮 Steam Game Review Sentiment Predictor")
st.markdown("Analyze Steam reviews and instantly get a sentiment prediction with confidence.")

review = st.text_area("📝 Your Review:", placeholder="Enter a Steam game review here...")

if st.button("Predict Sentiment"):
    if not review.strip():
        st.warning("⚠️ Please enter a review before predicting.")
    else:
        X_input = vectorizer.transform([review])
        pred_proba = lr_model.predict_proba(X_input)[0]
        pred_class = lr_model.predict(X_input)[0]
        confidence = pred_proba.max() * 100
        sentiment = "Positive 😄" if pred_class == 1 else "Negative 😠"
        css_class = "positive" if pred_class == 1 else "negative"
        st.markdown(f"<div class='result-card {css_class}'><b>{sentiment}</b><br>Confidence: {confidence:.2f}%</div>", unsafe_allow_html=True)
''')
print("✅ app.py created successfully!")


In [None]:
#Creating Streamlit App for Prediction

from pyngrok import ngrok
import os, time, threading

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["OMP_NUM_THREADS"] = "1"
ngrok.kill()

# The ngrok auth token is a credential and must not live in the notebook.
# Supply it through the NGROK_AUTH_TOKEN environment variable (or the
# platform's secret store). Any token that was previously committed in this
# cell should be revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")
if not ngrok_auth_token:
    raise RuntimeError("Set the NGROK_AUTH_TOKEN environment variable before running this cell.")
ngrok.set_auth_token(ngrok_auth_token)

# Open a public tunnel to the port Streamlit is started on below
public_url = ngrok.connect(8501)
print(f"🌍 App URL: {public_url.public_url}")

def run_streamlit():
    """Start the Streamlit app on port 8501 in headless mode (blocks until it exits)."""
    os.system("streamlit run app.py --server.headless true --server.port 8501")

# Run the blocking server call in a background thread so the cell can return
thread = threading.Thread(target=run_streamlit)
thread.start()
time.sleep(8)  # give the server a few seconds to come up before announcing the URL
print("✅ Streamlit running at:", public_url.public_url)

In [None]:
#Killing Old Tunnels in ngrok
# Run this once the demo is finished (or before opening a new tunnel).
ngrok.kill()


In [None]:
#Creating ML (Random Forest)
# Hyperparameters are collected in one mapping so the configuration can be
# read (and tuned) in a single place before the estimator is built.
rf_settings = dict(
    n_estimators=200,
    max_depth=20,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split, then score the held-out split.
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

rf_report = classification_report(y_test, rf_preds)
print("🌲 Random Forest:\n", rf_report)

In [None]:
#Feature Importance Visualization
# Take the 20 largest entries of the forest's feature_importances_ (argsort is
# ascending, so the last 20 positions are the highest-scoring features) and
# label them with the vectorizer's vocabulary terms.
importances = rf_model.feature_importances_
indices = importances.argsort()[-20:]

top_terms = np.asarray(vectorizer.get_feature_names_out())[indices]
top_scores = importances[indices]

plt.figure(figsize=(12, 6))
plt.barh(top_terms, top_scores, color='skyblue')
plt.xlabel("Feature Importance")
plt.title("Top Features Driving Sentiment (Random Forest)")
plt.show()

In [None]:
#Creating ML Model (XGBoost)

# Label re-encoding for XGBoost compatibility: -1 becomes 0, 1 stays 1.
xgb_label_map = {-1: 0, 1: 1}
y_train_xgb = y_train.replace(xgb_label_map)
y_test_xgb = y_test.replace(xgb_label_map)

# Booster configuration kept in one mapping, passed through unchanged.
xgb_settings = dict(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.7,
    colsample_bytree=0.7,
    tree_method='hist',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)
xgb_model = XGBClassifier(**xgb_settings)

# Train on the re-encoded labels and evaluate against the re-encoded test labels.
xgb_model.fit(X_train, y_train_xgb)
xgb_preds = xgb_model.predict(X_test)

xgb_report = classification_report(y_test_xgb, xgb_preds)
print("🔥 XGBoost:\n", xgb_report)

In [None]:
#Confusion Matrix for XGBoost
# Counts of actual vs. predicted labels, drawn as an annotated heatmap.
cm = confusion_matrix(y_test_xgb, xgb_preds)

# sns.heatmap returns the Axes it drew on; label that Axes directly.
heatmap_ax = sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
heatmap_ax.set_title("XGBoost Confusion Matrix")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

In [None]:
#Comparison for ML Models

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

# Reference labels that every model's predictions are scored against below.
y_true = y_test
#Fixing label inconsistency
def normalize_labels(preds):
    """Map the label -1 to 0 and leave every other value unchanged.

    Parameters
    ----------
    preds : array-like
        Labels or predictions (NumPy array, pandas Series, list, ...).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``preds`` with each -1 replaced by 0.
    """
    # Coerce first: comparing a plain Python list with -1 yields a single
    # False instead of an element-wise mask, which would silently return the
    # labels unmapped.
    labels = np.asarray(preds)
    return np.where(labels == -1, 0, labels)

#Converting predictions to 0/1
# Every prediction vector and the reference labels go through the same
# normalisation so all metrics are computed on one label encoding.
lr_preds, rf_preds, xgb_preds, y_true = (
    normalize_labels(labels) for labels in (lr_preds, rf_preds, xgb_preds, y_true)
)

#Computing metrics
# Rows of the table: one entry per model, in display order.
model_preds = {
    'Logistic Regression': lr_preds,
    'Random Forest': rf_preds,
    'XGBoost': xgb_preds,
}

# Columns of the table: metric name -> scorer taking (y_true, y_pred).
metric_fns = {
    'Accuracy': accuracy_score,
    'Precision': lambda truth, preds: precision_score(truth, preds, average='binary'),
    'Recall': lambda truth, preds: recall_score(truth, preds, average='binary'),
    'F1-Score': lambda truth, preds: f1_score(truth, preds, average='binary'),
}

comparison_df = pd.DataFrame({
    'Model': list(model_preds),
    **{
        metric_name: [score(y_true, preds) for preds in model_preds.values()]
        for metric_name, score in metric_fns.items()
    },
})

print("✅ Model Performance Comparison:")
display(comparison_df)


In [None]:
#Model Comparison Visualization
# Grouped bar chart: one group per metric, one bar per model inside each group.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots(figsize=(10,6))

#Bar positions for each model
# (horizontal offset from the group centre, row of comparison_df, legend label)
bar_layout = (
    (-width, 0, 'Logistic Regression'),
    (0, 1, 'Random Forest'),
    (width, 2, 'XGBoost'),
)
for bar_offset, model_row, model_label in bar_layout:
    # Column 0 is the model name; the remaining columns are the metric scores.
    ax.bar(x + bar_offset, comparison_df.iloc[model_row, 1:], width, label=model_label)

#Customizing chart
ax.set_title('📊 Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xlabel('Metrics', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.grid(axis='y', linestyle='--', alpha=0.6)
ax.legend()

#Displaying chart
plt.tight_layout()
plt.show()

In [None]:
#Saving RF/XGBoost Models for SHAP
# Persist each fitted estimator under its own file name in the working directory.
for model_path, fitted_model in (
    ("random_forest_model.pkl", rf_model),
    ("xgb_sentiment_model.pkl", xgb_model),
):
    joblib.dump(fitted_model, model_path)

print("✅ Models saved successfully!")

In [None]:
#Loading ML models and vectorizer for SHAP 

#Loading models and vectorizer
# NOTE: joblib.load unpickles arbitrary Python objects, so only load artifact
# files that this notebook produced itself.
# NOTE(review): the logistic model and vectorizer files are presumably written
# by an earlier cell of this notebook — confirm they exist before running.
lr_model = joblib.load("logistic_sentiment_model.pkl")
rf_model = joblib.load("random_forest_model.pkl")
xgb_model = joblib.load("xgb_sentiment_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

#Preparing sample data (1000 reviews max)
# Cap the draw at the number of available rows: sampling without replacement
# raises ValueError when more rows are requested than exist, which would break
# the "1000 max" intent on a small dataset.
clean_texts = steam_sample["clean_text"]
n_shap_samples = min(1000, len(clean_texts))
sample_texts = clean_texts.sample(n_shap_samples, random_state=42).tolist()

# Vectorize the sampled reviews and keep the vocabulary for labelling plots.
X_sample = vectorizer.transform(sample_texts)
feature_names = np.array(vectorizer.get_feature_names_out())

print(f"✅ Sample shape for SHAP: {X_sample.shape}")

In [None]:
#Converting to dense array for RF and XGBoost
# X_sample is the vectorizer output for the sampled reviews (presumably a SciPy
# sparse matrix — confirm); .toarray() materialises it as a full array, so memory
# use grows with rows x vocabulary size.

X_dense = X_sample.toarray()

In [None]:
#Creating SHAP Explainer to compare ML models

print("Creating SHAP explainers... (this may take a few seconds)")

# Build all three explainers first, then compute attributions model by model.

#Logistic Regression (Linear)
# The sampled matrix serves both as background data and as the rows to explain.
explainer_lr = shap.LinearExplainer(
    lr_model,
    X_sample,
    feature_perturbation="interventional",
)

#Random Forest and XGBoost (Tree)
explainer_rf, explainer_xgb = (
    shap.TreeExplainer(tree_model) for tree_model in (rf_model, xgb_model)
)

# The linear explainer is given the vectorizer output as-is; the tree
# explainers are given the dense copy.
shap_values_lr = explainer_lr.shap_values(X_sample)
shap_values_rf = explainer_rf.shap_values(X_dense)
shap_values_xgb = explainer_xgb.shap_values(X_dense)

print("SHAP values calculated successfully!")

In [None]:
#Creating SHAP Comparison Plot

def _positive_class_shap(values):
    """Return a 2-D (samples, features) SHAP matrix for the last class.

    Classifier explainers may return per-class attributions either as a list
    of 2-D arrays (older SHAP releases) or as one 3-D array with the class on
    the last axis (newer releases). Selecting the last class gives every model
    the same 2-D layout so the three summary plots are directly comparable.
    A value that is already 2-D is returned unchanged.
    """
    if isinstance(values, list):
        return values[-1]
    values = np.asarray(values)
    if values.ndim == 3:
        return values[..., -1]
    return values

# One beeswarm per model, in the order LR, Random Forest, XGBoost.
# The plotting functions need dense feature values, so the dense copy is used
# for all three models (including the linear one, whose SHAP values were
# computed from the sparse matrix holding the same rows).
for model_shap_values in (shap_values_lr, shap_values_rf, shap_values_xgb):
    shap.summary_plot(
        _positive_class_shap(model_shap_values),
        features=X_dense,
        feature_names=feature_names,
    )