In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import pipeline
from tqdm import tqdm
import torch
import matplotlib.pyplot as plt


In [None]:

# Report CUDA availability and, when a device is present, the name of device 0.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No GPU detected")


In [None]:
# Load the raw reviews CSV into `df`.
df = pd.read_csv(filepath_or_buffer="../data/raw/reviews.csv")

In [None]:
# Display the column labels of the freshly loaded frame.
df.columns

In [None]:
# Keep only rows whose review_text is present and non-blank once whitespace is stripped.
df = df.loc[
    lambda frame: frame["review_text"].notna()
    & (frame["review_text"].str.strip() != "")
]

print(f"Remaining rows after dropping empty reviews: {len(df)}")



In [None]:
# Count rows whose review_text repeats an earlier row (the first occurrence is not counted).
duplicate_count = df["review_text"].duplicated(keep="first").sum()
print(f"Number of duplicate reviews: {duplicate_count}")

In [None]:
# De-duplicate on review_text, retaining the first row seen for each distinct text,
# then renumber the index from 0 so positional and label lookups agree again.
df = df.drop_duplicates(subset=["review_text"], keep="first")
df = df.reset_index(drop=True)

# Sanity check: the two numbers below should match after de-duplication.
print(f"Remaining rows after dropping duplicates: {len(df)}")
print(f"Unique reviews: {df['review_text'].nunique()}")



In [None]:
# Porter stemmer: reduces words to a root form (e.g. "running" -> "run").
stemmer = PorterStemmer()
# NLTK's English stop-word list (e.g. "the", "is", "in"), stored as a set for fast membership tests.
stop_words = set(stopwords.words("english"))


def clean_text(text):
    """Normalize one review into a space-separated string of stemmed content words.

    The input is coerced with ``str()``, lower-cased, stripped of every
    character that is not an ASCII letter or whitespace, split on whitespace,
    filtered against ``stop_words`` and stemmed with ``stemmer``.

    Non-letter characters are replaced by a space rather than deleted, so that
    words separated only by punctuation ("good.bad", "well-made") remain
    separate tokens, and contraction fragments ("don't" -> "don", "t") can be
    caught by the stop-word filter (where the list contains them) instead of
    surviving as fused tokens such as "dont".

    Args:
        text: Raw review text; any object is accepted and converted via ``str()``.

    Returns:
        str: The processed words joined by single spaces (may be empty).
    """
    # Substitute a space, not the empty string, to preserve word boundaries.
    text = re.sub(r'[^a-zA-Z\s]', ' ', str(text).lower())
    # Tokenize on whitespace, drop stop words, stem what is left.
    words = [stemmer.stem(w) for w in text.split() if w not in stop_words]
    return " ".join(words)


# Add a 'clean_review' column holding the stemmed, stop-word-filtered version of 'review_text'.
df["clean_review"] = df["review_text"].apply(clean_text)

In [None]:
# Checkpoint the frame (including clean_review) so the cleaning step need not be repeated.
df.to_pickle(path="../data/processed/cleaned_reviews.pkl")

In [None]:
# Reload the cleaned-reviews checkpoint written by this notebook.
df = pd.read_pickle(filepath_or_buffer="../data/processed/cleaned_reviews.pkl")

In [None]:
# Preview the first 10 rows of the reloaded frame.
df.head(n=10)

In [None]:
# Run the model on CUDA device 0 when PyTorch can see a GPU; otherwise fall back to CPU (-1).
# Passing `device` explicitly keeps placement independent of the installed transformers default.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Register `progress_apply` on pandas objects so the long-running apply shows a progress bar.
tqdm.pandas()


def get_sentiment(text):
    """Classify one cleaned review with ``sentiment_pipeline``.

    Args:
        text: Review text. Blank or non-string values are not sent to the model.

    Returns:
        dict: A mapping with ``"label"`` and ``"score"`` keys. Blank or
        non-string input yields the placeholder
        ``{"label": "NEUTRAL", "score": 0.0}``; otherwise the first result
        returned by the pipeline is passed through unchanged.
    """
    # Nothing to classify: return a placeholder instead of calling the model.
    if not isinstance(text, str) or not text.strip():
        return {"label": "NEUTRAL", "score": 0.0}
    # Let the tokenizer cap the input at 512 tokens. Slicing the string (text[:512])
    # limits characters, not tokens, and drops text the model could still have read.
    return sentiment_pipeline(text, truncation=True, max_length=512)[0]


df["sentiment_result"] = df["clean_review"].progress_apply(get_sentiment)

In [None]:
# Checkpoint the frame now that it carries the raw sentiment_result dicts.
df.to_pickle(path="../data/processed/clean_and_sentiment_score_reviews.pkl")

In [None]:
# Preview the first 10 rows, now including the sentiment_result column.
df.head(n=10)

In [None]:
# Show the raw text of the review at position 8 for a manual spot check.
df["review_text"].iloc[8]

In [None]:
# Split each sentiment_result dict into its two fields, one column per field.
df["sentiment_label"] = df["sentiment_result"].map(lambda result: result["label"])
df["sentiment_score"] = df["sentiment_result"].map(lambda result: result["score"])

In [None]:
# Preview the first 10 rows with the new sentiment_label / sentiment_score columns.
df.head(n=10)

In [None]:
def classify_sentiment(row, threshold=0.6):
    """Map one review's model output to a ternary code.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``"sentiment_label"`` and ``"sentiment_score"``.
        threshold: Minimum score for the label to be trusted.

    Returns:
        int: ``1`` for a confident "POSITIVE", ``0`` for a confident
        "NEGATIVE", and ``-1`` when the score is below ``threshold`` (or is
        NaN) or the label is anything else, such as the "NEUTRAL" placeholder
        used for empty reviews.
    """
    label = row["sentiment_label"]
    score = row["sentiment_score"]
    # `>=` is False for NaN scores, so they fall through to the uncertain branch.
    if score >= threshold:
        if label == "POSITIVE":
            return 1
        if label == "NEGATIVE":
            return 0
    # Low confidence, or a label that is neither positive nor negative:
    # never count it as a negative vote.
    return -1

In [None]:
# Apply classify_sentiment to every row (default threshold) and store the resulting code.
df["sentiment_binary"] = df.apply(func=classify_sentiment, axis="columns")

In [None]:
# Preview the first 10 rows with the new sentiment_binary column.
df.head(n=10)

In [None]:
# Tally each sentiment_binary class: 1 = positive, 0 = negative, -1 = uncertain.
total = len(df)
num_positive = (df["sentiment_binary"] == 1).sum()
num_negative = (df["sentiment_binary"] == 0).sum()
num_uncertain = (df["sentiment_binary"] == -1).sum()
print(f"Total Reviews: {total}")
print(f"Positive: {num_positive}")
print(f"Negative: {num_negative}")
print(f"Uncertain: {num_uncertain}")  # label typo ("Uncetain") corrected

In [None]:


# Derive the bar heights from `df` instead of pasting numbers from an earlier run,
# so the chart cannot drift out of sync with the data. The reindex fixes the order
# (positive, negative, uncertain) and yields 0 for a class that does not occur.
sentiment_counts = df["sentiment_binary"].value_counts().reindex([1, 0, -1], fill_value=0)
counts = sentiment_counts.tolist()
labels = ['Positive', 'Negative', 'Uncertain']
colors = ['green', 'red', 'blue']

plt.figure(figsize=(7,5))
plt.bar(labels, counts, color=colors)
plt.title("Sentiment Distribution of Steam Reviews")
plt.ylabel("Number of Reviews")
plt.xlabel("Sentiment Category")
plt.tight_layout()
plt.show()

In [None]:
# List the distinct values present in review_score.
pd.unique(df["review_score"])

In [None]:

# Tally the original review_score values: 1 is counted as positive, -1 as negative.
pos_before = df["review_score"].eq(1).sum()
neg_before = df["review_score"].eq(-1).sum()

# Report the raw numbers before drawing them.
print("=== Before Sentiment Classification ===")
print(f"Positive: {pos_before}")
print(f"Negative: {neg_before}")

# Bar chart of the two classes on a logarithmic y-axis.
labels = ["Positive", "Negative"]
counts = [pos_before, neg_before]

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(labels, counts, color=["green", "red"])
ax.set_title("Sentiment Distribution (Before Classification)")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Reviews (log scale)")
ax.set_yscale("log")
fig.tight_layout()
plt.show()



In [None]:
# Remove the columns that are no longer needed downstream.
# Note: re-running this cell on the already-trimmed frame raises KeyError.
df = df.drop(["review_score", "review_votes", "sentiment_result"], axis=1)


In [None]:
# Preview the first 10 rows of the trimmed frame.
df.head(n=10)

In [None]:
# Checkpoint the review-level frame before it is aggregated per app.
df.to_pickle(path="../data/processed/ready_for_squashing_reviews.pkl")

In [None]:
# Exclude neutral/uncertain reviews (sentiment_binary == -1) so the mean only covers 0/1 votes.
df_valid = df[df["sentiment_binary"] != -1]

# Per-app share of positive reviews and the number of confident reviews behind it.
app_sentiment = (
    df_valid.groupby("app_id")
    .agg(
        avg_sentiment=("sentiment_binary", "mean"),
        num_reviews=("sentiment_binary", "count"),  # how many confident reviews
    )
    .reset_index()
)

# Attach one app_name per app_id. De-duplicating on app_id alone (after discarding
# missing names) keeps the merge one-to-one even if an app appears in `df` under
# several app_name values; de-duplicating on the (app_id, app_name) pair would
# repeat the aggregated row once per distinct name.
app_names = (
    df[["app_id", "app_name"]]
    .dropna(subset=["app_name"])
    .drop_duplicates(subset="app_id", keep="first")
)
app_sentiment = app_sentiment.merge(
    app_names,
    on="app_id",
    how="left",
    validate="one_to_one",  # both sides are unique on app_id by construction
)

# avg_sentiment is a 0-1 fraction; express it as a percentage as well.
app_sentiment["percent_positive"] = app_sentiment["avg_sentiment"] * 100


print(app_sentiment.head())


In [None]:
# Display the per-app sentiment table.
app_sentiment

In [None]:
# Keep only apps with at least 10 confident reviews
app_sentiment_filtered = app_sentiment[app_sentiment["num_reviews"] >= 10]

# Optional: check how many apps remain
print(f"Apps with >=10 reviews: {len(app_sentiment_filtered)}")


In [None]:
# Persist the filtered per-app sentiment table.
app_sentiment_filtered.to_pickle(path="../data/processed/app_level_sentiment.pkl")

In [None]:
# Display the filtered per-app sentiment table.
app_sentiment_filtered