<a href="https://colab.research.google.com/github/Subhangee19/DataAnalysis/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

## Problem Statement
We aim to predict sentiment (positive/negative) from Steam game reviews to support market analysis and content recommendation.

## Business Importance
Understanding review sentiment helps game publishers, platforms, and developers improve game design and target marketing campaigns more effectively.

## Objective
Build an end-to-end NLP pipeline using:
- TF-IDF + Logistic Regression
- Word2Vec + Random Forest
- LSTM
And compare them to determine the most effective model.


In [None]:
!pip install nltk gensim transformers datasets scikit-learn bs4
!pip install kagglehub

In [None]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay)

from gensim.models import Word2Vec
from bs4 import BeautifulSoup


In [None]:
# Fetch the NLTK data packages needed at runtime.
# NOTE(review): presumably these back word_tokenize, the English stop-word list
# and the WordNet lemmatizer imported above — confirm which are strictly required.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

In [None]:


import kagglehub

# Download the Kaggle dataset through kagglehub and keep the returned local path.
STEAM_REVIEWS_DATASET = "piyushagni5/sentiment-analysis-for-steam-reviews"
path = kagglehub.dataset_download(STEAM_REVIEWS_DATASET)
print("Path to dataset files:", path)


# 1. Data Acquisition

We use the Kaggle dataset: **Sentiment Analysis for Steam Reviews**  
URL: https://www.kaggle.com/datasets/piyushagni5/sentiment-analysis-for-steam-reviews

The dataset contains:
- `user_review`: Text review.
- `user_suggestion`: 1 = positive, 0 = negative.


In [None]:
# Read train.csv from the downloaded dataset directory and preview the first rows.
train_csv = f"{path}/train.csv"
df = pd.read_csv(train_csv)
df.head()

In [None]:
# Structural overview of the raw frame.
# A notebook cell only renders its LAST expression automatically, so the
# intermediate summaries are passed to display() explicitly; otherwise the
# describe() and missing-value tables are computed and silently dropped.
df.info()                    # prints column dtypes / non-null counts itself
display(df.describe())       # summary statistics of the numeric columns
display(df.isnull().sum())   # missing values per column
df.duplicated().sum()        # number of fully duplicated rows (rendered as cell output)

# 2. Exploratory Data Analysis

In this section, we explore:
- The balance between positive and negative reviews.
- The average length of reviews.
- Frequent words using word clouds.


In [None]:

# How many reviews fall into each recommendation class?
label_ax = sns.countplot(x='user_suggestion', data=df, palette='Set2')
label_ax.set_title('Distribution of Sentiment Labels')
label_ax.set_xlabel('Sentiment (1=Positive, 0=Negative)')
label_ax.set_ylabel('Count')
plt.show()

In [None]:
# Character count per review; reviews are stringified first so every row has a length.
review_text = df['user_review'].astype(str)
df['review_length'] = review_text.map(len)

# Histogram of the character counts.
length_ax = sns.histplot(df['review_length'], bins=50, color='skyblue')
length_ax.set_title('Distribution of Review Lengths')
length_ax.set_xlabel('Number of Characters in Review')
length_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# WordClouds: one panel per sentiment class, built from the raw review text.
# Reviews are cast to str before joining: str.join raises TypeError on any
# non-string entry (e.g. a missing review stored as NaN), and the rest of the
# notebook already applies the same cast before processing 'user_review'.
text_pos = ' '.join(df.loc[df['user_suggestion'] == 1, 'user_review'].astype(str))
text_neg = ' '.join(df.loc[df['user_suggestion'] == 0, 'user_review'].astype(str))

wordcloud_panels = [
    (text_pos, 'Positive Reviews'),
    (text_neg, 'Negative Reviews'),
]

plt.figure(figsize=(12,6))
for panel_idx, (class_text, panel_title) in enumerate(wordcloud_panels, start=1):
    plt.subplot(1, len(wordcloud_panels), panel_idx)
    plt.imshow(WordCloud(width=500, height=300, background_color='white').generate(class_text))
    plt.title(panel_title)
    plt.axis('off')
plt.show()

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Translation table built once instead of on every call.
# Apostrophes are deleted so contractions stay a single token ("don't" -> "dont");
# every other punctuation character becomes a space so that words separated only
# by punctuation ("fun,but") are not fused into one token ("funbut").
_PUNCT_TABLE = str.maketrans(
    {ch: (None if ch == "'" else ' ') for ch in string.punctuation}
)

def clean_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps: lowercase, strip HTML markup, drop digits, remove punctuation,
    collapse whitespace, tokenize, remove English stop words, lemmatize.

    Args:
        text: The raw review; any value is accepted and converted with str().

    Returns:
        A single string of the remaining lemmatised tokens separated by spaces
        (empty if nothing survives the filtering).
    """
    text = str(text).lower()
    # separator=' ' keeps words on either side of a tag apart ("good<br>bad").
    text = BeautifulSoup(text, "html.parser").get_text(separator=' ')
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCT_TABLE)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


# Run the cleaning function over every review (stringified first) and preview the result.
raw_reviews = df['user_review'].astype(str)
df['cleaned_review'] = raw_reviews.map(clean_text)
df.head()


In [None]:
# Readable class name for plots, plus the numeric 0/1 target derived from it.
label_names = {1: 'positive', 0: 'negative'}
label_codes = {'positive': 1, 'negative': 0}
df['sentiment'] = df['user_suggestion'].map(label_names)
df['sentiment_label'] = df['sentiment'].map(label_codes)


In [None]:
# Number of reviews per sentiment class.
df["sentiment"].value_counts()

In [None]:
# Features are the cleaned review strings; the target is the 0/1 sentiment label.
X = df['cleaned_review']
y = df['sentiment_label']

# Single unified split for all models (stratified on the label, fixed seed).
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# user_suggestion: 1 = positive, 0 = negative
# 'sentiment' and 'sentiment_label' are already derived from it in an earlier
# cell (before the train/test split), so they are not recomputed here; this
# cell only previews them next to the cleaned text.
df[['cleaned_review', 'sentiment', 'sentiment_label']].head()


In [None]:
import seaborn as sns

# Class distribution, using the readable sentiment names on the x-axis.
sentiment_ax = sns.countplot(data=df, x='sentiment')
sentiment_ax.set_title('Sentiment Class Distribution')
sentiment_ax.set_xlabel('Sentiment')
sentiment_ax.set_ylabel('Count')
plt.show()

# Show class balance numerically too
df['sentiment'].value_counts()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training reviews only, so the
# held-out test reviews cannot influence the features, then encode every row.
# X_tfidf keeps one row per review in df, aligned with y.
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
vectorizer.fit(X_train)
X_tfidf = vectorizer.transform(df['cleaned_review'])
y = df['sentiment_label']


# 3. Text Preprocessing


In [None]:
# Whitespace-split each cleaned review into a list of tokens.
df['tokens'] = df['cleaned_review'].str.split()



## Word2Vec Embedding
We train Word2Vec to create dense vector representations of words and average them to get review-level features.


In [None]:
# Tokenization
df['tokens'] = df['cleaned_review'].apply(lambda x: x.split())

# Train Word2Vec on the training reviews only: learning the embeddings from the
# test reviews as well would leak held-out text into the features.
train_tokens = df.loc[X_train.index, 'tokens'].tolist()
w2v_model = Word2Vec(sentences=train_tokens, vector_size=100, window=5, min_count=2, workers=4, epochs=30)

# Function to get average embedding
def get_avg_vector(tokens):
    """Average the Word2Vec vectors of the in-vocabulary tokens of one review.

    Args:
        tokens: Iterable of word tokens. A plain string is also accepted and is
            whitespace-split first; iterating it directly would yield single
            characters, not words.

    Returns:
        A 1-D array of length ``w2v_model.vector_size``: the mean embedding of
        the tokens known to the model, or all zeros when none are known.
    """
    if isinstance(tokens, str):
        tokens = tokens.split()
    valid_tokens = [word for word in tokens if word in w2v_model.wv]
    if not valid_tokens:
        # Size taken from the model so it stays in sync with vector_size above.
        return np.zeros(w2v_model.vector_size)
    return np.mean(w2v_model.wv[valid_tokens], axis=0)

# Apply
X_w2v = np.vstack(df['tokens'].apply(get_avg_vector))


# 4. Feature Engineering

We apply two different feature extraction methods:
- **TF-IDF Vectorization**: Converts documents into a matrix of term frequency and inverse document frequency.
- **Word2Vec Embeddings**: Learns dense vector representations of words from context.

These will be used as input features for our models.


In [None]:
# TF-IDF features: vocabulary and IDF weights are fitted on the training split
# only and then applied unchanged to the test split.
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Word2Vec features: X_train / X_test hold whole review strings, so they are
# split into word tokens before averaging. Iterating a raw string yields single
# characters, which would average character embeddings instead of word embeddings.
X_train_w2v = np.vstack(X_train.str.split().apply(get_avg_vector))
X_test_w2v = np.vstack(X_test.str.split().apply(get_avg_vector))


In [None]:
# Logistic regression on the TF-IDF features, with balanced class weights.
model_tfidf = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split.
tfidf_report = classification_report(y_test, y_pred_tfidf)
print("TF-IDF + Logistic Regression")
print(tfidf_report)


In [None]:
# Confusion matrix of the TF-IDF model on the test split.
cm_tfidf = confusion_matrix(y_test, y_pred_tfidf)
tfidf_cm_ax = sns.heatmap(cm_tfidf, annot=True, fmt='d', cmap='Blues')
tfidf_cm_ax.set_title('TF-IDF + Logistic Regression')
tfidf_cm_ax.set_xlabel('Predicted')
tfidf_cm_ax.set_ylabel('Actual')
plt.show()


In [None]:
# ROC curve of the TF-IDF model on the test split.
tfidf_roc = RocCurveDisplay.from_estimator(model_tfidf, X_test_tfidf, y_test)
tfidf_roc.ax_.set_title('ROC Curve - TF-IDF Model')
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (200 trees, fixed seed) on the averaged Word2Vec review vectors.
model_w2v = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train_w2v, y_train)
y_pred_w2v = model_w2v.predict(X_test_w2v)


In [None]:
# Confusion matrix of the Word2Vec model on the test split.
cm_w2v = confusion_matrix(y_test, y_pred_w2v)

sns.heatmap(cm_w2v, annot=True, fmt='d', cmap='Purples')
# model_w2v is a RandomForestClassifier, so the figure is labelled accordingly
# (the same name the final comparison table uses for this model).
plt.title('Word2Vec + Random Forest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# ROC curve of the Word2Vec model on the test split.
w2v_roc = RocCurveDisplay.from_estimator(model_w2v, X_test_w2v, y_test)
w2v_roc.ax_.set_title('ROC Curve - Word2Vec Model')
plt.show()


In [None]:
from transformers import pipeline

# Pre-trained sentiment pipeline from Hugging Face transformers, used as-is.
SENTIMENT_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
classifier = pipeline("sentiment-analysis", model=SENTIMENT_MODEL_NAME)

# Test Example
result = classifier("This game is absolutely terrible.")
print(result)

# Apply on full dataset (optional, computationally heavy)
# df['bert_sentiment'] = df['user_review'].apply(lambda x: classifier(x)[0]['label'])


## 5. Deep Learning Model – LSTM

While traditional models like TF-IDF + Logistic Regression and Word2Vec + Random Forest are effective at capturing frequency-based or semantic features, they cannot fully capture sequential relationships in text.

LSTM (Long Short-Term Memory) is a type of recurrent neural network (RNN) that captures long-term dependencies in sequences, making it highly suitable for text data where word order matters.

In this section, we implement an LSTM-based model to classify sentiment from Steam reviews.


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Parameters
vocab_size = 10000        # number of most frequent words kept by the tokenizer
max_length = 100          # every sequence is padded / truncated to this length
embedding_dim = 100       # size of the learned word embeddings
trunc_type = 'post'       # cut overly long reviews at the end
padding_type = 'post'     # pad short reviews at the end
oov_tok = "<OOV>"         # placeholder for words outside the vocabulary

# Tokenizer: the vocabulary is learned from the training reviews only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad with the parameters declared above rather than repeating their values,
# so the sequence length cannot drift away from the model's input length.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)



In [None]:
# Stacked bidirectional LSTM classifier over learned word embeddings.
# The input length is declared with an explicit Input layer: the Embedding
# `input_length` argument is deprecated in Keras 3, and without a known input
# shape the model is not built when summary() is called.
model_lstm = Sequential([
    tf.keras.Input(shape=(max_length,)),
    Embedding(vocab_size, embedding_dim),
    Bidirectional(LSTM(64, return_sequences=True)),  # keeps the sequence for the next LSTM
    Dropout(0.3),
    Bidirectional(LSTM(32)),                         # returns only the final state
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')                   # probability of the positive class
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_lstm.summary()


In [None]:
# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Validation data is carved out of the TRAINING set. Using the test set here
# would let early stopping / best-weight selection tune the model on the very
# data it is later scored on, making the reported test metrics optimistic.
# Labels are passed as a NumPy array so Keras can slice them for the split.
history = model_lstm.fit(
    X_train_pad, np.asarray(y_train),
    epochs=10,
    validation_split=0.1,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)


In [None]:
# Score the trained LSTM on the held-out test split.
lstm_scores = model_lstm.evaluate(X_test_pad, y_test)
loss, accuracy = lstm_scores
print('✅ Test Accuracy: {:.4f}'.format(accuracy))


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions
# predict() returns one sigmoid probability per review as an (n, 1) column;
# threshold at 0.5 and flatten to a 1-D label vector so it has the same shape
# as y_test and as the other models' predictions (an (n, 1) array would
# broadcast to (n, n) in element-wise comparisons with a 1-D vector).
y_pred_probs = model_lstm.predict(X_test_pad)
y_pred = (y_pred_probs >= 0.5).astype(int).ravel()
y_pred_lstm = y_pred

# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
plt.title('LSTM Model Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# Side-by-side test-set metrics for the three models, best accuracy first.
model_predictions = [
    ('TF-IDF + Logistic Regression', y_pred_tfidf),
    ('Word2Vec + Random Forest', y_pred_w2v),
    ('LSTM (Deep Learning)', y_pred_lstm),
]
metric_functions = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
]

# One record per model; dict insertion order fixes the column order.
comparison_rows = []
for model_name, predictions in model_predictions:
    row = {'Model': model_name}
    for metric_name, metric_fn in metric_functions:
        row[metric_name] = metric_fn(y_test, predictions)
    comparison_rows.append(row)

comparison_df = pd.DataFrame(comparison_rows)
print(comparison_df.sort_values(by='Accuracy', ascending=False))
