<a href="https://colab.research.google.com/github/SteevAbrahamThomas/new-test/blob/main/exitexam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [22]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/nlp/reviews.csv')
df.head()


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [23]:
# Install NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Import libraries
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#1: Baseline model

In [24]:
# Load a sample of the dataset
df = pd.read_csv('reviews.csv', usecols=['Text', 'Score'], nrows=10000)
df.dropna(inplace=True)
df = df[df['Score'] != 3]  # Remove neutral reviews
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)

# Fast text cleaning function
def quick_clean(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    text = re.sub(r"\d+", "", text)
    stop_words = set(stopwords.words('english'))
    return " ".join([word for word in text.split() if word not in stop_words])

df['Cleaned_Text'] = df['Text'].apply(quick_clean)


In [25]:
# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Cleaned_Text'])
y = df['Sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.94      0.28      0.43       282
           1       0.88      1.00      0.94      1546

    accuracy                           0.89      1828
   macro avg       0.91      0.64      0.68      1828
weighted avg       0.89      0.89      0.86      1828



In [26]:
print(df.columns)

Index(['Score', 'Text', 'Sentiment', 'Cleaned_Text'], dtype='object')


**Q: Why is TF-IDF often a better choice for text classification than a simple Bag of Words (Count Vectorizer)?**

TF-IDF (Term Frequency-Inverse Document Frequency) not only considers the frequency of words in a document but also penalizes common terms that appear across many documents, reducing their importance. This helps focus on words that are more unique and informative for each document, leading to better discrimination between classes. In contrast, Count Vectorizer simply counts word occurrences, so common but uninformative words may dominate the representation and hurt classification performance.

#2: Word Embedding Model

In [7]:
from google.colab import files
uploaded = files.upload()


Saving reviews.csv to reviews (1).csv


In [29]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [23]:
# Install NLTK resources
import nltk
nltk.download('stopwords')

# Import libraries
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
print(df.columns)


Index(['Score', 'Text', 'Sentiment'], dtype='object')


In [32]:
# Load the dataset
df = pd.read_csv("/content/reviews.csv")  # Adjust path if needed

# Drop missing text
df.dropna(subset=['Text'], inplace=True)

# Remove neutral reviews
df = df[df['Score'] != 3]

# Sample for faster processing
df = df.sample(n=5000, random_state=42)

# Create binary sentiment label
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)



In [33]:
stop_words = set(stopwords.words('english'))

def tokenize(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    tokens = text.split()  # Simple tokenizer to avoid punkt_tab error
    return [word for word in tokens if word not in stop_words]

df['Tokens'] = df['Text'].apply(tokenize)



In [34]:
df = df.sample(n=5000, random_state=42)  # Use 5,000 reviews instead of full set

In [35]:
w2v_model = Word2Vec(sentences=df['Tokens'], vector_size=100, window=5, min_count=2, workers=4)

In [36]:
# Efficient vector averaging for each review
def get_vector(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)

# Apply with progress bar for visibility
from tqdm import tqdm
tqdm.pandas()

X = np.vstack(df['Tokens'].progress_apply(get_vector))
y = df['Sentiment'].values



100%|██████████| 5000/5000 [00:00<00:00, 12677.01it/s]


In [38]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# rf_model = RandomForestClassifier()
# rf_model.fit(X_train, y_train)

# y_pred = rf_model.predict(X_test)
# print(classification_report(y_test, y_pred))


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.61      0.07      0.13       157
           1       0.85      0.99      0.92       843

    accuracy                           0.85      1000
   macro avg       0.73      0.53      0.52      1000
weighted avg       0.81      0.85      0.79      1000



#Q:What Is One Key Advantage of Using Word2Vec Over TF-IDF?

TF-IDF treats words as independent tokens and ignores their semantic relationships. Word2Vec, however, learns word embeddings based on context, capturing the meaning and similarity between words. For example, "awesome" and "fantastic" may have similar vector representations in Word2Vec, but TF-IDF treats them as unrelated.

This ability to capture semantic meaning makes Word2Vec more powerful for tasks like sentiment analysis, where context and nuance matter.


#3: LSTM Model for Sentiment Classification

In [39]:
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
# Load dataset
df = pd.read_csv("/content/reviews.csv")
df.dropna(subset=['Text'], inplace=True)
df = df[df['Score'] != 3]  # Remove neutral reviews
df = df.sample(n=5000, random_state=42)  # Sample for speed
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)


In [41]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    tokens = text.split()
    return " ".join([word for word in tokens if word not in stop_words])

df['Cleaned_Text'] = df['Text'].apply(clean_text)

# Tokenize and pad
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['Cleaned_Text'])
X_seq = tokenizer.texts_to_sequences(df['Cleaned_Text'])
X_pad = pad_sequences(X_seq, maxlen=100)
y = df['Sentiment'].values


In [42]:
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show model summary for screenshot
model.summary()




In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2)

y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.7858 - loss: 0.6088 - val_accuracy: 0.8625 - val_loss: 0.4005
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8496 - loss: 0.4092 - val_accuracy: 0.8625 - val_loss: 0.3852
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8540 - loss: 0.3518 - val_accuracy: 0.8637 - val_loss: 0.3389
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9048 - loss: 0.2294 - val_accuracy: 0.8863 - val_loss: 0.2760
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9639 - loss: 0.1312 - val_accuracy: 0.9025 - val_loss: 0.2693
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
              precision    recall  f1-score   support

           0       0.74      0.43      0.54       155
           1      

#Q: Why Is LSTM Preferred Over Simple RNN for Text Classification?

Simple RNNs suffer from the **vanishing gradient problem**, which makes it difficult for them to learn long-term dependencies in text. As gradients shrink during backpropagation, early layers receive little signal and fail to learn effectively.

LSTMs solve this by using **gated mechanisms** (input, forget, and output gates) that allow them to retain and update information over longer sequences. This makes LSTMs more robust for tasks like sentiment analysis, where context and word order matter.


#4: Model Comparison

In [52]:
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm import tqdm
tqdm.pandas()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
# Load and sample
df = pd.read_csv("/content/reviews.csv")
df.dropna(subset=['Text'], inplace=True)
df = df[df['Score'] != 3]
df = df.sample(n=5000, random_state=42)
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)

# Tokenize
stop_words = set(stopwords.words('english'))

def tokenize(text):
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    tokens = text.split()
    return [word for word in tokens if word not in stop_words]

df['Tokens'] = df['Text'].apply(tokenize)


In [54]:
# Train Word2Vec
w2v_model = Word2Vec(sentences=df['Tokens'], vector_size=100, window=5, min_count=2, workers=4)

# Convert reviews to vectors
def get_vector(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)

X_w2v = np.vstack(df['Tokens'].progress_apply(get_vector))
y_w2v = df['Sentiment'].values


100%|██████████| 5000/5000 [00:00<00:00, 7797.92it/s]


In [55]:
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_w2v, y_w2v, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_w2v, y_train_w2v)
y_pred_w2v = rf_model.predict(X_test_w2v)

# Metrics
acc_w2v = accuracy_score(y_test_w2v, y_pred_w2v)
f1_w2v = f1_score(y_test_w2v, y_pred_w2v)
roc_w2v = roc_auc_score(y_test_w2v, y_pred_w2v)


In [56]:
# Reuse X_pad and y from your LSTM pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Rebuild model for clean evaluation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=128, validation_split=0.2)

# Predict and evaluate
y_pred_lstm = (model.predict(X_test_lstm) > 0.5).astype("int32")

acc_lstm = accuracy_score(y_test_lstm, y_pred_lstm)
f1_lstm = f1_score(y_test_lstm, y_pred_lstm)
roc_lstm = roc_auc_score(y_test_lstm, y_pred_lstm)


Epoch 1/5




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.7928 - loss: 0.5972 - val_accuracy: 0.8625 - val_loss: 0.4042
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8503 - loss: 0.4057 - val_accuracy: 0.8625 - val_loss: 0.3829
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.8537 - loss: 0.3437 - val_accuracy: 0.8687 - val_loss: 0.3139
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9099 - loss: 0.2102 - val_accuracy: 0.8788 - val_loss: 0.2674
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9590 - loss: 0.1179 - val_accuracy: 0.8988 - val_loss: 0.2785
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [57]:
import pandas as pd

comparison = pd.DataFrame({
    "Model": ["TF-IDF + Logistic Regression", "Word2Vec + Random Forest", "LSTM"],
    "Accuracy": [acc_tfidf, acc_w2v, acc_lstm],
    "F1-Score": [f1_tfidf, f1_w2v, f1_lstm],
    "ROC-AUC": [roc_tfidf, roc_w2v, roc_lstm]
})

comparison.round(4)


Unnamed: 0,Model,Accuracy,F1-Score,ROC-AUC
0,TF-IDF + Logistic Regression,0.861,0.9239,0.5542
1,Word2Vec + Random Forest,0.84,0.9125,0.5128
2,LSTM,0.89,0.9379,0.6847


#Q: Based on your results, which model would you recommend for deployment? Justify your choice by considering not just the performance metrics but also the trade-offs in model complexity, training time, and interpretability

Based on the comparison table, the **LSTM model** delivers the strongest performance across Accuracy, F1-Score, and ROC-AUC. It effectively captures long-term dependencies and semantic relationships in text, making it ideal for sentiment classification.

However, LSTM models are computationally intensive and slower to train. If deployment requires speed and interpretability, **TF-IDF + Logistic Regression** is a simpler and explainable alternative. For production environments where accuracy is critical and resources are available, **LSTM is the recommended choice**.


#5:Streamlit Deployment for Sentiment Prediction

In [58]:
import pickle

# Save tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Save model
model.save("lstm_model.h5")




In [59]:
from google.colab import files
files.download("tokenizer.pkl")
files.download("lstm_model.h5")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [61]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0


In [62]:
import streamlit as st
import pickle
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load tokenizer and model
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

model = load_model("lstm_model.h5")

# Streamlit UI
st.title("Amazon Review Sentiment Classifier")
st.write("Paste a product review below to predict its sentiment:")

review = st.text_area("Enter Review Text")

if st.button("Predict Sentiment"):
    if review.strip() == "":
        st.warning("Please enter a review.")
    else:
        # Preprocess and predict
        seq = tokenizer.texts_to_sequences([review])
        padded = pad_sequences(seq, maxlen=100)
        pred = model.predict(padded)[0][0]
        sentiment = "Positive" if pred > 0.5 else "Negative"
        st.success(f"Predicted Sentiment: **{sentiment}**")


2025-10-15 11:18:21.354 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-10-15 11:18:21.362 Session state does not function when running a script without `streamlit run`


In [64]:
model.save("lstm_model.h5")
from google.colab import files
files.download("lstm_model.h5")




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [65]:
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

files.download("tokenizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>