# Amazon Mobile Reviews NLP Coursework
End-to-end workflow: clean reviews, build features, explore linguistics, topics, sentiment, and clustering.


## 0) Config & Imports
Set seeds, configure paths, and load core libraries.


In [None]:
import warnings
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# --- Dataset and experiment constants ---------------------------------------
DATA_PATH = Path('dataset/Amazon_Unlocked_Mobile.csv')  # CSV holding the 'Reviews' column
RANDOM_SEED = 42            # seed for NumPy and for seeded steps that reference it
SAMPLE_SIZE = 2000          # upper bound on the number of reviews sampled for experiments
N_TFIDF_FEATURES = 10000    # vocabulary cap handed to the TF-IDF vectorizer
N_TFIDF_TOP_TERMS = 15      # number of top-ranked TF-IDF terms to display
POS_NER_SAMPLE = 5          # reviews shown in the qualitative POS/NER table
NER_SUBSET = 500            # reviews scanned when aggregating named entities

# --- Display / plotting / warning behaviour for the whole notebook ----------
pd.set_option('display.max_colwidth', 160)
sns.set_theme(style='whitegrid', palette='viridis')
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so NumPy-based randomness is repeatable
np.random.seed(RANDOM_SEED)


In [None]:
# Download required NLTK resources (idempotent).
# 'punkt_tab' is the tokenizer data that newer NLTK releases load for
# word_tokenize; 'punkt' is kept so older releases keep working too.
for pkg in ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4', 'vader_lexicon']:
    nltk.download(pkg, quiet=True)


## 1) Load & Sample Data
Read the dataset, drop rows whose review text is missing, and work on a reproducible (fixed-seed) random sample to keep runtime manageable.


In [None]:
# Fail fast with a clear message when the CSV is not where we expect it
dataset_found = DATA_PATH.exists()
if not dataset_found:
    raise FileNotFoundError(f'Missing dataset at {DATA_PATH}.')

# Read only the review text and discard rows where it is missing
columns_to_use = ['Reviews']
df_raw = (
    pd.read_csv(DATA_PATH, usecols=columns_to_use)
    .dropna(subset=['Reviews'])
    .reset_index(drop=True)
)

# Draw a fixed-seed sample (never larger than the data) and keep a string
# copy of each review under 'Original'
sample_n = min(SAMPLE_SIZE, len(df_raw))
df_sample = (
    df_raw
    .sample(n=sample_n, random_state=RANDOM_SEED)
    .reset_index(drop=True)
    .assign(Original=lambda frame: frame['Reviews'].astype(str))
)

print(f'Loaded {len(df_raw):,} reviews; using {len(df_sample):,} rows for experiments.')
df_sample.head()


## 2) Part A – Text Preprocessing
Clean text, strip noise, remove stopwords, and derive stemmed/lemmatized tokens for downstream tasks.


In [None]:
# Shared preprocessing resources, created once and reused by the helpers below
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# A set gives constant-time membership checks during stopword filtering
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)


# Patterns and translation table are built once instead of on every call
_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+|www\S+')
_DIGITS_RE = re.compile(r'\d+')
_SPACES_RE = re.compile(r'\s+')
_STRIP_PUNCT = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Normalise a raw review into lowercase, noise-free text.

    Steps, in order: lowercase; replace HTML-like tags, URLs and digit runs
    with a space; delete ASCII punctuation (characters are removed, not
    replaced, so "don't" becomes "dont"); collapse whitespace runs and trim.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text; an empty string if nothing survives cleaning.
    """
    lowered = text.lower()
    for noise in (_TAG_RE, _URL_RE, _DIGITS_RE):
        lowered = noise.sub(' ', lowered)
    without_punct = lowered.translate(_STRIP_PUNCT)
    return _SPACES_RE.sub(' ', without_punct).strip()


def tokenize_and_remove_stopwords(text: str):
    """Tokenize *text* and keep only the content-bearing tokens.

    A token is dropped when it is two characters or shorter, or when it
    appears in the module-level ``stop_words`` set.

    Args:
        text: Text to tokenize (the pipeline passes already-cleaned text).

    Returns:
        List of surviving tokens in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        if len(token) <= 2 or token in stop_words:
            continue
        kept.append(token)
    return kept


def stem_tokens(tokens):
    """Return a new list with each token passed through the shared stemmer.

    Args:
        tokens: Iterable of token strings.

    Returns:
        List of stemmed tokens, same order and length as the input.
    """
    return list(map(stemmer.stem, tokens))


def lemmatize_tokens(tokens):
    """Return a new list with each token passed through the shared lemmatizer.

    Args:
        tokens: Iterable of token strings.

    Returns:
        List of lemmatized tokens, same order and length as the input.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


In [None]:
# Run the preprocessing stages on a copy so df_sample stays untouched
df_proc = df_sample.copy()

# (output column, input column, transformation), applied in this order
token_stages = (
    ('Cleaned', 'Original', clean_text),
    ('Tokens', 'Cleaned', tokenize_and_remove_stopwords),
    ('Stemmed_Tokens', 'Tokens', stem_tokens),
    ('Lemmatized_Tokens', 'Tokens', lemmatize_tokens),
)
for target_col, source_col, stage in token_stages:
    df_proc[target_col] = df_proc[source_col].apply(stage)

# Space-joined string versions of the token lists for quick display
joined_columns = (
    ('Stemmed', 'Stemmed_Tokens'),
    ('Lemmatized', 'Lemmatized_Tokens'),
)
for text_col, token_col in joined_columns:
    df_proc[text_col] = df_proc[token_col].apply(' '.join)


In [None]:
# Show up to five randomly chosen (fixed-seed) rows, raw text next to each processed form
preview_columns = ['Original', 'Cleaned', 'Stemmed', 'Lemmatized']
preview_rows = min(5, len(df_proc))
preview_table = df_proc.sample(n=preview_rows, random_state=123)[preview_columns]
preview_table


## 3) Part B – TF-IDF Feature Extraction
Build a sparse TF-IDF matrix (unigrams + bigrams) from the lemmatized text and inspect the terms with the highest mean TF-IDF weight across the corpus.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One lemmatized, space-joined string per review
corpus = df_proc['Lemmatized'].astype(str).tolist()

# Vectorizer settings gathered in one place for easy tweaking
tfidf_settings = {
    'max_df': 0.85,                    # drop terms present in more than 85% of reviews
    'min_df': 5,                       # drop terms present in fewer than 5 reviews
    'ngram_range': (1, 2),             # unigrams and bigrams
    'max_features': N_TFIDF_FEATURES,
    'stop_words': None,                # no stop-word list is applied by the vectorizer
    'sublinear_tf': True,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

X_tfidf = vectorizer.fit_transform(corpus)
feature_names = np.array(vectorizer.get_feature_names_out())

print('TF-IDF matrix shape:', X_tfidf.shape)


In [None]:
# Top weighted terms across the corpus

top_n = N_TFIDF_TOP_TERMS
# Column-wise mean weight per term. np.asarray(...).ravel() flattens the result
# whether the sparse container returns an np.matrix or a plain ndarray.
mean_tfidf = np.asarray(X_tfidf.mean(axis=0)).ravel()
# Indices of the largest means, highest first (may be fewer than top_n)
top_indices = mean_tfidf.argsort()[::-1][:top_n]

pd.DataFrame({
    # Rank length follows the terms actually available, so a vocabulary
    # smaller than top_n no longer causes a column-length mismatch
    'Rank': np.arange(1, len(top_indices) + 1),
    'Term': feature_names[top_indices],
    'Mean_TF_IDF': mean_tfidf[top_indices]
})


## 4) Part C – POS Tagging & Named Entity Recognition
Use spaCy to tag parts of speech and extract key entities. Uncomment installs below if the model is missing.


In [None]:
# If spaCy or the model is missing, uncomment:
# !pip install spacy
# !python -m spacy download en_core_web_sm

import spacy

# Load the small English pipeline; spacy.load raises OSError when the model
# package is not installed, which we re-raise with an actionable hint.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError as err:
    # Chain the original error explicitly so its details stay in the traceback
    raise OSError('spaCy model en_core_web_sm is not installed. Uncomment the install lines above to fetch it.') from err


In [None]:
# Draw a small fixed-seed sample of reviews for a qualitative POS/NER look
n_pos_ner_rows = min(POS_NER_SAMPLE, len(df_proc))
pos_ner_sample = df_proc.sample(n=n_pos_ner_rows, random_state=77).copy()

def get_pos_tags(doc):
    """Render a parsed document as space-separated ``text/POS`` pairs.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_`` attributes.

    Returns:
        A single string such as ``"Great/ADJ phone/NOUN"``; empty when the
        document has no tokens.
    """
    pairs = (f"{token.text}/{token.pos_}" for token in doc)
    return ' '.join(pairs)


def get_entities(doc):
    """Summarise a document's named entities as ``text (LABEL)`` items.

    Args:
        doc: Object whose ``ents`` attribute is a sequence of entities
            exposing ``text`` and ``label_``.

    Returns:
        Comma-separated entity descriptions, or an empty string when the
        document has no entities.
    """
    return ', '.join(f"{ent.text} ({ent.label_})" for ent in doc.ents) if doc.ents else ''

# Parse the sampled reviews once, then derive both display columns from the same docs
review_texts = pos_ner_sample['Original'].tolist()
sample_docs = list(nlp.pipe(review_texts))

pos_ner_sample['POS_Tags'] = list(map(get_pos_tags, sample_docs))
pos_ner_sample['Named_Entities'] = list(map(get_entities, sample_docs))

pos_ner_sample[['Original', 'POS_Tags', 'Named_Entities']]


In [None]:
# Aggregate PRODUCT / ORG / DATE entities over a wider subset
from collections import Counter

# Fixed-seed pool of raw reviews to scan for entities
ner_pool_size = min(NER_SUBSET, len(df_proc))
ner_pool = df_proc.sample(n=ner_pool_size, random_state=101)['Original'].tolist()

# Count (entity text, label) pairs, restricted to the labels of interest
wanted_labels = {'PRODUCT', 'ORG', 'DATE'}
entity_counter = Counter(
    (ent.text, ent.label_)
    for doc in nlp.pipe(ner_pool, batch_size=32, n_process=1)
    for ent in doc.ents
    if ent.label_ in wanted_labels
)

# Tabulate the 20 most frequent pairs
entity_rows = []
for (text, label), count in entity_counter.most_common(20):
    entity_rows.append({'Entity_Text': text, 'Label': label, 'Count': count})

pd.DataFrame(entity_rows)


## 5) Part D – Topic Modelling (LDA)
Train an LDA model on lemmatized tokens and visualise keyword distributions per topic. Uncomment installs if needed.


In [None]:
# If needed:
# !pip install gensim
# !pip install seaborn

import gensim
from gensim import corpora


In [None]:
# Map lemmatized tokens to integer ids, then prune the vocabulary:
# keep tokens found in at least 5 reviews and in at most 80% of them
lda_tokens = df_proc['Lemmatized_Tokens'].tolist()
dictionary = corpora.Dictionary(lda_tokens)
dictionary.filter_extremes(no_below=5, no_above=0.8)

# Bag-of-words representation of every review over the pruned vocabulary
lda_corpus = list(map(dictionary.doc2bow, lda_tokens))
vocab_size = len(dictionary)
print(f'Vocabulary size after filtering: {vocab_size}')


In [None]:
# Train LDA
num_topics = 3
lda_model = gensim.models.LdaModel(
    corpus=lda_corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=RANDOM_SEED  # fixed seed so topics are repeatable across runs
)

# Print each topic's top 10 words, one blank line between topics.
# The newline must be an escape sequence inside the literal: a real line
# break inside a single-quoted f-string is a SyntaxError.
for i, topic in lda_model.print_topics(num_words=10):
    print(f'Topic {i+1}: {topic}\n')


In [None]:
# Plot top terms for each topic

def plot_topic_keywords(topic_num, lda_model, n_words=10):
    """Draw a horizontal bar chart of one topic's top keywords.

    Args:
        topic_num: Zero-based topic index; shown one-based in the title.
        lda_model: Trained model exposing ``show_topic(topic_id, topn=...)``
            that yields ``(word, weight)`` pairs.
        n_words: Number of keywords to plot.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    keywords, importances = [], []
    for word, weight in lda_model.show_topic(topic_num, topn=n_words):
        keywords.append(word)
        importances.append(weight)

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(x=importances, y=keywords, orient='h', ax=ax)
    ax.set_title(f'Top {n_words} Keywords for Topic {topic_num+1}')
    ax.set_xlabel('Weight / Importance')
    ax.set_ylabel('Keyword')
    fig.tight_layout()
    plt.show()

# One keyword chart per learned topic
for topic_idx in range(lda_model.num_topics):
    plot_topic_keywords(topic_idx, lda_model)


## 6) Part E – Sentiment Analysis (VADER)
Score each review, inspect distribution, and surface potentially ambiguous cases.


In [None]:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One shared VADER analyzer, reused by every scoring call
sid = SentimentIntensityAnalyzer()


def classify_sentiment(text):
    """Label *text* as 'Positive', 'Negative' or 'Neutral' from its VADER score.

    Args:
        text: Review text to score.

    Returns:
        'Positive' when the compound score is >= 0.05, 'Negative' when it is
        <= -0.05, and 'Neutral' otherwise.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound <= -0.05:
        return 'Negative'
    if compound >= 0.05:
        return 'Positive'
    return 'Neutral'

# Score every review exactly once. Building the frame from a list of score
# dicts avoids the slow per-row `.apply(pd.Series)` expansion, and naming the
# columns makes the assignment name-based instead of dependent on dict order.
score_columns = ['neg', 'neu', 'pos', 'compound']
sentiment_scores = pd.DataFrame(
    [sid.polarity_scores(text) for text in df_proc['Original']],
    index=df_proc.index,
    columns=score_columns,
)
df_proc[score_columns] = sentiment_scores

# Derive the label from the compound score already computed, rather than
# re-running VADER on every review. Thresholds are the same as in
# classify_sentiment: >= 0.05 positive, <= -0.05 negative, otherwise neutral.
compound_scores = df_proc['compound']
df_proc['Sentiment'] = np.select(
    [compound_scores >= 0.05, compound_scores <= -0.05],
    ['Positive', 'Negative'],
    default='Neutral',
)

df_proc[['Original', 'Sentiment']].head()


In [None]:
# Bar chart of review counts per sentiment label, in a fixed left-to-right order
label_order = ['Negative', 'Neutral', 'Positive']
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(data=df_proc, x='Sentiment', order=label_order, ax=ax)
ax.set_title('Sentiment Distribution of Amazon Mobile Reviews')
ax.set_xlabel('Sentiment Category')
ax.set_ylabel('Number of Reviews')
fig.tight_layout()
plt.show()


In [None]:
# Simple heuristics to flag potential misclassifications
signals = ['but', 'however', 'although']
# Match contrast words as whole words only: a plain substring test also fires
# on words that merely contain them (e.g. "button" contains "but").
signal_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, signals)) + r')\b')
flagged = []

# itertuples over just the needed columns avoids building a Series per row
review_fields = df_proc[['Original', 'Sentiment', 'compound']]
for text, sentiment, compound in review_fields.itertuples(index=False, name=None):
    lower = text.lower()
    has_contrast = signal_pattern.search(lower) is not None
    # Strong polarity word that disagrees with the predicted label
    praised_but_negative = 'great' in lower and sentiment == 'Negative'
    panned_but_positive = 'terrible' in lower and sentiment == 'Positive'
    if has_contrast or praised_but_negative or panned_but_positive:
        flagged.append((text, sentiment, compound))

flagged[:5]


## 7) Part F – Dimensionality Reduction & Clustering
Project TF-IDF vectors with Truncated SVD (which works directly on sparse data) and inspect structure via a scatter plot and a dendrogram built from the first two components.


In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the sparse TF-IDF matrix to 50 dense components (seeded for repeatability)
svd = TruncatedSVD(n_components=50, random_state=RANDOM_SEED)
X_svd = svd.fit_transform(X_tfidf)

leading_ratios = svd.explained_variance_ratio_[:2]
print('Explained variance ratio (first 2 comps):', leading_ratios)

# Keep the first two components on the frame for plotting and clustering
for position, column in enumerate(('Comp1', 'Comp2')):
    df_proc[column] = X_svd[:, position]


In [None]:
# Scatter of the first two SVD components, one colour per sentiment label
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df_proc, x='Comp1', y='Comp2', hue='Sentiment', alpha=0.7, ax=ax)
ax.set_title('Truncated SVD Projection of TF-IDF Features (coloured by Sentiment)')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.legend(title='Sentiment')
fig.tight_layout()
plt.show()


In [None]:
# Hierarchical clustering on a subset
from scipy.cluster.hierarchy import linkage, dendrogram

# Cluster a small fixed-seed subset so the dendrogram leaves stay readable
subset_size = min(120, len(df_proc))
df_subset = df_proc.sample(n=subset_size, random_state=123)

# Ward linkage on the two stored SVD components only
X_subset = df_subset[['Comp1', 'Comp2']].to_numpy()
Z = linkage(X_subset, method='ward')

# Each leaf is labelled with the sentiment of its review
leaf_labels = list(df_subset['Sentiment'])
plt.figure(figsize=(10, 6))
dendrogram(Z, labels=leaf_labels, leaf_rotation=90)
plt.title('Hierarchical Clustering Dendrogram (Ward linkage on SVD features)')
plt.xlabel('Review (Sentiment label)')
plt.ylabel('Distance')
plt.tight_layout()
plt.show()


---
**Next ideas:** plug in supervised baselines (e.g., Logistic Regression on TF-IDF) or try BERTopic for richer topics.
