In [20]:
# !pip install pandas numpy nltk scikit-learn

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import os

# Download once (fixed list)
nltk.download('punkt')
nltk.download('punkt_tab')                       # 👈 important
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng') # 👈 important fix

print("✅ Libraries ready")


✅ Libraries ready


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vedha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vedha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vedha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vedha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nl

In [21]:
# Your dataset path
csv_path = r"C:\Users\vedha\OneDrive\Desktop\Assignment - 5.1\product_reviews.csv"

# Use latin1 to avoid UnicodeDecodeError
df = pd.read_csv(csv_path, encoding="latin1", on_bad_lines="skip")

print("✅ Dataset loaded successfully")
print("Columns:", df.columns.tolist())
df.head()


✅ Dataset loaded successfully
Columns: ['Review_ID', 'Review_Text']


Unnamed: 0,Review_ID,Review_Text
0,1,"""The product is GREAT! Loved it, but its a bi..."
1,2,"""Worst product ever!! Wouldnt recommend to an..."
2,3,"""Satisfactory quality, works as expected, no m..."
3,4,"""Amazing product, I would buy it again and aga..."
4,5,"""The delivery was slow, but the product is good."""


In [22]:
# Manually set the column containing reviews
text_col = "Review_Text"   # 👈 change if needed (e.g. "Review", "Text")

# Show first 10 reviews
df[text_col].head(10)


0    "The product is GREAT! Loved it, but its a bi...
1    "Worst product ever!! Wouldnt recommend to an...
2    "Satisfactory quality, works as expected, no m...
3    "Amazing product, I would buy it again and aga...
4    "The delivery was slow, but the product is good."
5    "Horrible experience, the product broke after ...
6    "Great value for the price! Definitely worth b...
7    "The product didnt meet my expectations, retu...
8    "Im satisfied with the purchase, but there ar...
9    "Superb product! Excellent build quality and g...
Name: Review_Text, dtype: object

In [23]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_text_basic(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def get_wordnet_pos(nltk_pos_tag):
    if nltk_pos_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_pos_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_pos_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_text(text):
    cleaned = clean_text_basic(text)
    tokens = word_tokenize(cleaned)
    tokens_nostop = [t for t in tokens if t not in stop_words and len(t) > 1]
    stems = [ps.stem(t) for t in tokens_nostop]
    pos_tags = pos_tag(tokens_nostop, lang="eng")  # 👈 ensure correct tagger
    lemmas = [lemmatizer.lemmatize(t, get_wordnet_pos(p)) for t, p in pos_tags]
    return {
        'cleaned': cleaned,
        'tokens': tokens,
        'tokens_nostop': tokens_nostop,
        'stems': stems,
        'lemmas': lemmas,
        'final_text': " ".join(lemmas)
    }

print("✅ Preprocessing functions ready")

# Quick test
print(preprocess_text(df[text_col].iloc[0]))


✅ Preprocessing functions ready
{'cleaned': 'the product is great loved it but it s a bit pricey', 'tokens': ['the', 'product', 'is', 'great', 'loved', 'it', 'but', 'it', 's', 'a', 'bit', 'pricey'], 'tokens_nostop': ['product', 'great', 'loved', 'bit', 'pricey'], 'stems': ['product', 'great', 'love', 'bit', 'pricey'], 'lemmas': ['product', 'great', 'loved', 'bit', 'pricey'], 'final_text': 'product great loved bit pricey'}


In [24]:
proc = df[text_col].fillna("").astype(str).apply(preprocess_text)

df['cleaned'] = proc.apply(lambda x: x['cleaned'])
df['tokens'] = proc.apply(lambda x: x['tokens'])
df['tokens_nostop'] = proc.apply(lambda x: x['tokens_nostop'])
df['stems'] = proc.apply(lambda x: x['stems'])
df['lemmas'] = proc.apply(lambda x: x['lemmas'])
df['final_text'] = proc.apply(lambda x: x['final_text'])

df[[text_col, 'cleaned', 'final_text']].head(10)


Unnamed: 0,Review_Text,cleaned,final_text
0,"""The product is GREAT! Loved it, but its a bi...",the product is great loved it but it s a bit p...,product great loved bit pricey
1,"""Worst product ever!! Wouldnt recommend to an...",worst product ever wouldn t recommend to anyone,bad product ever recommend anyone
2,"""Satisfactory quality, works as expected, no m...",satisfactory quality works as expected no majo...,satisfactory quality work expect major issue
3,"""Amazing product, I would buy it again and aga...",amazing product i would buy it again and again,amazing product would buy
4,"""The delivery was slow, but the product is good.""",the delivery was slow but the product is good,delivery slow product good
5,"""Horrible experience, the product broke after ...",horrible experience the product broke after ju...,horrible experience product break one use
6,"""Great value for the price! Definitely worth b...",great value for the price definitely worth buying,great value price definitely worth buying
7,"""The product didnt meet my expectations, retu...",the product didn t meet my expectations return...,product meet expectation return
8,"""Im satisfied with the purchase, but there ar...",i m satisfied with the purchase but there are ...,satisfied purchase well option available
9,"""Superb product! Excellent build quality and g...",superb product excellent build quality and gre...,superb product excellent build quality great c...


In [29]:
all_lemmas = df['lemmas'].explode().dropna()
vocab = sorted(all_lemmas.unique())
print("✅ Vocabulary size:", len(vocab))

term_to_idx = {term: idx for idx, term in enumerate(vocab)}
N = len(df)

df_counts = np.zeros(len(vocab), dtype=int)
for lem_list in df['lemmas']:
    if not lem_list:
        continue
    for t in set(lem_list):
        df_counts[term_to_idx[t]] += 1

print("Sample vocab terms:", vocab[:15])


✅ Vocabulary size: 74
Sample vocab terms: ['advertise', 'amazing', 'anyone', 'available', 'bad', 'bit', 'break', 'build', 'buy', 'buying', 'cheap', 'customer', 'decent', 'defective', 'definitely']


In [30]:
V = len(vocab)
TF = np.zeros((N, V))

# Term Frequency
for i, lem_list in enumerate(df['lemmas']):
    counts = {}
    for t in lem_list:
        counts[t] = counts.get(t, 0) + 1
    for term, c in counts.items():
        idx = term_to_idx[term]
        TF[i, idx] = 1.0 + np.log(c + 1.0)

# Inverse Document Frequency
idf = np.log(N / (1 + df_counts)) + 1.0

# TF-IDF
TFIDF_raw = TF * idf[np.newaxis, :]

# Normalize (L2)
from numpy.linalg import norm
TFIDF = TFIDF_raw.copy()
for i in range(N):
    d = norm(TFIDF_raw[i])
    if d > 0:
        TFIDF[i] = TFIDF_raw[i] / d

print("✅ TF-IDF matrix ready:", TFIDF.shape)


✅ TF-IDF matrix ready: (20, 74)


In [31]:
def top_k_terms(tfidf_row, k=10):
    idxs = np.argsort(tfidf_row)[::-1][:k]
    return [(vocab[i], float(tfidf_row[i])) for i in idxs if tfidf_row[i] > 0]

for i in range(5):
    print(f"\nReview {i}:")
    print(top_k_terms(TFIDF[i], k=10))



Review 0:
[('loved', 0.5165220882139961), ('pricey', 0.5165220882139961), ('bit', 0.5165220882139961), ('great', 0.40811433517775597), ('product', 0.18181717901116096)]

Review 1:
[('bad', 0.5067964293540973), ('anyone', 0.5067964293540973), ('ever', 0.5067964293540973), ('recommend', 0.44457599801599057), ('product', 0.17839372065714162)]

Review 2:
[('major', 0.43402080023458084), ('satisfactory', 0.43402080023458084), ('issue', 0.43402080023458084), ('quality', 0.3807351813230127), ('work', 0.3807351813230127), ('expect', 0.3807351813230127)]

Review 3:
[('amazing', 0.6302254388004296), ('buy', 0.5528513762159981), ('would', 0.49795360515460996), ('product', 0.22184106747491508)]

Review 4:
[('slow', 0.6127980527597108), ('good', 0.5375635859059185), ('delivery', 0.5375635859059185), ('product', 0.2157065802191654)]


In [32]:
outdir = r"C:\Users\vedha\OneDrive\Desktop\Assignment - 5.1\outputs"
os.makedirs(outdir, exist_ok=True)

df.to_csv(os.path.join(outdir, "product_reviews_preprocessed.csv"), index=False, encoding="utf-8")

pd.DataFrame(TFIDF, columns=vocab).to_csv(
    os.path.join(outdir, "tfidf_matrix.csv"), index=False, encoding="utf-8"
)

print("✅ Files saved to:", outdir)


✅ Files saved to: C:\Users\vedha\OneDrive\Desktop\Assignment - 5.1\outputs
