In [1]:
import pandas as pd
import numpy as np
import re
import string
from collections import defaultdict, Counter

In [2]:
import nltk
from nltk.corpus import stopwords, sentiwordnet as swn, wordnet
from nltk import word_tokenize, pos_tag

In [15]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from google.colab import files
uploaded = files.upload()

Saving headphone - headphone (1).csv to headphone - headphone (1).csv


In [7]:
df = pd.read_csv('headphone - headphone (1).csv')
df

Unnamed: 0,ReviewBody,ReviewStar
0,No doubt it has a great bass and to a great ex...,3
1,"This earphones are unreliable, i bought it be...",1
2,"i bought itfor 999,I purchased it second time,...",4
3,Its sound quality is adorable. overall it was ...,1
4,Its Awesome... Good sound quality & 8-9 hrs ba...,5
...,...,...
4995,Sound quality is superb..Battery life is good....,5
4996,Nice earphone........ Good bt connectivity. An...,5
4997,Stopped working.... Sound is breaking. Pls gui...,1
4998,Awesome product... worth the price and the sou...,5


In [8]:
stop_words = set(stopwords.words('english'))
negation_words = {"not", "no", "never", "n't"}

# Do NOT remove negation words
stop_words = stop_words - negation_words

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["Review_Clean"] = df["ReviewBody"].apply(clean_text)


In [9]:
def rake_keywords(text):
    tokens = word_tokenize(text)

    phrases = []
    current_phrase = []

    for word in tokens:
        if word in stop_words:
            if current_phrase:
                phrases.append(current_phrase)
                current_phrase = []
        else:
            current_phrase.append(word)

    if current_phrase:
        phrases.append(current_phrase)

    word_freq = Counter()
    word_degree = Counter()

    for phrase in phrases:
        degree = len(phrase) - 1
        for word in phrase:
            word_freq[word] += 1
            word_degree[word] += degree

    word_score = {}
    for word in word_freq:
        word_score[word] = (word_degree[word] + word_freq[word]) / word_freq[word]

    phrase_scores = {}
    for phrase in phrases:
        score = sum(word_score[word] for word in phrase)
        phrase_scores[" ".join(phrase)] = score

    return phrase_scores


In [10]:
def extract_aspects(text, top_k=5):
    phrase_scores = rake_keywords(text)
    sorted_phrases = sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True)
    return [p[0] for p in sorted_phrases[:top_k]]


In [11]:
def adjective_sentiment(word):
    synsets = list(swn.senti_synsets(word, 'a'))  # adjective only
    if not synsets:
        return 0.0

    pos_score = np.mean([s.pos_score() for s in synsets])
    neg_score = np.mean([s.neg_score() for s in synsets])

    return pos_score - neg_score


In [12]:
def handle_negation(tokens, sentiments):
    negated = False
    window = 0
    adjusted = []

    for token, score in zip(tokens, sentiments):
        if token in negation_words:
            negated = True
            window = 3
            adjusted.append(0)
        elif negated and window > 0:
            adjusted.append(-score)
            window -= 1
        else:
            adjusted.append(score)

        if window == 0:
            negated = False

    return adjusted


In [13]:
def aspect_sentiment(review, aspect):
    tokens = word_tokenize(review)
    tagged = pos_tag(tokens)

    sentiments = []
    words = []

    for word, tag in tagged:
        if tag.startswith("JJ"):  # adjective
            sentiment = adjective_sentiment(word)
            sentiments.append(sentiment)
            words.append(word)

    if not sentiments:
        return 0.0

    sentiments = handle_negation(words, sentiments)
    return np.mean(sentiments)


In [16]:
aspect_stats = defaultdict(lambda: {"count": 0, "sentiment": 0.0})

for _, row in df.iterrows():
    aspects = extract_aspects(row["Review_Clean"])

    for aspect in aspects:
        score = aspect_sentiment(row["Review_Clean"], aspect)
        aspect_stats[aspect]["count"] += 1
        aspect_stats[aspect]["sentiment"] += score

final_aspects = []
for aspect, stats in aspect_stats.items():
    final_aspects.append({
        "Aspect": aspect,
        "Frequency": stats["count"],
        "Avg_Sentiment": stats["sentiment"] / stats["count"]
    })

aspect_df = pd.DataFrame(final_aspects)
aspect_df.sort_values(by="Frequency", ascending=False).head(15)


Unnamed: 0,Aspect,Frequency,Avg_Sentiment
9,sound quality,476,0.272178
49,good,460,0.432401
99,product,234,0.154957
2511,good product,143,0.426367
41,battery life,120,0.233261
2550,not,109,0.286549
66,not working,96,-0.055536
2259,sound,92,0.270745
2143,price,91,0.345362
1148,bass,90,0.296628


In [18]:
low_rated = df[df["ReviewStar"] <= 3]

aspect_stats_low = defaultdict(lambda: {"count": 0, "sentiment": 0.0})

for _, row in low_rated.iterrows():
    aspects = extract_aspects(row["Review_Clean"])

    for aspect in aspects:
        score = aspect_sentiment(row["Review_Clean"], aspect)
        aspect_stats_low[aspect]["count"] += 1
        aspect_stats_low[aspect]["sentiment"] += score

weaknesses = []
for aspect, stats in aspect_stats_low.items():
    weaknesses.append({
        "Aspect": aspect,
        "Frequency": stats["count"],
        "Avg_Sentiment": stats["sentiment"] / stats["count"]
    })

weakness_df = pd.DataFrame(weaknesses)
weakness_df.sort_values(by="Avg_Sentiment").head(10)


Unnamed: 0,Aspect,Frequency,Avg_Sentiment
4716,complaintmy headphones get malfunctionend,1,-0.75
5289,monthsvery much disappointed,1,-0.75
4671,joggingearbuds comes,1,-0.6875
5130,uncomfortable,1,-0.6875
4670,uncomfortable pice,1,-0.6875
4672,earphonesnot,1,-0.6875
4135,unsatisfied product,1,-0.6875
4669,ears within minutes,1,-0.6875
5380,pls dont buy,1,-0.642857
4261,bad dont buy,1,-0.642857
