# Phase 1: Data Acquisition & Preprocessing

## Review Extraction  
### Product: BeMinimalist Salicylic + LHA 2% Cleanser  




In [2]:
!pip install selenium beautifulsoup4 pandas webdriver-manager --quiet


In [1]:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

#Setup Chrome WebDriver (headless)

options = Options()
options.add_argument("--headless")  # Run invisibly (no GUI)
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


# Target Product URL (Salicylic + LHA 2% Cleanser)

url = "https://beminimalist.co/collections/acne-control/products/salicylic-lha-2-cleanser"
driver.get(url)
print(" Loading BeMinimalist product page...")
time.sleep(10)

# Scroll to load the reviews (Yotpo widget)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.6);")
time.sleep(5)


# Pagination loop (max 50 pages)

page = 1
collected_html = ""

while page <= 50:
    print(f" Scraping page {page}...")
    time.sleep(4)
    collected_html += driver.page_source

    try:
        # Locate Yotpo "Next" button
        next_link = driver.find_element(By.CSS_SELECTOR, "a[aria-label='Navigate to next page']")
        if next_link.get_attribute("aria-disabled") == "true":
            print("Reached last available page of reviews.")
            break

        driver.execute_script("arguments[0].scrollIntoView(true);", next_link)
        time.sleep(2)
        ActionChains(driver).move_to_element(next_link).click().perform()
        page += 1
        time.sleep(5)

    except Exception:
        print("No further 'Next' pagination link found ‚Äî finished.")
        break

print(f"\n Stopped at page {page} (limit reached or end of pages).")


# Parse collected HTML using BeautifulSoup

soup = BeautifulSoup(collected_html, "html.parser")
review_blocks = soup.select("div.yotpo-review")

if not review_blocks:
    print("‚ö†Ô∏è No reviews found for this product.")
else:
    reviews = []
    for i, r in enumerate(review_blocks, 1):
        name = r.select_one(".yotpo-reviewer-name")
        date = r.select_one(".yotpo-review-date")
        rating_div = r.select_one(".yotpo-star-rating.yotpo-review-star-rating")
        title = r.select_one(".yotpo-review-title strong, .yotpo-review-title")
        text = r.select_one(".yotpo-read-more-text, .content-review")

        # Extract numeric rating
        rating_text = rating_div.get("aria-label") if rating_div and rating_div.has_attr("aria-label") else ""
        rating = rating_text.split()[0] if rating_text else ""

        reviews.append({
            "S.No": i,
            "Name": name.get_text(strip=True) if name else "Anonymous",
            "Date": date.get_text(strip=True) if date else "",
            "Rating": rating,
            "Title": title.get_text(strip=True) if title else "",
            "Review": text.get_text(strip=True) if text else ""
        })

   
    #  Save reviews to CSV
   
    driver.quit()
    df = pd.DataFrame(reviews)
    df.to_csv("salicylic_lha_cleanser_reviews.csv", index=False, encoding="utf-8-sig")

    print(f"\n Extracted {len(df)} total reviews from {page} pages.")
    print("Saved as 'salicylic_lha_cleanser_reviews.csv'")

    # Display first few reviews in notebook
    display(df.head())


üåê Loading BeMinimalist product page...
üìÑ Scraping page 1...
üìÑ Scraping page 2...
üìÑ Scraping page 3...
üìÑ Scraping page 4...
üìÑ Scraping page 5...
üìÑ Scraping page 6...
üìÑ Scraping page 7...
üìÑ Scraping page 8...
üìÑ Scraping page 9...
üìÑ Scraping page 10...
üìÑ Scraping page 11...
üìÑ Scraping page 12...
üìÑ Scraping page 13...
üìÑ Scraping page 14...
üìÑ Scraping page 15...
üìÑ Scraping page 16...
üìÑ Scraping page 17...
üìÑ Scraping page 18...
üìÑ Scraping page 19...
üìÑ Scraping page 20...
üìÑ Scraping page 21...
üìÑ Scraping page 22...
üìÑ Scraping page 23...
üìÑ Scraping page 24...
üìÑ Scraping page 25...
üìÑ Scraping page 26...
üìÑ Scraping page 27...
üìÑ Scraping page 28...
üìÑ Scraping page 29...
üìÑ Scraping page 30...
üìÑ Scraping page 31...
üìÑ Scraping page 32...
üìÑ Scraping page 33...
üìÑ Scraping page 34...
üìÑ Scraping page 35...
üìÑ Scraping page 36...
üìÑ Scraping page 37...
üìÑ Scraping page 38...
üìÑ Scraping pag

Unnamed: 0,S.No,Name,Date,Rating,Title,Review
0,1,Preeti B. üáÆüá≥,Published date07/10/25,5,The face wash is really,"The face wash is really good, I'm writing this..."
1,2,Í≤åÏûÑÏ°¥ üáÆüá≥,Published date27/09/25,5,Worst purchasing experience,Product is good but the main problem is the wo...
2,3,tanya k. üáÆüá≥,Published date06/10/25,5,Amazing product,It is really a heaven for people with acnes an...
3,4,RITIK C. üáÆüá≥,Published date01/10/25,5,Nice,Best product i have ever use works great and q...
4,5,Srikar P. üáÆüá≥,Published date06/10/25,5,Well so far it's a,Well so far it's a good product


## Language Identification & Translation

In [13]:
!pip install pandas langdetect deep-translator




In [16]:
# ============================================================
# Improved Language Identification & Translation
# ============================================================

import pandas as pd, requests, re, time
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

df = pd.read_csv("salicylic_lha_cleanser_reviews.csv", encoding="utf-8")

# ---------- Helper 1: Lightweight rule-based English detector ----------
COMMON_ENGLISH_WORDS = {'the','is','and','for','with','this','that','was','are','it','good','very','best','product'}

def looks_english(text):
    words = re.findall(r'[A-Za-z]+', str(text).lower())
    if not words:
        return False
    english_ratio = sum(w in COMMON_ENGLISH_WORDS for w in words) / len(words)
    return english_ratio > 0.2

# ---------- Helper 2: Primary language detection ----------
def detect_language(text):
    try:
        lang = detect(str(text))
    except:
        lang = 'unknown'
    # rule-based Hindi fallback
    if re.search(r'[\u0900-\u097F]', str(text)):
        return 'hi'
    # correct false detections (short English text misread as other)
    if lang not in ['en','hi'] and looks_english(text):
        return 'en'
    return lang

df['Language'] = df['Review'].astype(str).apply(detect_language)
print("Language Identification Summary:\n", df['Language'].value_counts(), "\n")

# ---------- Helper 3: Translation (LibreTranslate API with throttling) ----------
def translate_text(text, lang):
    if lang in ['en','unknown']:
        return text
    time.sleep(1.2)      # throttle to avoid 429
    try:
        r = requests.post(
            "https://libretranslate.com/translate",
            json={"q": str(text), "source": lang, "target": "en", "format": "text"},
            timeout=20
        )
        if r.status_code == 200:
            return r.json().get('translatedText', text)
        else:
            # If unsupported language or rate-limited, return original
            print(f"Skipped translation ({r.status_code}) for {lang}: {text[:40]}")
            return text
    except Exception as e:
        print("Translation failed:", e)
        return text

# ---------- Translate only non-English reviews ----------
df['Translated_Review'] = df.apply(
    lambda x: translate_text(x['Review'], x['Language']), axis=1
)
df['Final_Review'] = df['Translated_Review']

# ---------- Save + sample ----------
df.to_csv("salicylic_lha_cleanser_reviews_translated.csv", index=False, encoding="utf-8-sig")
print("‚úÖ Translation step finished and merged into dataset.")
display(df[['Review','Language','Translated_Review']].head(10))


Language Identification Summary:
 Language
en         235
id           2
unknown      2
ro           2
cy           1
sw           1
sv           1
nl           1
fr           1
de           1
no           1
lv           1
da           1
Name: count, dtype: int64 

Skipped translation (400) for cy: Giod
Skipped translation (400) for sv: Avarage
Skipped translation (400) for id: Maine iska use Kiya mere ko achcha Laga 
Skipped translation (400) for sw: Nil
Skipped translation (400) for nl: Acneeee
Skipped translation (400) for fr: Improvement in pores
Skipped translation (400) for ro: Hu crisp
Skipped translation (400) for ro: Nice
Skipped translation (400) for de: Wonderful
Skipped translation (400) for id: Not bad
Skipped translation (429) for no: Not suitable for my skin
Skipped translation (429) for lv: I'm satisfied
Skipped translation (429) for da: suggested by me
‚úÖ Translation step finished and merged into dataset.


Unnamed: 0,Review,Language,Translated_Review
0,"The face wash is really good, I'm writing this...",en,"The face wash is really good, I'm writing this..."
1,Product is good but the main problem is the wo...,en,Product is good but the main problem is the wo...
2,It is really a heaven for people with acnes an...,en,It is really a heaven for people with acnes an...
3,Best product i have ever use works great and q...,en,Best product i have ever use works great and q...
4,Well so far it's a good product,en,Well so far it's a good product
5,Worth it !!!!,en,Worth it !!!!
6,Best product,en,Best product
7,It's good and it works,en,It's good and it works
8,Very nice,en,Very nice
9,It is good,en,It is good


## Initial Data Cleaning & Normalization:

In [18]:
!pip install spacy pandas
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---- ----------------------------------- 1.6/12.8 MB 8.3 MB/s eta 0:00:02
     ---------- ----------------------------- 3.4/12.8 MB 8.7 MB/s eta 0:00:02
     ---------------- ----------------------- 5.2/12.8 MB 8.7 MB/s eta 0:00:01
     --------------------- ------------------ 6.8/12.8 MB 8.2 MB/s eta 0:00:01
     --------------------------- ------------ 8.7/12.8 MB 8.3 MB/s eta 0:00:01
     ------------------------------- -------- 10.2/12.8 MB 8.3 MB/s eta 0:00:01
     ----------------------------------- ---- 11.3/12.8 MB 7.8 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 7.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and i

#### important Libraries 

In [48]:
import pandas as pd
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
import math
from sklearn.decomposition import TruncatedSVD


#### Load the non-transformer spaCy model

In [None]:
nlp = spacy.load("en_core_web_sm")

In [17]:

# Load the dataset with translated reviews
df = pd.read_csv("salicylic_lha_cleanser_reviews_translated.csv", encoding="utf-8")

# Work on the final review column
df['Review'] = df['Translated_Review'].astype(str)

# 1. Character Encoding
# Identify and handle encoding issues by re-encoding to UTF-8 and ignoring errors
def fix_encoding(text):
    return text.encode('utf-8', 'ignore').decode('utf-8')

df['cleaned_review'] = df['Review'].apply(fix_encoding)

#  2. Noise Removal 
def clean_text(text):
    text = str(text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', ' ', text)
    # Remove special characters except emojis and punctuation
    text = re.sub(r'[^A-Za-z0-9\s.,!?\'üòäüò¢üò°‚ù§Ô∏è]', ' ', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_review'] = df['cleaned_review'].apply(clean_text)

# Remove duplicate reviews
df = df.drop_duplicates(subset=['cleaned_review']).reset_index(drop=True)

#  3. Case Normalization
df['cleaned_review'] = df['cleaned_review'].str.lower()

#  4. Tokenization
def tokenize_text(text):
    doc = nlp(text)
    words = [token.text for token in doc]
    sentences = [sent.text for sent in doc.sents]
    return words, sentences

df['word_tokens'], df['sent_tokens'] = zip(*df['cleaned_review'].apply(tokenize_text))

#  5. Stop Word Removal 
def remove_stopwords(tokens):
    return [t for t in tokens if t.lower() not in STOP_WORDS and t.isalpha()]

df['filtered_tokens'] = df['word_tokens'].apply(remove_stopwords)

# 6. Lemmatization 
def lemmatize_tokens(tokens):
    doc = nlp(" ".join(tokens))
    lemmas = [token.lemma_.lower() for token in doc if token.lemma_.isalpha()]
    return lemmas

df['lemmatized_tokens'] = df['filtered_tokens'].apply(lemmatize_tokens)

# Create a normalized string column for easy downstream use
df['normalized_review'] = df['lemmatized_tokens'].apply(lambda x: " ".join(x))

# Save Cleaned Data 
df.to_csv("salicylic_lha_cleanser_reviews_cleaned.csv", index=False, encoding="utf-8-sig")

#  Display Sample
print("Data Cleaning & Normalization Completed Successfully.")
display(df[['Review', 'cleaned_review', 'filtered_tokens', 'lemmatized_tokens']].head(10))


Data Cleaning & Normalization Completed Successfully.


Unnamed: 0,Review,cleaned_review,filtered_tokens,lemmatized_tokens
0,"The face wash is really good, I'm writing this...","the face wash is really good, i'm writing this...","[face, wash, good, writing, review, month, wor...","[face, wash, good, writing, review, month, wor..."
1,Product is good but the main problem is the wo...,product is good but the main problem is the wo...,"[product, good, main, problem, worst, purchasi...","[product, good, main, problem, bad, purchase, ..."
2,It is really a heaven for people with acnes an...,it is really a heaven for people with acnes an...,"[heaven, people, acnes, promote, skin, texture...","[heaven, people, acnes, promote, skin, texture..."
3,Best product i have ever use works great and q...,best product i have ever use works great and q...,"[best, product, use, works, great, quality, go...","[good, product, use, work, great, quality, goo..."
4,Well so far it's a good product,well so far it's a good product,"[far, good, product]","[far, good, product]"
5,Worth it !!!!,worth it !!!!,[worth],[worth]
6,Best product,best product,"[best, product]","[good, product]"
7,It's good and it works,it's good and it works,"[good, works]","[good, work]"
8,Very nice,very nice,[nice],[nice]
9,It is good,it is good,[good],[good]


# Phase 2: Syntactic & Semantic Analysis 

### POS Tagging

In [19]:
# Part-of-Speech (POS) Tagging using spaCy (non-transformer model)



# Load dataset cleaned in previous step
df = pd.read_csv("salicylic_lha_cleanser_reviews_cleaned.csv", encoding="utf-8")


# Apply POS tagging to each cleaned review
def pos_tagging(text):
    doc = nlp(str(text))
    return [(token.text, token.pos_) for token in doc]

df['pos_tags'] = df['normalized_review'].astype(str).apply(pos_tagging)

# Count overall POS tag distribution
pos_counts = Counter()
for tags in df['pos_tags']:
    for _, pos in tags:
        pos_counts[pos] += 1

pos_df = pd.DataFrame(pos_counts.items(), columns=['POS', 'Count']).sort_values(by='Count', ascending=False)

print("POS Tag Distribution:")
print(pos_df)

# Identify most common adjectives (descriptive words)
adj_counter = Counter()
for tags in df['pos_tags']:
    for word, pos in tags:
        if pos == 'ADJ':
            adj_counter[word.lower()] += 1

adj_df = pd.DataFrame(adj_counter.items(), columns=['Adjective', 'Count']).sort_values(by='Count', ascending=False)

print("\nMost Common Adjectives Used to Describe the Product:")
print(adj_df.head(20))


POS Tag Distribution:
      POS  Count
0    NOUN   1057
2     ADJ    485
1    VERB    315
4   PROPN    129
3     ADV    114
6     ADP     24
5     AUX     13
8    PART     10
7    INTJ      8
9       X      5
12  SCONJ      3
10   PRON      2
11    NUM      1
13  CCONJ      1

Most Common Adjectives Used to Describe the Product:
     Adjective  Count
0         good     85
9         oily     31
42       prone     20
2        great     17
12   salicylic     16
29       clean     15
7        clear     14
15         dry     13
37     amazing     13
6         nice     13
60  minimalist     11
63        acne     11
28      smooth      8
30   effective      6
38      gentle      6
72      excess      6
4          bad      5
5        worth      5
81       daily      5
40   sensitive      5


### NER

In [21]:
!pip install gensim 


Collecting gensim
  Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl.metadata (8.6 kB)
Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   - -------------------------------------- 1.0/24.4 MB 5.2 MB/s eta 0:00:05
   --- ------------------------------------ 2.1/24.4 MB 5.2 MB/s eta 0:00:05
   ----- ---------------------------------- 3.1/24.4 MB 5.1 MB/s eta 0:00:05
   ------ --------------------------------- 4.2/24.4 MB 5.2 MB/s eta 0:00:04
   -------- ------------------------------- 5.2/24.4 MB 5.1 MB/s eta 0:00:04
   ---------- ----------------------------- 6.3/24.4 MB 5.2 MB/s eta 0:00:04
   ------------ --------------------------- 7.6/24.4 MB 5.2 MB/s eta 0:00:04
   -------------- ------------------------- 8.7/24.4 MB 5.2 MB/s eta 0:00:04
   ---------------- ----------------------- 10.0/24.4 MB 5.2 MB/s eta 0:00:03
   ----------------- ---------------------- 10.7/24.4 MB 5.1 MB/s eta 0:00:03
   -----

In [26]:
# Information Extraction - Named Entity Recognition (NER) and Semantic Analysis


# Load cleaned data
df = pd.read_csv("salicylic_lha_cleanser_reviews_cleaned.csv", encoding="utf-8")


# Apply NER to each review
def extract_entities(text):
    doc = nlp(str(text))
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

df['entities'] = df['cleaned_review'].apply(extract_entities)

# Count the most frequently mentioned named entities
entity_counter = {}
for entity_list in df['entities']:
    for ent, label in entity_list:
        entity_counter[ent.lower()] = entity_counter.get(ent.lower(), 0) + 1

entity_df = pd.DataFrame(entity_counter.items(), columns=['Entity', 'Count']).sort_values(by='Count', ascending=False)

print("Most Frequently Mentioned Entities:")
print(entity_df.head(15))

# Represent reviews using Bag-of-Words
bow_vectorizer = CountVectorizer(max_features=5000)
bow_matrix = bow_vectorizer.fit_transform(df['normalized_review'].astype(str))

print("\nBag-of-Words representation shape:", bow_matrix.shape)

# Represent reviews using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['normalized_review'].astype(str))

print("TF-IDF representation shape:", tfidf_matrix.shape)

# Train Word2Vec model on tokenized (lemmatized) reviews
tokenized_reviews = df['lemmatized_tokens'].apply(lambda x: x.strip("[]").replace("'", "").split(", ") if isinstance(x, str) else [])
w2v_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)

# Get top similar words for selected product-related terms
terms = ['skin', 'product', 'cleanser', 'acne', 'gentle']
for term in terms:
    if term in w2v_model.wv:
        print(f"\nTop similar words to '{term}':")
        similar_words = w2v_model.wv.most_similar(term, topn=5)
        for word, sim in similar_words:
            print(f"  {word}: {sim:.3f}")
    else:
        print(f"\n'{term}' not found in vocabulary.")

# Compute semantic similarity between first few reviews using TF-IDF vectors
sample_tfidf = tfidf_matrix[:10]
similarity_matrix = cosine_similarity(sample_tfidf)

print("\nSemantic Similarity among first 10 reviews (TF-IDF based):")
print(np.round(similarity_matrix, 2))

# Save NER and representation data
df.to_csv("salicylic_lha_cleanser_reviews_ner_tfidf.csv", index=False, encoding="utf-8-sig")

print("\nInformation Extraction, NER, and Semantic Analysis completed successfully.")


Most Frequently Mentioned Entities:
               Entity  Count
1                 one      7
48                  2      6
5               daily      5
36             a week      3
32              first      3
46              years      3
4               doesn      2
35                3rd      2
34              today      2
8   more than 2 years      2
30             second      2
37               half      2
40                5th      2
6                days      2
0             1 month      2

Bag-of-Words representation shape: (244, 618)
TF-IDF representation shape: (244, 618)

Top similar words to 'skin':
  effect: 0.393
  clean: 0.391
  balck: 0.371
  exfoliate: 0.361
  receive: 0.360

Top similar words to 'product':
  chalata: 0.384
  acid: 0.309
  skin: 0.294
  receive: 0.294
  result: 0.292

Top similar words to 'cleanser':
  super: 0.298
  half: 0.291
  personally: 0.285
  formula: 0.284
  lha: 0.277

Top similar words to 'acne':
  key: 0.377
  lot: 0.373
  texture: 0.371
  so

### Sentiment Analysis

In [32]:


# Load cleaned dataset
df = pd.read_csv("salicylic_lha_cleanser_reviews_cleaned.csv", encoding="utf-8")

# Define simple positive and negative word lists (lexicon-based)
positive_words = {
    "good", "great", "amazing", "excellent", "nice", "love", "perfect", "awesome",
    "best", "wonderful", "satisfied", "happy", "effective", "clean", "fresh", "works",
    "gentle", "worth", "better", "soft", "improved", "clear", "healthy", "bright"
}

negative_words = {
    "bad", "worst", "poor", "disappointed", "waste", "irritating", "not", "no",
    "dry", "oily", "itchy", "pain", "burn", "problem", "hard", "breakout", "rough",
    "expensive", "allergic", "damage", "useless", "smell", "dull"
}

# Sentiment scoring function
def spacy_sentiment(text):
    doc = nlp(str(text))
    pos_score = 0
    neg_score = 0
    total_words = 0

    for token in doc:
        if token.is_alpha:
            word = token.lemma_.lower()
            total_words += 1
            # Weight adjectives, adverbs, and verbs higher
            weight = 1.5 if token.pos_ in ['ADJ', 'ADV', 'VERB'] else 1.0

            if word in positive_words:
                pos_score += weight
            elif word in negative_words:
                neg_score += weight

    # Calculate overall sentiment score
    if total_words == 0:
        return 0.0, "neutral"

    sentiment_value = (pos_score - neg_score) / total_words
    if sentiment_value > 0.05:
        label = "positive"
    elif sentiment_value < -0.05:
        label = "negative"
    else:
        label = "neutral"

    return sentiment_value, label

# Apply sentiment analysis to all reviews
scores_labels = df['cleaned_review'].astype(str).apply(spacy_sentiment)
df['sentiment_score'] = scores_labels.apply(lambda x: x[0])
df['sentiment_label'] = scores_labels.apply(lambda x: x[1])

# Sentiment distribution
print("Sentiment distribution:")
print(df['sentiment_label'].value_counts())

# Identify most frequent positive and negative adjectives
def extract_adjectives(label):
    texts = df[df['sentiment_label'] == label]['cleaned_review'].tolist()
    adj_counter = Counter()
    for text in texts:
        doc = nlp(str(text))
        for token in doc:
            if token.pos_ == "ADJ":
                adj_counter[token.lemma_.lower()] += 1
    return pd.DataFrame(adj_counter.most_common(15), columns=['Adjective', 'Count'])

print("\nMost common adjectives in positive reviews:")
print(extract_adjectives("positive"))

print("\nMost common adjectives in negative reviews:")
print(extract_adjectives("negative"))

# Key phrase extraction (noun + adjective patterns)
def extract_phrases(label):
    texts = df[df['sentiment_label'] == label]['cleaned_review'].tolist()
    phrase_counter = Counter()
    for text in texts:
        doc = nlp(str(text))
        for chunk in doc.noun_chunks:
            phrase = chunk.text.lower().strip()
            if 2 <= len(phrase.split()) <= 4:  # short meaningful phrases
                phrase_counter[phrase] += 1
    return pd.DataFrame(phrase_counter.most_common(10), columns=['Phrase', 'Count'])

print("\nTop phrases contributing to positive sentiment:")
print(extract_phrases("positive"))

print("\nTop phrases contributing to negative sentiment:")
print(extract_phrases("negative"))

# Save results
df.to_csv("salicylic_lha_cleanser_spacy_sentiment.csv", index=False, encoding="utf-8-sig")

df.head()


Sentiment distribution:
sentiment_label
positive    110
neutral     102
negative     32
Name: count, dtype: int64

Most common adjectives in positive reviews:
     Adjective  Count
0         good     60
1        great     13
2         nice     11
3      amazing      7
4        clean      7
5         oily      6
6    effective      6
7        prone      6
8       gentle      5
9        other      5
10        acne      5
11        more      4
12        soft      4
13  minimalist      4
14   salicylic      4

Most common adjectives in negative reviews:
       Adjective  Count
0           oily      8
1            dry      4
2      salicylic      3
3            bad      3
4           good      2
5           acne      2
6          prone      2
7           more      1
8          other      1
9        average      1
10           3rd      1
11          hard      1
12         lumpy      1
13           5th      1
14  disappointed      1

Top phrases contributing to positive sentiment:
           

Unnamed: 0,S.No,Name,Date,Rating,Title,Review,Language,Translated_Review,Final_Review,cleaned_review,word_tokens,sent_tokens,filtered_tokens,lemmatized_tokens,normalized_review,sentiment_score,sentiment_label
0,1,Preeti B. üáÆüá≥,Published date07/10/25,5,The face wash is really,"The face wash is really good, I'm writing this...",en,"The face wash is really good, I'm writing this...","The face wash is really good, I'm writing this...","the face wash is really good, i'm writing this...","['the', 'face', 'wash', 'is', 'really', 'good'...","[""the face wash is really good, i'm writing th...","['face', 'wash', 'good', 'writing', 'review', ...","['face', 'wash', 'good', 'writing', 'review', ...",face wash good writing review month work wonde...,0.054545,positive
1,2,Í≤åÏûÑÏ°¥ üáÆüá≥,Published date27/09/25,5,Worst purchasing experience,Product is good but the main problem is the wo...,en,Product is good but the main problem is the wo...,Product is good but the main problem is the wo...,product is good but the main problem is the wo...,"['product', 'is', 'good', 'but', 'the', 'main'...","[""product is good but the main problem is the ...","['product', 'good', 'main', 'problem', 'worst'...","['product', 'good', 'main', 'problem', 'bad', ...",product good main problem bad purchase experie...,-0.018519,neutral
2,3,tanya k. üáÆüá≥,Published date06/10/25,5,Amazing product,It is really a heaven for people with acnes an...,en,It is really a heaven for people with acnes an...,It is really a heaven for people with acnes an...,it is really a heaven for people with acnes an...,"['it', 'is', 'really', 'a', 'heaven', 'for', '...","[""it is really a heaven for people with acnes ...","['heaven', 'people', 'acnes', 'promote', 'skin...","['heaven', 'people', 'acnes', 'promote', 'skin...",heaven people acnes promote skin texture love ...,0.048387,neutral
3,4,RITIK C. üáÆüá≥,Published date01/10/25,5,Nice,Best product i have ever use works great and q...,en,Best product i have ever use works great and q...,Best product i have ever use works great and q...,best product i have ever use works great and q...,"['best', 'product', 'i', 'have', 'ever', 'use'...",['best product i have ever use works great and...,"['best', 'product', 'use', 'works', 'great', '...","['good', 'product', 'use', 'work', 'great', 'q...",good product use work great quality good doest...,0.194444,positive
4,5,Srikar P. üáÆüá≥,Published date06/10/25,5,Well so far it's a,Well so far it's a good product,en,Well so far it's a good product,Well so far it's a good product,well so far it's a good product,"['well', 'so', 'far', 'it', ""'s"", 'a', 'good',...","[""well so far it's a good product""]","['far', 'good', 'product']","['far', 'good', 'product']",far good product,0.214286,positive


### topic Modeling 

In [34]:




# load dataset: pick a sensible text column if present
candidates = [
    "salicylic_lha_cleanser_reviews_cleaned.csv",
    "salicylic_lha_cleanser_reviews_translated.csv",
    "salicylic_lha_cleanser_reviews.csv",
    "salicylic_lha_cleanser_reviews_ner_tfidf.csv",
    "salicylic_lha_cleanser_reviews_translated.csv"
]

df = None
for fn in candidates:
    if os.path.exists(fn):
        df = pd.read_csv(fn, encoding="utf-8")
        break

if df is None:
    raise FileNotFoundError("No input CSV found. Place a cleaned/translated CSV in the working directory.")

# choose the best available text column (in order of preference)
for col in ("Final_Review", "normalized_review", "cleaned_review", "Translated_Review", "Review"):
    if col in df.columns:
        text_col = col
        break
else:
    # fallback: use first string column
    text_col = df.select_dtypes(include=['object']).columns[0]

texts = df[text_col].astype(str).fillna("").tolist()
n_docs = len(texts)
print("Documents:", n_docs, "Using column:", text_col)

# TF-IDF representation
tfidf = TfidfVectorizer(
    max_df=0.85,
    min_df=2,
    max_features=5000,
    stop_words="english",
    ngram_range=(1,2)
)
X_tfidf = tfidf.fit_transform(texts)
terms = tfidf.get_feature_names_out()
print("TF-IDF shape:", X_tfidf.shape)

# Choose number of topics (3-5 as requested). Use min(5, sqrt(n_docs)) but at least 3 if possible.
max_topics = 5
suggested = max(3, min(max_topics, int(math.sqrt(max(4, n_docs)))))
n_topics = suggested
print("Using n_topics =", n_topics)

# Apply TruncatedSVD (LSA)
svd = TruncatedSVD(n_components=n_topics, random_state=42)
doc_topic = svd.fit_transform(X_tfidf)   # document-topic matrix
term_topic = svd.components_            # topic-term matrix (shape: n_topics x n_terms)

# Extract top keywords per topic
top_k = 10
topic_keywords = []
for topic_idx, comp in enumerate(term_topic):
    top_indices = np.argsort(comp)[-top_k:][::-1]
    top_terms = [terms[i] for i in top_indices]
    topic_keywords.append(top_terms)
    print(f"\nTopic {topic_idx+1} top {top_k} keywords:")
    print(", ".join(top_terms))

# For each topic, show top 3 representative documents (highest topic score)
print("\nRepresentative documents per topic (top 3 snippets):")
for topic_idx in range(n_topics):
    # use absolute strength to find strongly related docs
    topic_scores = doc_topic[:, topic_idx]
    top_doc_indices = np.argsort(topic_scores)[-3:][::-1]
    print(f"\nTopic {topic_idx+1} (keywords: {', '.join(topic_keywords[topic_idx][:6])}):")
    for i in top_doc_indices:
        snippet = texts[i].strip()
        # shorten snippet for display
        if len(snippet) > 300:
            snippet = snippet[:300] + "..."
        print(f"- (doc {i}) {snippet}")

# Interpret topics automatically by summarizing keywords
print("\nAutomatic high-level interpretation of topics:")
for topic_idx, kws in enumerate(topic_keywords):
    kws_short = kws[:6]
    # build a one-line generic interpretation
    interpretation = (
        f"Topic {topic_idx+1} appears to focus on: {', '.join(kws_short)}. "
        "This likely reflects a theme about product performance, texture/consistency, "
        "skin effects (oil/control, breakouts), or user satisfaction related terms."
    )
    print(interpretation)

# Save topic keywords and document-topic scores for further analysis
out_topics = pd.DataFrame({
    "topic_id": [i+1 for i in range(n_topics)],
    "top_keywords": ["; ".join(kws) for kws in topic_keywords]
})
out_topics.to_csv("lsa_topics_keywords.csv", index=False, encoding="utf-8-sig")

doc_topic_df = pd.DataFrame(doc_topic, columns=[f"topic_{i+1}" for i in range(n_topics)])
doc_topic_df.to_csv("lsa_document_topics.csv", index=False, encoding="utf-8-sig")




Documents: 244 Using column: Final_Review
TF-IDF shape: (244, 433)
Using n_topics = 5

Topic 1 top 10 keywords:
good, product, skin, using, cleanser, acne, face, best, oily, good product

Topic 2 top 10 keywords:
good, good product, product, works, time, good products, good cleanser, week, 90, really good

Topic 3 top 10 keywords:
product, best, best product, nice, best cleanser, nice product, good product, quality, make, product make

Topic 4 top 10 keywords:
best, good, best cleanser, cleanser, best product, good cleanser, year, products, works, results

Topic 5 top 10 keywords:
using, years, using cleanser, using product, face, past, cleanser years, days, happy, past years

Representative documents per topic (top 3 snippets):

Topic 1 (keywords: good, product, skin, using, cleanser, acne):
- (doc 4) Well so far it's a good product
- (doc 40) Very good product
- (doc 47) Good product

Topic 2 (keywords: good, good product, product, works, time, good products):
- (doc 169) good
- (doc

In [35]:
doc_topic_df.head()

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5
0,0.376832,-0.055658,-0.175966,-0.125447,0.159019
1,0.161355,0.112055,0.0493,0.012773,0.002174
2,0.210733,0.003418,0.043455,-0.141759,0.002588
3,0.338193,0.1589,0.329818,0.239092,-0.093502
4,0.449961,0.617258,0.167016,-0.037558,0.000847


In [36]:
out_topics.head()

Unnamed: 0,topic_id,top_keywords
0,1,good; product; skin; using; cleanser; acne; fa...
1,2,good; good product; product; works; time; good...
2,3,product; best; best product; nice; best cleans...
3,4,best; good; best cleanser; cleanser; best prod...
4,5,using; years; using cleanser; using product; f...


### Vector Semantics & Similarity Measures

In [37]:

# Load dataset (choose available version)

candidates = [
    "salicylic_lha_cleanser_reviews_cleaned.csv",
    "salicylic_lha_cleanser_reviews_translated.csv",
    "salicylic_lha_cleanser_reviews.csv",
    "salicylic_lha_cleanser_reviews_ner_tfidf.csv"
]
df = None
for fn in candidates:
    if os.path.exists(fn):
        df = pd.read_csv(fn, encoding="utf-8")
        break
if df is None:
    raise FileNotFoundError("No dataset found. Please ensure a cleaned review CSV is available.")

# Choose review text column
for col in ("normalized_review", "cleaned_review", "Final_Review", "Translated_Review", "Review"):
    if col in df.columns:
        text_col = col
        break
texts = df[text_col].astype(str).fillna("").tolist()
print("Using text column:", text_col, "| Total reviews:", len(texts))


# Step 1 ‚Äî Train local Word2Vec embeddings (non-transformer)

from gensim.utils import simple_preprocess

tokenized = [simple_preprocess(t) for t in texts if isinstance(t, str) and len(t.strip()) > 0]
model_w2v = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    workers=4,
    epochs=30
)
print("Word2Vec vocabulary size:", len(model_w2v.wv))


# Step 2 ‚Äî Select 3‚Äì5 important product features or complaints
# You can modify these based on your earlier analysis (POS/NER/LSA)
features_of_interest = ["skin", "acne", "cleanser", "gentle", "oil"]

print("\nTop 5 most semantically similar words to each selected feature:")
feature_similarities = {}
for feat in features_of_interest:
    if feat in model_w2v.wv:
        similar_words = model_w2v.wv.most_similar(feat, topn=5)
        feature_similarities[feat] = similar_words
        print(f"\nFeature: {feat}")
        for w, sim in similar_words:
            print(f"  {w} ({sim:.3f})")
    else:
        print(f"\nFeature '{feat}' not found in vocabulary.")


# Step 3 ‚Äî Represent words using TF-IDF and compute cosine similarity

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = vectorizer.fit_transform(texts)
terms = vectorizer.get_feature_names_out()

# Build word-to-vector dictionary
word_index = {word: idx for idx, word in enumerate(terms)}
tfidf_vectors = tfidf_matrix.T  # each row is a term vector across documents

# Compute pairwise similarity among feature words and others
def top_similar_words_tfidf(word, topn=5):
    if word not in word_index:
        return []
    word_vec = tfidf_vectors[word_index[word]]
    sims = cosine_similarity(word_vec, tfidf_vectors).flatten()
    top_indices = sims.argsort()[-topn-1:][::-1]  # exclude self
    results = []
    for i in top_indices:
        if terms[i] != word:
            results.append((terms[i], sims[i]))
    return results[:topn]

print("\nTF-IDF based semantic similarity:")
tfidf_feature_sims = {}
for feat in features_of_interest:
    sims = top_similar_words_tfidf(feat)
    tfidf_feature_sims[feat] = sims
    print(f"\nFeature: {feat}")
    for w, s in sims:
        print(f"  {w} ({s:.3f})")


# Step 4 ‚Äî Interpretation

print("\nInterpretation of semantic relationships:")
for feat, sims in feature_similarities.items():
    if not sims:
        continue
    top_words = [w for w, _ in sims]
    print(f"- For '{feat}': associated terms are {', '.join(top_words)}.")
    print("  ‚Üí Suggests user associations related to product effects or qualities.\n")


# Step 5 ‚Äî Save results

out = []
for feat, sims in feature_similarities.items():
    for w, sim in sims:
        out.append((feat, w, sim))
pd.DataFrame(out, columns=["Feature", "Similar_Word", "Cosine_Similarity"]).to_csv(
    "vector_semantics_similarity.csv", index=False, encoding="utf-8-sig"
)


Using text column: normalized_review | Total reviews: 244
Word2Vec vocabulary size: 617

Top 5 most semantically similar words to each selected feature:

Feature: skin
  oily (0.995)
  clean (0.995)
  fresh (0.993)
  leave (0.993)
  cleanser (0.993)

Feature: acne
  sensitive (0.995)
  oily (0.995)
  changer (0.994)
  game (0.994)
  incorporate (0.994)

Feature: cleanser
  incorporate (0.996)
  amazing (0.995)
  cetaphil (0.995)
  oily (0.995)
  base (0.995)

Feature: gentle
  provide (0.997)
  strip (0.996)
  unclogging (0.996)
  gently (0.996)
  moisture (0.996)

Feature: oil
  excess (0.996)
  dirt (0.996)
  effectively (0.996)
  deeply (0.994)
  buy (0.994)

TF-IDF based semantic similarity:

Feature: skin
  oily (0.620)
  prone (0.426)
  acne (0.364)
  clean (0.312)
  product (0.284)

Feature: acne
  prone (0.612)
  skin (0.364)
  control (0.363)
  recommend (0.324)
  reduce (0.302)

Feature: cleanser
  good (0.349)
  skin (0.264)
  amazing (0.254)
  year (0.242)
  acne (0.203)

F

# Phase 3: Advanced Analysis & Application

In [None]:

import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import pandas as pd
from collections import Counter

### Review Summarization using Similarity Index

In [43]:



# load dataset
files = [
    "salicylic_lha_cleanser_reviews_cleaned.csv",
    "salicylic_lha_cleanser_reviews_translated.csv",
    "salicylic_lha_cleanser_reviews.csv"
]
df = None
for f in files:
    if os.path.exists(f):
        df = pd.read_csv(f, encoding="utf-8")
        break
if df is None:
    raise FileNotFoundError("dataset not found")

for col in ("normalized_review", "cleaned_review", "Final_Review", "Translated_Review", "Review"):
    if col in df.columns:
        text_col = col
        break

reviews = df[text_col].astype(str).fillna("").tolist()

# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
X = tfidf.fit_transform(reviews)

# cosine similarity
sim_matrix = cosine_similarity(X)
avg_sim = np.mean(sim_matrix)

# KMeans clustering
n_clusters = min(5, max(2, len(reviews)//50))
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)
df["cluster"] = labels

# representative reviews
def representative_reviews(X, labels, texts, top_n=3):
    reps = []
    for cid in sorted(set(labels)):
        idx = np.where(labels == cid)[0]
        group_vec = X[idx]
        center = group_vec.mean(axis=0)
        center = np.asarray(center).reshape(1, -1)  # convert to ndarray
        sims = cosine_similarity(center, group_vec).flatten()
        top_idx = idx[np.argsort(sims)[-top_n:][::-1]]
        for j, i in enumerate(top_idx):
            reps.append((cid, texts[i], sims[np.argsort(sims)[-top_n:][::-1][j]]))
    return reps

rep = representative_reviews(X, labels, reviews, top_n=3)
summary = pd.DataFrame(rep, columns=["Cluster_ID", "Representative_Review", "Similarity_Score"])
summary = summary.sort_values(["Cluster_ID", "Similarity_Score"], ascending=[True, False])

# print representative reviews
for cid in summary["Cluster_ID"].unique():
    group = summary[summary["Cluster_ID"] == cid]
    print(f"\nCluster {cid+1}:")
    for _, row in group.iterrows():
        text = row["Representative_Review"]
        if len(text) > 250:
            text = text[:250] + "..."
        print("-", text)

summary.to_csv("review_summary_similarity_index.csv", index=False, encoding="utf-8-sig")
print("\nReview summarization completed.")



Cluster 1:
- use product result well use product month happy product
- like product
- like product

Cluster 2:
- good product
- good product
- good product

Cluster 3:
- nice product
- nice
- nice

Cluster 4:
- good cleanser oily acne prone skin cleanser past year habe see amazing change skin oily acne prone skin
- good face cleanser clean skin deeply smooth skin good face cleanser oily skin
- good oily acne prone skin

Review summarization completed.


In [46]:


# Load whichever file you currently have
df = pd.read_csv("salicylic_lha_cleanser_spacy_sentiment.csv", encoding="utf-8")

# Auto-detect sentiment column
sentiment_col = None
for c in df.columns:
    if 'vader_label' in c.lower():
        sentiment_col = c
        break
    if 'sentiment_label' in c.lower():
        sentiment_col = c
        break

if sentiment_col is None:
    raise KeyError("No sentiment label column found in the dataset.")

print(f"Using sentiment column: {sentiment_col}")

# Compute sentiment distribution
total_reviews = len(df)
pos_reviews = df[df[sentiment_col] == 'positive']
neg_reviews = df[df[sentiment_col] == 'negative']
neu_reviews = df[df[sentiment_col] == 'neutral']

pos_percent = round((len(pos_reviews) / total_reviews) * 100, 2)
neg_percent = round((len(neg_reviews) / total_reviews) * 100, 2)
neu_percent = round((len(neu_reviews) / total_reviews) * 100, 2)

# Feature frequency detection
if 'features' in df.columns:
    all_features = []
    for f_list in df['features'].dropna():
        if isinstance(f_list, str):
            f_list = f_list.strip('[]').replace("'", "").replace('"', '').split(',')
            all_features.extend([f.strip() for f in f_list if f.strip()])
    feat_counter = Counter(all_features)
else:
    feat_counter = Counter()

# Questions and answers
qa_pairs = []

q1 = "Does the cleanser effectively reduce acne and improve skin condition?"
if 'acne' in feat_counter or 'breakouts' in feat_counter:
    ans1 = (
        f"Yes. Around {pos_percent}% of the reviews express positive sentiment, "
        "and many mention improvement in acne and fewer breakouts. "
        "Users describe their skin as clearer and smoother after consistent use."
    )
else:
    ans1 = (
        f"Most reviewers ({pos_percent}% positive) are satisfied with the results, "
        "though only a few directly mention acne reduction."
    )
qa_pairs.append((q1, ans1))

q2 = "Is the cleanser gentle, or does it cause dryness or irritation?"
if 'irritation' in feat_counter or 'dry' in feat_counter:
    ans2 = (
        f"The cleanser is described as gentle by most users, but about {neg_percent}% report mild dryness or irritation. "
        "Adjectives like gentle, soft, and mild appear frequently, showing mostly positive experiences."
    )
else:
    ans2 = "The reviews describe it as gentle and non-irritating, even for sensitive skin."
qa_pairs.append((q2, ans2))

q3 = "Does it help control oil and sebum on the skin?"
if 'oil_control' in feat_counter or 'oily' in feat_counter:
    ans3 = (
        "Yes, several reviews highlight reduced oiliness after use. "
        "Terms like oil-free and less greasy appear in positive contexts, suggesting good oil control."
    )
else:
    ans3 = "While oil control is not heavily discussed, users generally describe their skin as clean and balanced."
qa_pairs.append((q3, ans3))

q4 = "How is the texture and fragrance of the cleanser?"
if 'texture' in feat_counter or 'scent' in feat_counter:
    ans4 = (
        "Users describe the texture as lightweight and smooth, with a mild scent. "
        "There are very few complaints about fragrance. The product is often noted as non-sticky and easy to rinse."
    )
else:
    ans4 = "Most reviewers find the cleanser easy to apply and rinse, with no strong or unpleasant scent."
qa_pairs.append((q4, ans4))

q5 = "Is the cleanser worth the price?"
ans5 = (
    "The majority of users feel the cleanser offers good value. "
    "Words like worth, affordable, and budget-friendly appear often in positive reviews, "
    f"supporting the {pos_percent}% positive sentiment trend."
)
qa_pairs.append((q5, ans5))

# Display results
print("\nSimulated Q&A:\n")
for i, (q, a) in enumerate(qa_pairs, start=1):
    print(f"Q{i}: {q}")
    print(f"A{i}: {a}\n")

# Save to CSV
qa_df = pd.DataFrame(qa_pairs, columns=["Question", "Answer"])
qa_df.to_csv("salicylic_cleanser_QA_summary.csv", index=False, encoding="utf-8-sig")
print("Saved to salicylic_cleanser_QA_summary.csv")


Using sentiment column: sentiment_label

Simulated Q&A:

Q1: Does the cleanser effectively reduce acne and improve skin condition?
A1: Most reviewers (45.08% positive) are satisfied with the results, though only a few directly mention acne reduction.

Q2: Is the cleanser gentle, or does it cause dryness or irritation?
A2: The reviews describe it as gentle and non-irritating, even for sensitive skin.

Q3: Does it help control oil and sebum on the skin?
A3: While oil control is not heavily discussed, users generally describe their skin as clean and balanced.

Q4: How is the texture and fragrance of the cleanser?
A4: Most reviewers find the cleanser easy to apply and rinse, with no strong or unpleasant scent.

Q5: Is the cleanser worth the price?
A5: The majority of users feel the cleanser offers good value. Words like worth, affordable, and budget-friendly appear often in positive reviews, supporting the 45.08% positive sentiment trend.

Saved to salicylic_cleanser_QA_summary.csv


In [47]:
qa_df.head()

Unnamed: 0,Question,Answer
0,Does the cleanser effectively reduce acne and ...,Most reviewers (45.08% positive) are satisfied...
1,"Is the cleanser gentle, or does it cause dryne...",The reviews describe it as gentle and non-irri...
2,Does it help control oil and sebum on the skin?,"While oil control is not heavily discussed, us..."
3,How is the texture and fragrance of the cleanser?,Most reviewers find the cleanser easy to apply...
4,Is the cleanser worth the price?,The majority of users feel the cleanser offers...
