In [None]:
!pip install playwright
!playwright install

╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._

In [None]:
import re
import random
import asyncio
import time
import nest_asyncio
import pandas as pd
from playwright.async_api import async_playwright

# The driver loop further down calls asyncio.get_event_loop().run_until_complete(...)
# from inside the notebook; nest_asyncio is applied once up front to permit that.
nest_asyncio.apply()  # Allowing nested async execution inside Colab

async def get_snapdeal_reviews(url, max_pages=None):
    """Scrape the product name and review texts from one product page.

    Launches headless Chromium, opens ``url``, reads the first ``h1`` as the
    product name and collects the text of every ``.user-review`` element,
    clicking the ``button[aria-label='Next']`` pager while it is visible.

    Args:
        url: Product page URL to open.
        max_pages: Optional upper bound on the number of review pages read.
            ``None`` (default) pages until the "Next" button is not visible,
            exactly as before.

    Returns:
        ``(product_name, reviews)``. ``reviews`` holds each distinct review
        text once, in first-seen order. If anything fails, the error is
        printed and ``(None, [])`` is returned.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        try:
            page = await browser.new_page()
            await page.goto(url, timeout=90000)

            # Extracting product name with error handling
            try:
                product_name = await page.locator("h1").text_content(timeout=60000)
                product_name = product_name.strip() if product_name else "Unknown Product"
            except Exception:
                product_name = "Unknown Product"

            # Waiting for reviews to load
            await page.wait_for_load_state("domcontentloaded", timeout=60000)
            await asyncio.sleep(5)

            next_button = page.locator("button[aria-label='Next']")
            reviews = []
            pages_read = 0
            while True:
                # Extracting reviews from the current page
                reviews.extend(await page.locator(".user-review").all_text_contents())
                pages_read += 1

                # Optional safety bound so a pager that never hides cannot loop forever
                if max_pages is not None and pages_read >= max_pages:
                    break

                # No visible "Next" button means there are no more pages
                if not await next_button.is_visible():
                    break

                await next_button.click()
                await page.wait_for_timeout(5000)  # Wait for the next page to load

            # Removing duplicates while keeping first-seen order, so that any
            # later "first N reviews" slice is reproducible between runs.
            return product_name, list(dict.fromkeys(reviews))

        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
            return None, []

        finally:
            # Single close point for both the success and the failure path
            await browser.close()

# Function to clean reviews
def clean_review(text):
    """Normalise one scraped review string.

    Steps: collapse all whitespace runs (including newlines) to single spaces,
    remove a "by <name> on <Mon> <day>, <year> Verified Buyer" byline, discard
    the review if only a date is left, then remove dates embedded in the text.

    Args:
        text: Raw review text (``str``).

    Returns:
        The cleaned text (possibly an empty string), or ``None`` when the
        review consists of nothing but a date.
    """
    # \s+ already covers newlines, so one pass is enough
    text = re.sub(r'\s+', ' ', text).strip()

    # \b keeps "by" from matching inside a word (e.g. "baby"); \d{1,2} also
    # accepts single-digit days; the lazy quantifier stops at the first byline.
    text = re.sub(r'\bby .*? on \w+ \d{1,2}, \d{4} Verified Buyer', '', text)

    # Strip before the full-match test: removing the byline can leave padding
    # around a bare date, which would otherwise defeat fullmatch.
    text = text.strip()

    # Removing date-only reviews
    if re.fullmatch(r'\d{1,2}-[A-Za-z]{3}-\d{2}|\w{3,9} \d{1,2}, \d{4}', text):
        return None

    # Removing dates that are inside the review
    text = re.sub(r'\b\w{3,9} \d{1,2}, \d{4}\b', '', text)

    return text.strip()

# Product pages to scrape: 50 Snapdeal URLs, one per product.
# Some entries still carry a '#bcrumb...' fragment after the product id.
product_urls = [
"https://www.snapdeal.com/product/arni-3-in-1-large/658713912149#bcrumbSearch:kitchen",
"https://www.snapdeal.com/product/analog-kitchenware-dark-grey-aluminium/686220079278",
"https://www.snapdeal.com/product/sanjana-silk-grey-georgette-saree/8070451156087011442",
"https://www.snapdeal.com/product/sherine-purple-georgette-saree-with/5764608142053716272#bcrumbLabelId:176",
"https://www.snapdeal.com/product/sitanjali-red-satin-saree-single/653147619600",
"https://www.snapdeal.com/product/gazal-fashions-red-banarasi-silk/5188147395679512622#bcrumbLabelId:176",
"https://www.snapdeal.com/product/rage-gaze-pu-brown-casual/632657477424",
"https://www.snapdeal.com/product/samtroh-pu-beige-mens-regular/648790735271#bcrumbLabelId:46139355",
"https://www.snapdeal.com/product/topware-faux-leather-brown-casual/618782211099#bcrumbLabelId:46139355",
"https://www.snapdeal.com/product/sambhav-deals-pu-tan-formal/670993774775#bcrumbLabelId:46139355",
"https://www.snapdeal.com/product/glorious-pink-cotton-aline-kurti/663061347291",
"https://www.snapdeal.com/product/radiksa-turquoise-cotton-straight-kurti/618611089447",
"https://www.snapdeal.com/product/hetsa-rust-cotton-blend-straight/8646911954626999414",
"https://www.snapdeal.com/product/beauty-berry-3-in1-long/632437546114",
"https://www.snapdeal.com/product/comey-cotton-blend-pink-solids/6341068901840999593",
"https://www.snapdeal.com/product/herbalife-200g-personalized-protein-powder/639121239022",
"https://www.snapdeal.com/product/swiss-beauty-liquid-foundation-light/8070451160909091139",
"https://www.snapdeal.com/product/analog-kichenware-utility-knife-3/5764608148853588121",
"https://www.snapdeal.com/product/latest-chikan-cotton-blend-yellow/6917529706562416229#bcrumbLabelId:191",
"https://www.snapdeal.com/product/deshbandhu-dbk-100-percent-cotton/663003928606#bcrumbLabelId:191",
"https://www.snapdeal.com/product/vida-loca-linen-maroon-shirt/5188147447065960051#bcrumbLabelId:191",
"https://www.snapdeal.com/product/campus-terminator-n-blue-running/8070451213417691561#bcrumbLabelId:255",
"https://www.snapdeal.com/product/asian-lifestyle-gray-casual-shoes/6917529696477217208#bcrumbLabelId:255",
"https://www.snapdeal.com/product/allamwar-12pcs-stainless-steel-cookie/5764608156150767097",
"https://www.snapdeal.com/product/2-in-1-soap-pump/646826407477",
"https://www.snapdeal.com/product/stainless-steel-vestire-powerfree-hand/5764608202055287770",
"https://www.snapdeal.com/product/flyfot-kreative-india-plastic-multipurpose/6917529700943501959",
"https://www.snapdeal.com/product/plastic-quick-cutter-vegetable-cutter/4899917027919989123",
"https://www.snapdeal.com/product/shray-32-in-1-interchangeble/683655001564#bcrumbSearch:laptops",
"https://www.snapdeal.com/product/bhawna-collection-loard-shiv-trishul/672311651336",
"https://www.snapdeal.com/product/milton-thermosteel-1000-ml-flask/1383039",
"https://www.snapdeal.com/product/prd-pu-tan-casual-long/634235191285",
"https://www.snapdeal.com/product/tantra-fluke-car-bluetooth-kit/675090421220#bcrumbLabelId:46102495",
"https://www.snapdeal.com/product/dynamic-store-stainless-steel-kitchen/718714840#bcrumbSearch:kitchen",
"https://www.snapdeal.com/product/masala-rangoli-box-dabba-for/648082979993#bcrumbSearch:kitchen",
"https://www.snapdeal.com/product/hometales-polyproplene-food-container-set/6917529686805260778#bcrumbSearch:kitchen",
"https://www.snapdeal.com/product/kitchen-shelf-storage-rack-self/651274312796#bcrumbSearch:kitchen",
"https://www.snapdeal.com/product/14-in-1-push-up/656865898041#bcrumbLabelId:777",
"https://www.snapdeal.com/product/swiss-beauty-face-primer-cream/643865380404#bcrumbLabelId:3711",
"https://www.snapdeal.com/product/jawline-exerciser-tool/674117063799#bcrumbLabelId:777",
"https://www.snapdeal.com/product/boldfit-push-up-bar-stand/644707316809#bcrumbLabelId:777",
"https://www.snapdeal.com/product/double-spring-tummy-trimmer-pro/623249262781#bcrumbLabelId:777",
"https://www.snapdeal.com/product/swiss-beauty-professional-warm-sand/6917529682500675450#bcrumbLabelId:3711",
"https://www.snapdeal.com/product/sweat-belt-hot-shapers-hot/655284657487#bcrumbLabelId:777",
"https://www.snapdeal.com/product/unical-shock-resistant-cable-protector/643117396448#bcrumbSearch:smartphones",
"https://www.snapdeal.com/product/elv-foldable-universal-tablet-phone/5764608203624461851#bcrumbSearch:smartphones",
"https://www.snapdeal.com/product/hybite-premium-selfie-stick-blue/638464915190#bcrumbSearch:smartphones",
"https://www.snapdeal.com/product/nutriley-un-flavoured-mass-gainer/638070292001#bcrumbSearch:mass%20gainer|bcrumbLabelId:46101962",
"https://www.snapdeal.com/product/intimify-kesar-pista-badam-mass/622640821970#bcrumbSearch:whey|bcrumbLabelId:46101962",
"https://www.snapdeal.com/product/london-glow-setting-powder-white/666659763053#bcrumbSearch:protien|bcrumbLabelId:46101962",
]


# Scrape each product in turn and build one record per retained review
all_reviews = []
for url in product_urls:
    product_name, reviews = asyncio.get_event_loop().run_until_complete(
        get_snapdeal_reviews(url)
    )

    # Clean every review, discard the ones that come back empty/None,
    # and keep at most the first 10 for this product
    cleaned_reviews = [cleaned for cleaned in map(clean_review, reviews) if cleaned][:10]

    all_reviews.extend(
        {"Product Name": product_name, "Reviews": review}
        for review in cleaned_reviews
    )

    # Random pause between products to avoid rate-limiting
    time.sleep(random.uniform(5, 15))

# Persist the collected reviews (one row per review)
csv_filename = "Snapdeal_Product_Reviews.csv"
df = pd.DataFrame(all_reviews)
df.to_csv(csv_filename, index=False, encoding="utf-8-sig")

print(f"✅ Product reviews saved to: {csv_filename}")

# Offer the file as a browser download (Colab only)
from google.colab import files
files.download(csv_filename)


✅ Product reviews saved to: Snapdeal_Product_Reviews.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import re

# Load the existing CSV file containing reviews
df = pd.read_csv("Snapdeal_Product_Reviews.csv")

# Keyword lists for the rule-based usefulness annotation. Each entry is tested as a
# substring of the lower-cased review, so entries are written in lower case.
positive_keywords = ["good", "great", "perfect", "useful", "value", "worth", "working", "excellent", "satisfied", "awesome", "nice", "recommend", "helpful"]
# "not working" used to be listed twice; a single entry is enough for a membership-style check.
negative_keywords = ["bad", "worst", "not working", "poor", "useless", "waste", "broke", "disappointed", "damaged", "cheap", "fake"]

# Function to clean a review value read back from the CSV
def clean_review(text):
    """Lower-case and normalise one review value.

    The value is converted with ``str()`` and lower-cased, whitespace runs are
    collapsed, a "by <name> on <mon> <day>, <year> verified buyer" byline is
    removed, date-only reviews are rejected and embedded dates are removed.

    Args:
        text: Review value; may be a missing value (``None`` or NaN float).

    Returns:
        The cleaned lower-case text (possibly empty), or ``None`` when the
        value is missing or consists of nothing but a date.
    """
    # Missing cells arrive as None/NaN; str() would turn them into "none"/"nan"
    if text is None or (isinstance(text, float) and text != text):
        return None

    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text).strip()

    # The text is already lower-cased, so the byline must be matched in lower
    # case too (the former 'Verified Buyer' spelling could never match).
    text = re.sub(r'\bby .*? on \w+ \d{1,2}, \d{4} verified buyer', '', text).strip()

    if re.fullmatch(r'\d{1,2}-[A-Za-z]{3}-\d{2}|\w{3,9} \d{1,2}, \d{4}', text):
        return None
    text = re.sub(r'\b\w{3,9} \d{1,2}, \d{4}\b', '', text)
    return text.strip()

# Function to annotate usefulness of product based on review sentiment
def annotate_useful_in_india(review, positive=None, negative=None):
    """Label one review as "Yes", "No" or "Maybe" from keyword hits.

    Matching is by case-insensitive substring. A positive hit wins over a
    negative hit, but text that belongs to a negative phrase is not allowed to
    count as a positive hit (so "not working" is not read as "working").

    Args:
        review: Review text. Non-string or blank values (e.g. NaN from
            ``pd.read_csv``) are labelled "Maybe".
        positive: Optional list of positive keywords; defaults to the
            module-level ``positive_keywords``.
        negative: Optional list of negative keywords/phrases; defaults to the
            module-level ``negative_keywords``.

    Returns:
        "Yes" for a positive hit, "No" for a negative-only hit, else "Maybe".
    """
    if positive is None:
        positive = positive_keywords
    if negative is None:
        negative = negative_keywords

    # NaN cells are floats: truthy, but without .lower()
    if not isinstance(review, str) or not review.strip():
        return "Maybe"

    review = review.lower()
    has_negative = any(kw in review for kw in negative)

    # Blank out the negative phrases before looking for positive keywords
    remainder = review
    for kw in negative:
        if kw:
            remainder = remainder.replace(kw, " ")

    if any(kw in remainder for kw in positive):
        return "Yes"
    if has_negative:
        return "No"
    return "Maybe"

# Function to aggregate sentiment for each product
def aggregate_product_usefulness(product_reviews):
    """Reduce the per-review labels of one product to a single label.

    Each review is labelled with ``annotate_useful_in_india``; the product is
    "Yes" when "Yes" reviews outnumber "No" reviews, "No" in the opposite
    case, and "Maybe" on a tie (including when there are no reviews).

    Args:
        product_reviews: Iterable of review values for one product.

    Returns:
        "Yes", "No" or "Maybe".
    """
    labels = [annotate_useful_in_india(review) for review in product_reviews]
    positive_reviews = labels.count("Yes")
    negative_reviews = labels.count("No")
    # "Maybe" reviews never influence the outcome, so they are not tallied

    if positive_reviews > negative_reviews:
        return "Yes"
    if negative_reviews > positive_reviews:
        return "No"
    return "Maybe"

# Collect all reviews of a product into one list per product
grouped_reviews = (
    df.groupby('Product Name')['Reviews']
    .apply(list)
    .reset_index()
)

# One usefulness label per product, derived from its list of reviews
grouped_reviews['Useful_in_India'] = grouped_reviews['Reviews'].apply(aggregate_product_usefulness)

# Write the per-product annotation to its own CSV
grouped_reviews.to_csv("Annotated_Product_Reviews.csv", index=False, encoding="utf-8-sig")

# Offer the file as a browser download (Colab only)
from google.colab import files
files.download("Annotated_Product_Reviews.csv")

print("✅ Product usefulness annotated and saved.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Product usefulness annotated and saved.


In [None]:
import nltk
# The NLTK feature used by the visible cells is SentimentIntensityAnalyzer, which
# needs the VADER lexicon; 'all' downloads every NLTK corpus and model instead.
# NOTE(review): add further resource ids here if other cells need them.
nltk.download('vader_lexicon')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER lexicon (run once)
nltk.download('vader_lexicon')

# Load dataset: the per-review CSV written by the scraping cell
# (columns "Product Name" and "Reviews").
df = pd.read_csv("Snapdeal_Product_Reviews.csv")

# Initialize sentiment analyzer; get_sentiment() below reads this module-level name.
sia = SentimentIntensityAnalyzer()

# Function to analyze sentiment
def get_sentiment(text):
    """Map a review to "Positive", "Negative" or "Neutral".

    The value is passed through ``str()`` and scored with the module-level
    ``sia`` analyzer; the compound score decides the label:
    ``>= 0.05`` is positive, ``<= -0.05`` is negative, anything else neutral.
    """
    compound = sia.polarity_scores(str(text))['compound']
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# Label every review
df["Sentiment"] = df["Reviews"].apply(get_sentiment)

# Count labels per product: one row per product, one column per label seen
product_sentiment = (
    df.groupby("Product Name")["Sentiment"]
    .value_counts()
    .unstack(fill_value=0)
)
product_sentiment["Total Reviews"] = product_sentiment.sum(axis=1)

# Share of each label per product; a label that never occurred counts as 0
for label in ("Positive", "Negative", "Neutral"):
    product_sentiment[f"{label} %"] = (
        product_sentiment.get(label, 0) / product_sentiment["Total Reviews"]
    ) * 100

# Classify overall product sentiment
def classify_overall_sentiment(row):
    """Turn a product's label shares into an overall verdict.

    Args:
        row: Mapping-like row exposing "Positive %" and "Negative %".

    Returns:
        "Liked" when more than 50 % of reviews are positive, "Not Liked" when
        more than 50 % are negative, otherwise "Mixed Opinion".
    """
    if row["Positive %"] > 50:
        return "Liked"
    return "Not Liked" if row["Negative %"] > 50 else "Mixed Opinion"

# Apply classification: one overall verdict per product row
product_sentiment["Overall Sentiment"] = product_sentiment.apply(classify_overall_sentiment, axis=1)

# Save results.
# Per-review file: the original columns plus the "Sentiment" label.
df.to_csv("Reviews_Sentiment_Analysis.csv", index=False, encoding="utf-8-sig")
# Per-product summary: written WITH its index (no index=False), so the product name
# is stored as a column; a later cell reads this file and filters on 'Product Name'.
product_sentiment.to_csv("Product_Sentiment_Summary.csv", encoding="utf-8-sig")

print("Sentiment analysis completed. Results saved to CSV files.")

# Display summary
def generate_summary():
    """Build a text summary of how many reviews fall under each sentiment label.

    Reads the module-level ``df`` (its "Sentiment" column) and returns a header
    line followed by one "- label: count reviews (share%)" line per label.
    """
    total_reviews = len(df)
    sentiment_counts = df["Sentiment"].value_counts().to_dict()
    parts = [f"Out of {total_reviews} reviews:\n"]
    parts.extend(
        f"- {sentiment}: {count} reviews ({(count/total_reviews)*100:.2f}%)\n"
        for sentiment, count in sentiment_counts.items()
    )
    return "".join(parts)

print(generate_summary())


Sentiment analysis completed. Results saved to CSV files.
Out of 500 reviews:
- Positive: 466 reviews (93.20%)
- Neutral: 25 reviews (5.00%)
- Negative: 9 reviews (1.80%)



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy

# Download necessary NLTK data
nltk.download('vader_lexicon')

# spaCy English pipeline and VADER analyzer, bound as notebook-level names
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# Load dataset
df = pd.read_csv("Snapdeal_Product_Reviews.csv")

# Define key aspects to analyze (matched as lower-case substrings of each review)
aspects = [
    "quality", "price", "delivery", "pack", "performance", "item", "product",
    "useful", "smooth", "material", "design", "service", "fabric", "satisf",
    "sharper", "weight", "cloth", "look", "fit", "comfort", "size", "deal",
    "worth", "gas", "easy", "colour",
]


def extract_aspects(text, aspect_list=None):
    """Identify key aspects mentioned in the review.

    Matching is by case-insensitive substring, so a stem such as "satisf"
    also matches "satisfied".

    Args:
        text: Review text (``str``).
        aspect_list: Optional list of aspect keywords to look for; defaults to
            the module-level ``aspects`` list.

    Returns:
        The matched aspects, each once, in the order they appear in the
        keyword list (deterministic, unlike the former ``list(set(...))``).
    """
    candidates = aspects if aspect_list is None else aspect_list
    lowered = text.lower()  # lower-case once instead of once per keyword
    return list(dict.fromkeys(aspect for aspect in candidates if aspect in lowered))


def analyze_sentiment(text):
    """Determine sentiment polarity of a review.

    Scores ``text`` with the module-level ``sia`` analyzer and maps the
    compound score to "positive" (``>= 0.05``), "negative" (``<= -0.05``) or
    "neutral". The inclusive bounds match the ones ``get_sentiment`` uses in
    this notebook, so both functions agree on scores of exactly +/-0.05.
    """
    sentiment_score = sia.polarity_scores(text)['compound']
    if sentiment_score >= 0.05:
        return "positive"
    if sentiment_score <= -0.05:
        return "negative"
    return "neutral"

# Tag every review with its aspects and its sentiment polarity
df['aspects'] = df['Reviews'].astype(str).apply(extract_aspects)
df['sentiment'] = df['Reviews'].astype(str).apply(analyze_sentiment)

# Store the aspects as one comma-separated string; 'None' marks "no aspect found"
df['aspects'] = df['aspects'].apply(
    lambda found: ', '.join(found) if found else 'None'
)

# Keep only the first occurrence of each review text
df = df.drop_duplicates(subset=['Reviews'])

# Save results to CSV
df.to_csv("aspect_based_opinion_mining_results.csv", index=False)

print("Aspect-based opinion mining completed. Results saved to aspect_based_opinion_mining_results.csv")
print("Total unique reviews in dataset:", len(df))
print("Number of reviews with extracted aspects:", (df['aspects'] != 'None').sum())
print("Number of reviews with NO aspects:", (df['aspects'] == 'None').sum())

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Aspect-based opinion mining completed. Results saved to aspect_based_opinion_mining_results.csv
Total unique reviews in dataset: 492
Number of reviews with extracted aspects: 428
Number of reviews with NO aspects: 64


In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
from collections import Counter

# Download necessary NLTK data
nltk.download('vader_lexicon')
# NOTE(review): neither `nlp` nor `sia` is referenced by the rest of this cell as
# visible here — confirm whether they are still needed before removing them.
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()

# Load dataset: per-review CSV plus the per-product sentiment summary
# (the summary provides the 'Overall Sentiment' column used further down).
df = pd.read_csv("Snapdeal_Product_Reviews.csv")
summary_df = pd.read_csv("Product_Sentiment_Summary.csv")

# Define key aspects to analyze.
# Unlike the list in the previous cell, this one does not contain "product".
aspects = ["quality", "price", "delivery", "pack", "performance", "item", "useful", "smooth",
           "material", "design", "service", "fabric", "satisf", "sharper", "weight", "cloth", "look",
           "fit", "comfort", "size", "deal", "worth", "gas", "easy", "colour"]


def extract_aspects(text, aspect_list=None):
    """Identify key aspects mentioned in the review.

    Matching is by case-insensitive substring, so a stem such as "satisf"
    also matches "satisfied".

    Args:
        text: Review text (``str``).
        aspect_list: Optional list of aspect keywords to look for; defaults to
            the module-level ``aspects`` list.

    Returns:
        The matched aspects, each once, in the order they appear in the
        keyword list (deterministic, unlike the former ``list(set(...))``).
    """
    candidates = aspects if aspect_list is None else aspect_list
    lowered = text.lower()  # lower-case once instead of once per keyword
    return list(dict.fromkeys(aspect for aspect in candidates if aspect in lowered))


# Apply aspect extraction
df['aspects'] = df['Reviews'].astype(str).apply(extract_aspects)

# Remove duplicate reviews
df = df.drop_duplicates(subset=['Reviews'])

# Aggregate aspect occurrences per product: one record per (product, aspect mention)
product_aspect_data = [
    {'Product Name': row['Product Name'], 'Aspect': aspect}
    for _, row in df.iterrows()
    for aspect in row['aspects']
]

# Explicit columns keep the frame well-formed even when no review mentions any aspect
aspect_df = pd.DataFrame(product_aspect_data, columns=['Product Name', 'Aspect'])

# Identify best and worst aspects per product.
# Iterate over the products in df rather than aspect_df: a product whose reviews
# mention no aspect has no rows in aspect_df, so grouping aspect_df would drop it
# and the 'None'/'None' branch below could never run.
best_worst_aspects = []
for product, group in df.groupby('Product Name'):
    aspect_counts = Counter(aspect for found in group['aspects'] for aspect in found)
    if not aspect_counts:
        best_worst_aspects.append({'Product Name': product, 'Best Aspect': 'None', 'Worst Aspect': 'None'})
        continue

    overall_sentiment = summary_df.loc[summary_df['Product Name'] == product, 'Overall Sentiment'].values[0]

    # Most frequently mentioned aspect; ties are resolved alphabetically
    max_count = max(aspect_counts.values())
    top_aspect = min(aspect for aspect, count in aspect_counts.items() if count == max_count)

    # The top aspect is reported as "best" for liked products and as "worst" otherwise
    if overall_sentiment == "Liked":
        best_aspect, worst_aspect = top_aspect, 'None'
    else:
        best_aspect, worst_aspect = 'None', top_aspect

    best_worst_aspects.append({'Product Name': product, 'Best Aspect': best_aspect, 'Worst Aspect': worst_aspect})

best_worst_df = pd.DataFrame(best_worst_aspects, columns=['Product Name', 'Best Aspect', 'Worst Aspect'])

# Save results to CSV
best_worst_df.to_csv("best_worst_aspects_per_product.csv", encoding="utf-8-sig", index=False)

print("Aspect-based opinion mining completed. Results saved to CSV files.")
print("Total unique reviews in dataset:", len(df))
print("Number of reviews with extracted aspects:", (df['aspects'].str.len() > 0).sum())
print("Number of reviews with NO aspects:", (df['aspects'].str.len() == 0).sum())


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Aspect-based opinion mining completed. Results saved to CSV files.
Total unique reviews in dataset: 492
Number of reviews with extracted aspects: 306
Number of reviews with NO aspects: 186


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the per-product annotation file written by the annotation cell
# (columns: 'Product Name', 'Reviews', 'Useful_in_India').
# NOTE(review): 'Reviews' was a Python list per product when it was saved, so it is
# presumably read back here as the list's string form — confirm before relying on it.
df = pd.read_csv("Annotated_Product_Reviews.csv")

# Encode the Useful_in_India label as the integer target (Yes=1, No=0, Maybe=2).
# No separate sentiment or aspect columns are used in this cell.
df['sentiment_label'] = df['Useful_in_India'].map({'Yes': 1, 'No': 0, 'Maybe': 2})  # any other value becomes NaN

# The model input is the 'Reviews' column as-is; nothing is combined with it here.
df['combined_features'] = df['Reviews']

# Vectorizing text data (Bag-of-Words representation, English stop words removed)
vectorizer = CountVectorizer(stop_words='english')

# Prepare features and labels
X = df['combined_features']
y = df['sentiment_label']

# Train-test split: 20 % held out, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a pipeline with vectorizer, scaler and classifier
pipeline = make_pipeline(
    vectorizer,  # Vectorize the text data
    StandardScaler(with_mean=False),  # with_mean=False because the vectorizer output is sparse
    RandomForestClassifier(n_estimators=100, random_state=42)  # Classifier
)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model.
# NOTE(review): in the saved run the 10-row test split contains only class 1, so the
# reported accuracy of 1.0 says little about the other classes. The labels were also
# derived by keyword rules from the same review text that is used as the feature.
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Save the trained pipeline (vectorizer + scaler + classifier) to disk
import joblib
joblib.dump(pipeline, 'product_usefulness_model.pkl')

print("✅ Model training complete and saved.")


Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

Accuracy: 1.0
✅ Model training complete and saved.
