In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains

def handle_privacy_notice(driver, max_wait=10):
    """
    Handle the privacy notice popup by finding and clicking the close button
    Returns True if successfully closed, False otherwise
    """
    try:
        # Wait for privacy notice to appear and become clickable
        wait = WebDriverWait(driver, max_wait)
        close_button = wait.until(
            EC.element_to_be_clickable((By.ID, "onetrust-close-btn-container"))
        )
        
        # Trying different click methods
        try:
            close_button.click()
        except:
            try:
                ActionChains(driver).move_to_element(close_button).click().perform()
            except:
                driver.execute_script("arguments[0].click();", close_button)
        
        # Wait for popup to disappear
        time.sleep(1)
        return True
    except Exception as e:
        print(f"Error handling privacy notice: {e}")
        return False

def fetch_and_save_html(url, output_file="bestbuy_reviews.html", max_attempts=20):
    """
    Use Selenium to load the page, handle privacy notice, click 'Show More', and save the final HTML
    """
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Enable headless mode
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--window-size=1920,1080')

    driver = webdriver.Chrome(options=chrome_options)
    action = ActionChains(driver)
    
    try:
        print(f"Opening URL: {url}")
        driver.get(url)
        time.sleep(3)  # Initial load wait
        
        # Handle privacy notice
        if not handle_privacy_notice(driver):
            print("Warning: Could not handle privacy notice")
        
        attempts = 0
        while attempts < max_attempts:
            try:
                # Wait for Show More button to be present and visible
                wait = WebDriverWait(driver, 10)
                show_more = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "div[data-automation='load-more-button'] a.loadMoreLink_2cY6X")
                    )
                )
                
                # Scroll into view
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", show_more)
                time.sleep(1)
                
                # Click using different methods until one works
                try:
                    show_more.click()
                except:
                    try:
                        action.move_to_element(show_more).click().perform()
                    except:
                        driver.execute_script("arguments[0].click();", show_more)
                
                print(f"Clicked Show More button (attempt {attempts + 1})")
                time.sleep(2)  # Wait for new content to load
                attempts += 1
                
            except NoSuchElementException:
                print("No more Show More button found")
                break
            except Exception as e:
                print(f"Error clicking button: {e}")
                break
        
        # Wait longer before saving HTML to ensure all content is loaded
        print("Waiting for final content to load completely...")
        time.sleep(10)
        
        # Save the final HTML
        print("Saving final HTML...")
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(driver.page_source)
        
        print(f"HTML saved to {output_file}")
        
    finally:
        driver.quit()

In [2]:

url = "https://www.bestbuy.ca/en-ca/product/google-pixel-9-pro-256gb-hazel-unlocked/18165489/review"

try:
    fetch_and_save_html(url)
except Exception as e:
    print(f"An error occurred: {e}")

Opening URL: https://www.bestbuy.ca/en-ca/product/google-pixel-9-pro-256gb-hazel-unlocked/18165489/review
Clicked Show More button (attempt 1)
Clicked Show More button (attempt 2)
Clicked Show More button (attempt 3)
Clicked Show More button (attempt 4)
Clicked Show More button (attempt 5)
Error clicking button: Message: 
Stacktrace:
#0 0x5752cf3dcc5a <unknown>
#1 0x5752cf0bfe2c <unknown>
#2 0x5752cf10c661 <unknown>
#3 0x5752cf10c751 <unknown>
#4 0x5752cf150f64 <unknown>
#5 0x5752cf12f5ed <unknown>
#6 0x5752cf14e303 <unknown>
#7 0x5752cf12f363 <unknown>
#8 0x5752cf0ff247 <unknown>
#9 0x5752cf0ffb9e <unknown>
#10 0x5752cf3a322b <unknown>
#11 0x5752cf3a72d1 <unknown>
#12 0x5752cf38eade <unknown>
#13 0x5752cf3a7e32 <unknown>
#14 0x5752cf37377f <unknown>
#15 0x5752cf3cc618 <unknown>
#16 0x5752cf3cc7f0 <unknown>
#17 0x5752cf3dbd8c <unknown>
#18 0x75bfc2894ac3 <unknown>

Waiting for final content to load completely...
Saving final HTML...
HTML saved to bestbuy_reviews.html


In [3]:
from bs4 import BeautifulSoup
import csv
import re
from datetime import datetime

def find_element_by_partial_class(container, element_type, partial_class):
    """
    Helper function to find elements with class names starting with the given prefix
    """
    elements = container.find_all(element_type, class_=lambda x: x and x.startswith(partial_class))
    return elements[0] if elements else None

def parse_reviews_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    reviews_data = []
    
    # Find all review containers - looking for classes starting with 'reviewContent'
    reviews = soup.find_all(lambda tag: tag.get('class') and 
                          any(cls.startswith('reviewContent') for cls in tag.get('class')))
    
    for review in reviews:
        # Get the parent container that holds all review information
        review_container = review.parent
        
        # Extract review title - finding class starting with 'reviewTitle'
        title = find_element_by_partial_class(review_container, 'div', 'reviewTitle')
        title_text = title.get_text(strip=True) if title else ''
        
        # Extract reviewer info - finding class starting with 'reviewerInfo'
        reviewer_info = find_element_by_partial_class(review_container, 'div', 'reviewerInfo')
        if reviewer_info:
            # Find author - class starting with 'author'
            author = find_element_by_partial_class(reviewer_info, 'span', 'author')
            author_name = author.find_all('span')[-1].get_text(strip=True) if author else ''
            
            # Find date - class starting with 'locationAndTime'
            date_span = find_element_by_partial_class(reviewer_info, 'span', 'locationAndTime')
            date_str = date_span.get('data-date') if date_span else ''
            formatted_date = datetime.fromisoformat(date_str).strftime('%Y-%m-%d') if date_str else ''
        else:
            author_name = ''
            formatted_date = ''
        
        # Extract review content
        review_text = review.get_text(strip=True)
        
        # Extract syndication source - class starting with 'syndicationSource'
        syndication = find_element_by_partial_class(review_container, 'p', 'syndicationSource')
        source = syndication.get_text(strip=True) if syndication else ''
        
        # Check if it's a promotional review
        is_promotional = '[This review was collected as part of a promotion.]' in review_text
        clean_review = review_text.replace('[This review was collected as part of a promotion.]', '').strip()
        
        reviews_data.append({
            'title': title_text,
            'author': author_name,
            'date': formatted_date,
            'review_text': clean_review,
            'is_promotional': is_promotional,
            'source': source
        })
    
    return reviews_data

def save_to_csv(reviews_data, output_file='reviews.csv'):
    if not reviews_data:
        print("No reviews found!")
        return
        
    fieldnames = ['title', 'author', 'date', 'review_text', 'is_promotional', 'source']
    
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(reviews_data)
        print(f"Successfully saved {len(reviews_data)} reviews to {output_file}")

In [4]:
html_file = "/home/sravanth/Documents/axion_ray/bestbuy_reviews.html"
with open(html_file, 'r', encoding='utf-8') as file:
    html_content = file.read()

reviews_data = parse_reviews_html(html_content)
print(f"Found {len(reviews_data)} reviews")
save_to_csv(reviews_data)

Found 58 reviews
Successfully saved 58 reviews to reviews.csv


In [5]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Download required NLTK data
nltk.download('punkt_tab')
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

def analyze_reviews(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Initialize the NLTK sentiment analyzer
    sia = SentimentIntensityAnalyzer()
    
    # Function to get sentiment scores
    def get_sentiment(text):
        return sia.polarity_scores(text)
    
    # Calculate sentiment scores for each review
    sentiments = df['review_text'].apply(get_sentiment)
    
    # Extract sentiment scores into separate columns
    df['compound'] = sentiments.apply(lambda x: x['compound'])
    df['positive'] = sentiments.apply(lambda x: x['pos'])
    df['negative'] = sentiments.apply(lambda x: x['neg'])
    df['neutral'] = sentiments.apply(lambda x: x['neu'])
    
    # Classify overall sentiment
    df['sentiment'] = df['compound'].apply(lambda x: 'Positive' if x > 0.05 
                                         else ('Negative' if x < -0.05 else 'Neutral'))
    
    # Get most common words (excluding stopwords)
    stop_words = set(stopwords.words('english'))
    all_words = ' '.join(df['review_text']).lower()
    word_tokens = word_tokenize(all_words)
    filtered_words = [word for word in word_tokens 
                     if word.isalnum() and word not in stop_words]
    word_freq = Counter(filtered_words).most_common(10)
    
    # Calculate average sentiment scores
    avg_scores = {
        'Average Compound Score': df['compound'].mean(),
        'Average Positive Score': df['positive'].mean(),
        'Average Negative Score': df['negative'].mean(),
        'Average Neutral Score': df['neutral'].mean()
    }
    
    # Calculate sentiment distribution
    sentiment_dist = df['sentiment'].value_counts()
    
    # Generate summary statistics
    summary = {
        'Total Reviews': len(df),
        'Average Sentiment Score': df['compound'].mean(),
        'Positive Reviews': sentiment_dist.get('Positive', 0),
        'Neutral Reviews': sentiment_dist.get('Neutral', 0),
        'Negative Reviews': sentiment_dist.get('Negative', 0),
        'Most Common Words': dict(word_freq),
        'Average Scores': avg_scores
    }
    
    return df, summary

def plot_sentiment_distribution(df):
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='compound', bins=20)
    plt.title('Distribution of Sentiment Scores')
    plt.xlabel('Compound Sentiment Score')
    plt.ylabel('Count')
    plt.savefig('sentiment_distribution.png')
    plt.close()

def plot_sentiment_categories(df):
    plt.figure(figsize=(8, 6))
    df['sentiment'].value_counts().plot(kind='bar')
    plt.title('Distribution of Sentiment Categories')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.savefig('sentiment_categories.png')
    plt.close()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/sravanth/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/sravanth/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/sravanth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sravanth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:

# Analyze reviews
df, summary = analyze_reviews('/home/sravanth/Documents/axion_ray/reviews.csv')

# Print summary
print("\n=== Sentiment Analysis Summary ===")
print(f"Total Reviews Analyzed: {summary['Total Reviews']}")
print(f"Average Sentiment Score: {summary['Average Sentiment Score']:.3f}")
print("\nSentiment Distribution:")
print(f"Positive Reviews: {summary['Positive Reviews']}")
print(f"Neutral Reviews: {summary['Neutral Reviews']}")
print(f"Negative Reviews: {summary['Negative Reviews']}")

print("\nAverage Scores:")
for metric, score in summary['Average Scores'].items():
    print(f"{metric}: {score:.3f}")

print("\nTop 10 Most Common Words:")
for word, count in summary['Most Common Words'].items():
    print(f"{word}: {count}")

# Generate plots
plot_sentiment_distribution(df)
plot_sentiment_categories(df)

# Export detailed results to CSV
df[['review_text', 'compound', 'positive', 'negative', 'neutral', 'sentiment']].to_csv(
    'sentiment_analysis_results.csv', index=False)


=== Sentiment Analysis Summary ===
Total Reviews Analyzed: 58
Average Sentiment Score: 0.929

Sentiment Distribution:
Positive Reviews: 58
Neutral Reviews: 0
Negative Reviews: 0

Average Scores:
Average Compound Score: 0.929
Average Positive Score: 0.276
Average Negative Score: 0.019
Average Neutral Score: 0.704

Top 10 Most Common Words:
phone: 121
pixel: 73
camera: 58
pro: 55
google: 50
9: 48
battery: 44
great: 37
features: 35
life: 31
