In [1]:
import spacy
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import re

# Load spaCy model (you'll need to install: python -m spacy download en_core_web_sm)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Please install spaCy English model: python -m spacy download en_core_web_sm")
    exit()

# Sample Amazon product reviews data
sample_reviews = [
    {
        'review_id': 1,
        'product_category': 'Electronics',
        'review_text': "The iPhone 14 Pro from Apple is absolutely amazing! The camera quality is outstanding and the battery life exceeds expectations. Highly recommended!",
        'rating': 5
    },
    {
        'review_id': 2,
        'product_category': 'Electronics',
        'review_text': "Samsung Galaxy S23 Ultra disappointed me. The screen is good but the phone gets too hot during gaming. Not worth the price.",
        'rating': 2
    },
    {
        'review_id': 3,
        'product_category': 'Home & Kitchen',
        'review_text': "KitchenAid Stand Mixer is a game changer! Perfect for baking cookies and cakes. The Ninja blender also works great for smoothies.",
        'rating': 5
    },
    {
        'review_id': 4,
        'product_category': 'Books',
        'review_text': "Harry Potter series by J.K. Rowling never gets old. Amazon Kindle makes reading so convenient. The story is captivating!",
        'rating': 4
    },
    {
        'review_id': 5,
        'product_category': 'Electronics',
        'review_text': "Sony WH-1000XM5 headphones have terrible sound quality. The noise cancellation doesn't work properly. Very disappointing purchase from Best Buy.",
        'rating': 1
    },
    {
        'review_id': 6,
        'product_category': 'Clothing',
        'review_text': "Nike Air Max shoes are comfortable but the quality control is poor. Adidas makes better running shoes in my opinion.",
        'rating': 3
    },
    {
        'review_id': 7,
        'product_category': 'Electronics',
        'review_text': "MacBook Pro M2 from Apple Store is incredibly fast. Perfect for video editing and programming tasks. Worth every penny!",
        'rating': 5
    },
    {
        'review_id': 8,
        'product_category': 'Home & Kitchen',
        'review_text': "Instant Pot pressure cooker broke after 2 months. Cuisinart products are much more reliable. Poor quality control.",
        'rating': 2
    }
]

# Convert to DataFrame
df = pd.DataFrame(sample_reviews)

class AmazonReviewAnalyzer:
    def __init__(self):
        self.nlp = nlp
        self.product_patterns = [
            # Electronics patterns
            r'\b(?:iPhone|iPad|MacBook|Apple Watch|AirPods)\s*(?:\d+|\w+)*\b',
            r'\b(?:Samsung|Galaxy)\s*(?:\w+\s*)*\b',
            r'\b(?:Sony|Nintendo|Microsoft|Dell|HP|Lenovo)\s*(?:\w+\s*)*\b',
            # Brand + Product patterns
            r'\b(?:Nike|Adidas|Puma)\s*(?:\w+\s*)*\b',
            r'\b(?:KitchenAid|Cuisinart|Instant Pot|Ninja)\s*(?:\w+\s*)*\b'
        ]

        # Sentiment lexicon (simplified rule-based approach)
        self.positive_words = {
            'amazing', 'excellent', 'outstanding', 'perfect', 'great', 'good', 'love',
            'fantastic', 'wonderful', 'awesome', 'best', 'brilliant', 'superb', 'recommended',
            'comfortable', 'convenient', 'fast', 'reliable', 'worth', 'captivating'
        }

        self.negative_words = {
            'terrible', 'awful', 'bad', 'poor', 'disappointing', 'disappointed', 'hate',
            'worst', 'horrible', 'useless', 'broken', 'defective', 'overpriced', 'cheap',
            'unreliable', 'slow', 'uncomfortable', 'inconvenient'
        }

        # Intensifiers
        self.intensifiers = {
            'very': 1.5, 'extremely': 2.0, 'absolutely': 1.8, 'really': 1.3,
            'quite': 1.2, 'totally': 1.7, 'completely': 1.8, 'incredibly': 1.9
        }

    def extract_entities(self, text):
        """Extract named entities using spaCy NER"""
        doc = self.nlp(text)
        entities = []

        for ent in doc.ents:
            entities.append({
                'text': ent.text,
                'label': ent.label_,
                'description': spacy.explain(ent.label_),
                'start': ent.start_char,
                'end': ent.end_char
            })

        return entities

    def extract_products_and_brands(self, text):
        """Extract products and brands using pattern matching and NER"""
        products_brands = []

        # Pattern-based extraction
        for pattern in self.product_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                products_brands.append({
                    'text': match.group(),
                    'type': 'Product/Brand',
                    'method': 'Pattern',
                    'start': match.start(),
                    'end': match.end()
                })

        # NER-based extraction
        doc = self.nlp(text)
        for ent in doc.ents:
            if ent.label_ in ['ORG', 'PRODUCT', 'GPE']:  # Organizations, Products, Geopolitical entities
                products_brands.append({
                    'text': ent.text,
                    'type': f'NER-{ent.label_}',
                    'method': 'spaCy NER',
                    'start': ent.start_char,
                    'end': ent.end_char
                })

        # Remove duplicates
        unique_entities = []
        seen = set()
        for entity in products_brands:
            key = (entity['text'].lower(), entity['start'], entity['end'])
            if key not in seen:
                unique_entities.append(entity)
                seen.add(key)

        return unique_entities

    def analyze_sentiment_rule_based(self, text):
        """Rule-based sentiment analysis"""
        # Clean and tokenize
        text_lower = text.lower()
        doc = self.nlp(text_lower)
        tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

        positive_score = 0
        negative_score = 0

        # Simple word counting with intensifiers
        for i, token in enumerate(tokens):
            # Check for intensifiers
            intensifier = 1.0
            if i > 0 and tokens[i-1] in self.intensifiers:
                intensifier = self.intensifiers[tokens[i-1]]

            if token in self.positive_words:
                positive_score += intensifier
            elif token in self.negative_words:
                negative_score += intensifier

        # Calculate final sentiment
        total_score = positive_score - negative_score

        if total_score > 0.5:
            sentiment = 'Positive'
        elif total_score < -0.5:
            sentiment = 'Negative'
        else:
            sentiment = 'Neutral'

        return {
            'sentiment': sentiment,
            'positive_score': positive_score,
            'negative_score': negative_score,
            'total_score': total_score,
            'confidence': abs(total_score) / max(1, positive_score + negative_score)
        }

    def analyze_review(self, review_text):
        """Complete analysis of a single review"""
        # Extract all entities
        entities = self.extract_entities(review_text)

        # Extract products and brands specifically
        products_brands = self.extract_products_and_brands(review_text)

        # Analyze sentiment
        sentiment_analysis = self.analyze_sentiment_rule_based(review_text)

        # Additional TextBlob sentiment for comparison
        blob = TextBlob(review_text)
        textblob_sentiment = {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity,
            'sentiment': 'Positive' if blob.sentiment.polarity > 0.1 else 'Negative' if blob.sentiment.polarity < -0.1 else 'Neutral'
        }

        return {
            'entities': entities,
            'products_brands': products_brands,
            'rule_based_sentiment': sentiment_analysis,
            'textblob_sentiment': textblob_sentiment
        }

    def analyze_dataset(self, df):
        """Analyze entire dataset"""
        results = []

        for idx, row in df.iterrows():
            analysis = self.analyze_review(row['review_text'])
            analysis['review_id'] = row['review_id']
            analysis['actual_rating'] = row['rating']
            analysis['product_category'] = row['product_category']
            results.append(analysis)

        return results

# Initialize analyzer
analyzer = AmazonReviewAnalyzer()

# Analyze all reviews
print("=" * 60)
print("AMAZON REVIEWS NLP ANALYSIS WITH spaCy")
print("=" * 60)

results = analyzer.analyze_dataset(df)

# Display results for each review
for i, result in enumerate(results):
    print(f"\n{'='*50}")
    print(f"REVIEW {result['review_id']} - {result['product_category']}")
    print(f"Rating: {result['actual_rating']}/5")
    print(f"{'='*50}")

    # Original text
    original_text = df[df['review_id'] == result['review_id']]['review_text'].iloc[0]
    print(f"Text: {original_text}\n")

    # Products and Brands
    print("EXTRACTED PRODUCTS & BRANDS:")
    if result['products_brands']:
        for entity in result['products_brands']:
            print(f"  • {entity['text']} ({entity['type']}) - Method: {entity['method']}")
    else:
        print("  No products/brands detected")

    # All Named Entities
    print("\nALL NAMED ENTITIES:")
    if result['entities']:
        for entity in result['entities']:
            print(f"  • {entity['text']} → {entity['label']} ({entity['description']})")
    else:
        print("  No named entities detected")

    # Sentiment Analysis
    print(f"\nSENTIMENT ANALYSIS:")
    rule_sentiment = result['rule_based_sentiment']
    textblob_sentiment = result['textblob_sentiment']

    print(f"Rule-based: {rule_sentiment['sentiment']} (Score: {rule_sentiment['total_score']:.2f})")
    print(f"  - Positive score: {rule_sentiment['positive_score']:.2f}")
    print(f"  - Negative score: {rule_sentiment['negative_score']:.2f}")
    print(f"  - Confidence: {rule_sentiment['confidence']:.2f}")

    print(f"TextBlob: {textblob_sentiment['sentiment']} (Polarity: {textblob_sentiment['polarity']:.2f})")

# Summary Statistics
print(f"\n{'='*60}")
print("SUMMARY STATISTICS")
print(f"{'='*60}")

# Extract all products/brands
all_products = []
sentiment_distribution = {'Positive': 0, 'Negative': 0, 'Neutral': 0}
category_sentiment = {}

for result in results:
    # Collect products
    for entity in result['products_brands']:
        all_products.append(entity['text'])

    # Count sentiments
    sentiment = result['rule_based_sentiment']['sentiment']
    sentiment_distribution[sentiment] += 1

    # Category sentiment
    category = result['product_category']
    if category not in category_sentiment:
        category_sentiment[category] = {'Positive': 0, 'Negative': 0, 'Neutral': 0}
    category_sentiment[category][sentiment] += 1

# Most mentioned products
product_counts = Counter(all_products)
print(f"\nMOST MENTIONED PRODUCTS/BRANDS:")
for product, count in product_counts.most_common(5):
    print(f"  {product}: {count} mentions")

print(f"\nSENTIMENT DISTRIBUTION:")
for sentiment, count in sentiment_distribution.items():
    percentage = (count / len(results)) * 100
    print(f"  {sentiment}: {count} reviews ({percentage:.1f}%)")

print(f"\nSENTIMENT BY CATEGORY:")
for category, sentiments in category_sentiment.items():
    print(f"  {category}:")
    total = sum(sentiments.values())
    for sentiment, count in sentiments.items():
        if count > 0:
            percentage = (count / total) * 100
            print(f"    {sentiment}: {count} ({percentage:.1f}%)")

# Accuracy comparison with ratings
print(f"\nACCURACY ANALYSIS:")
print("Comparing sentiment predictions with actual ratings...")

correct_predictions = 0
total_predictions = len(results)

for result in results:
    actual_rating = result['actual_rating']
    predicted_sentiment = result['rule_based_sentiment']['sentiment']

    # Convert rating to sentiment
    if actual_rating >= 4:
        actual_sentiment = 'Positive'
    elif actual_rating <= 2:
        actual_sentiment = 'Negative'
    else:
        actual_sentiment = 'Neutral'

    if predicted_sentiment == actual_sentiment:
        correct_predictions += 1

accuracy = (correct_predictions / total_predictions) * 100
print(f"Rule-based sentiment accuracy: {accuracy:.1f}% ({correct_predictions}/{total_predictions})")

print(f"\n{'='*60}")
print("ANALYSIS COMPLETE")
print(f"{'='*60}")

AMAZON REVIEWS NLP ANALYSIS WITH spaCy

REVIEW 1 - Electronics
Rating: 5/5
Text: The iPhone 14 Pro from Apple is absolutely amazing! The camera quality is outstanding and the battery life exceeds expectations. Highly recommended!

EXTRACTED PRODUCTS & BRANDS:
  • iPhone 14 (Product/Brand) - Method: Pattern
  • Apple (NER-ORG) - Method: spaCy NER

ALL NAMED ENTITIES:
  • 14 → CARDINAL (Numerals that do not fall under another type)
  • Apple → ORG (Companies, agencies, institutions, etc.)

SENTIMENT ANALYSIS:
Rule-based: Positive (Score: 2.80)
  - Positive score: 2.80
  - Negative score: 0.00
  - Confidence: 1.00
TextBlob: Positive (Polarity: 0.48)

REVIEW 2 - Electronics
Rating: 2/5
Text: Samsung Galaxy S23 Ultra disappointed me. The screen is good but the phone gets too hot during gaming. Not worth the price.

EXTRACTED PRODUCTS & BRANDS:
  • Samsung Galaxy S23 Ultra disappointed me (Product/Brand) - Method: Pattern
  • Samsung (NER-ORG) - Method: spaCy NER
  • S23 Ultra (NER-PRODUCT) 