In [None]:
# mount drive
#from google.colab import drive
#drive.mount('/content/drive')
"""
This notebook generates comprehensive product review articles using:
- BART for high-quality summarization
- T5 for structured text generation
- RoBERTa sentiment analysis validation
"""

import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import re
from typing import Dict, List, Tuple, Any
import warnings
import os
import time

# Transformers and ML libraries
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    pipeline,
    T5ForConditionalGeneration,
    T5Tokenizer,
    BartForConditionalGeneration,
    BartTokenizer
)
import torch

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Silence library warnings so cell output stays readable
warnings.filterwarnings('ignore')

# Pick the compute device: GPU when PyTorch can see one, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

In [None]:
def load_models():
    """
    Instantiate the BART and T5 checkpoints plus a BART summarization pipeline.

    Both models are moved to the module-level ``device``. Progress and the
    elapsed load time are printed.
    Runtime: ~2-3 minutes on T4 GPU

    Returns:
        Dict with the keys 'bart_model', 'bart_tokenizer', 't5_model',
        't5_tokenizer' and 'summarizer'.
    """
    began_at = time.time()
    print("Loading models...")

    # BART checkpoint (summarization)
    bart_checkpoint = "facebook/bart-large-cnn"
    bart_tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)
    bart_model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)
    bart_model = bart_model.to(device)

    # T5 checkpoint (structured generation)
    t5_checkpoint = "t5-small"
    t5_tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint)
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)
    t5_model = t5_model.to(device)

    # The pipeline API takes a device index: 0 = first GPU, -1 = CPU
    pipeline_device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline(
        "summarization",
        model=bart_model,
        tokenizer=bart_tokenizer,
        device=pipeline_device,
        max_length=130,
        min_length=40,
        do_sample=False
    )

    elapsed = time.time() - began_at
    print(f"Models loaded in {elapsed:.1f} seconds")

    loaded = {}
    loaded['bart_model'] = bart_model
    loaded['bart_tokenizer'] = bart_tokenizer
    loaded['t5_model'] = t5_model
    loaded['t5_tokenizer'] = t5_tokenizer
    loaded['summarizer'] = summarizer
    return loaded

In [None]:
def load_and_prepare_data(file_path: str) -> pd.DataFrame:
    """
    Load the reranker products CSV, validate its schema and clean the review text.

    Runtime: ~30 seconds

    Args:
        file_path: Path of the CSV file, handed to ``pd.read_csv``.

    Returns:
        The loaded frame with an added 'clean_review' column, restricted to
        rows whose cleaned review is longer than 20 characters, re-indexed
        from 0.

    Raises:
        ValueError: If any required column is missing from the file.
    """
    print(f"Loading data from {file_path}")

    df = pd.read_csv(file_path)

    # 'doRecommend' is aggregated by get_top_products / get_worst_products, so
    # it is validated here: a file without it now fails immediately with a
    # clear message instead of deep inside the per-category analysis.
    required_cols = [
        'zero_shot_label', 'zero_shot_score', 'name', 'rating', 'text', 'doRecommend'
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        raise ValueError(f"Missing columns: {missing_cols}")

    # astype(str) turns missing text into the string 'nan', which clean_text
    # maps to ''.
    clean_review = df['text'].astype(str).apply(clean_text)

    # A length above 20 already excludes empty strings, so one condition suffices.
    long_enough = clean_review.str.len() > 20
    df = df.assign(clean_review=clean_review)[long_enough].reset_index(drop=True)

    print(f"Loaded {len(df)} reviews across {df['zero_shot_label'].nunique()} categories")

    return df

In [None]:
def clean_text(text: str) -> str:
    """
    Clean and normalize a single review string.

    Steps: replace HTML tags with a space, replace every character that is
    not a word character, whitespace or one of ``. , ! ? -`` with a space
    (this includes apostrophes, so "won't" becomes "won t"), then collapse
    whitespace runs and trim.

    Args:
        text: Raw review text. Missing values (NaN/None) and the literal
            string 'nan' are treated as empty.

    Returns:
        The cleaned text, or '' for missing input.
    """
    if pd.isna(text) or str(text) == 'nan':
        return ''

    # Substitute a space (not '') for tags so that words separated only by
    # markup, e.g. "good<br>bad", are not glued together into "goodbad".
    text = re.sub(r'<[^>]+>', ' ', str(text))
    text = re.sub(r'[^\w\s.,!?-]', ' ', text)  # Keep basic punctuation
    return re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

In [None]:
def get_top_products(category_data: pd.DataFrame, n_products: int = 3,
                     min_reviews: int = 5) -> List[Dict[str, Any]]:
    """
    Rank the products of one category and return the best ``n_products``.

    Each product gets a composite score: 40% mean rating (scaled to 0-1),
    30% mean zero-shot score, 20% mean 'doRecommend' value and 10%
    log-scaled review volume relative to the most-reviewed product.

    Args:
        category_data: Reviews of a single category. Must contain the columns
            'name', 'rating', 'zero_shot_score' and 'doRecommend', plus
            whatever ``extract_key_features`` reads.
        n_products: Maximum number of products to return.
        min_reviews: Minimum number of rated reviews a product needs in order
            to be ranked. Values below 1 are treated as 1.

    Returns:
        List of dicts, best product first, with the keys 'name', 'avg_rating',
        'review_count', 'avg_confidence', 'recommend_rate', 'composite_score'
        and 'key_features'. Empty when no product reaches ``min_reviews``.
    """
    product_stats = (
        category_data.groupby('name')
        .agg(
            avg_rating=('rating', 'mean'),
            review_count=('rating', 'count'),
            avg_confidence=('zero_shot_score', 'mean'),
            recommend_rate=('doRecommend', 'mean'),
        )
        .round(3)
    )

    # Filter products with sufficient reviews. The explicit copy keeps the
    # column assignment below from writing into a slice of the aggregate.
    threshold = max(min_reviews, 1)
    product_stats = product_stats[product_stats['review_count'] >= threshold].copy()

    if product_stats.empty:
        return []

    # Volume term: log(count) / log(max count). When every remaining product
    # has exactly one review the denominator is log(1) == 0, so the term is
    # dropped instead of dividing by zero.
    log_max_reviews = np.log(product_stats['review_count'].max())
    if log_max_reviews > 0:
        volume_score = np.log(product_stats['review_count']) / log_max_reviews
    else:
        volume_score = 0.0

    product_stats['composite_score'] = (
        0.4 * (product_stats['avg_rating'] / 5.0) +
        0.3 * product_stats['avg_confidence'] +
        0.2 * product_stats['recommend_rate'] +
        0.1 * volume_score
    )

    top_products = product_stats.nlargest(n_products, 'composite_score')

    results = []
    for product_name, stats in top_products.iterrows():
        product_reviews = category_data[category_data['name'] == product_name]

        results.append({
            'name': product_name,
            'avg_rating': stats['avg_rating'],
            # iterrows yields a float row, so restore the integer count
            'review_count': int(stats['review_count']),
            'avg_confidence': stats['avg_confidence'],
            'recommend_rate': stats['recommend_rate'],
            'composite_score': stats['composite_score'],
            'key_features': extract_key_features(product_reviews)
        })

    return results

In [None]:
def get_worst_products(category_data: pd.DataFrame, n_products: int = 1,
                       min_reviews: int = 3) -> List[Dict[str, Any]]:
    """
    Identify the worst performing products of one category.

    Each product gets a negative score (higher = worse): 50% rating shortfall
    (scaled to 0-1), 30% one minus the mean zero-shot score and 20% one minus
    the mean 'doRecommend' value.

    Args:
        category_data: Reviews of a single category. Must contain the columns
            'name', 'rating', 'zero_shot_score' and 'doRecommend', plus
            whatever ``extract_main_issues`` reads.
        n_products: Maximum number of products to return.
        min_reviews: Minimum number of rated reviews a product needs in order
            to be considered.

    Returns:
        List of dicts, worst product first, with the keys 'name',
        'avg_rating', 'review_count', 'avg_confidence', 'recommend_rate' and
        'main_issues'. Empty when no product reaches ``min_reviews``.
    """
    product_stats = (
        category_data.groupby('name')
        .agg(
            avg_rating=('rating', 'mean'),
            review_count=('rating', 'count'),
            avg_confidence=('zero_shot_score', 'mean'),
            recommend_rate=('doRecommend', 'mean'),
        )
        .round(3)
    )

    # The explicit copy keeps the column assignment below from writing into a
    # slice of the aggregate.
    product_stats = product_stats[product_stats['review_count'] >= min_reviews].copy()

    if product_stats.empty:
        return []

    # Negative scoring (higher = worse)
    product_stats['negative_score'] = (
        0.5 * (5.0 - product_stats['avg_rating']) / 5.0 +
        0.3 * (1.0 - product_stats['avg_confidence']) +
        0.2 * (1.0 - product_stats['recommend_rate'])
    )

    worst_products = product_stats.nlargest(n_products, 'negative_score')

    results = []
    for product_name, stats in worst_products.iterrows():
        product_reviews = category_data[category_data['name'] == product_name]

        results.append({
            'name': product_name,
            'avg_rating': stats['avg_rating'],
            # iterrows yields a float row, so restore the integer count
            'review_count': int(stats['review_count']),
            'avg_confidence': stats['avg_confidence'],
            'recommend_rate': stats['recommend_rate'],
            'main_issues': extract_main_issues(product_reviews)
        })

    return results

In [None]:
def extract_key_features(product_reviews: pd.DataFrame, random_state: int = 42) -> List[str]:
    """
    Pick up to three short sentences from positive reviews that mention a feature keyword.

    Reviews rated 4 or higher are sampled (at most 15), split into sentences
    on '.', and scanned keyword by keyword; the first unused sentence of
    11-79 characters containing the keyword is kept.

    Args:
        product_reviews: Reviews of one product with the columns 'rating' and
            'clean_review'.
        random_state: Seed for the review sample, so repeated runs return the
            same features.

    Returns:
        Up to three distinct sentences; ["Standard features"] when there are
        no positive reviews; ["Solid overall performance"] when no sentence
        matches a keyword.
    """
    positive_reviews = product_reviews[product_reviews['rating'] >= 4]

    if len(positive_reviews) == 0:
        return ["Standard features"]

    # Sample positive reviews (seeded for reproducible output)
    sample_size = min(15, len(positive_reviews))
    sampled_reviews = positive_reviews['clean_review'].sample(
        sample_size, random_state=random_state
    )

    feature_keywords = [
        'display', 'screen', 'battery', 'storage', 'performance',
        'camera', 'sound', 'design', 'fast', 'easy',
        'quality', 'durable', 'portable', 'connectivity'
    ]

    # Split every review on its own: joining reviews first would fuse the
    # last sentence of a review lacking a trailing period with the first
    # sentence of the next, unrelated review.
    candidates = []
    for review in sampled_reviews:
        for sentence in str(review).split('.'):
            sentence = sentence.strip()
            if 10 < len(sentence) < 80:
                candidates.append((sentence, sentence.lower()))

    features: List[str] = []
    for keyword in feature_keywords:
        # Skip sentences already chosen so one sentence mentioning several
        # keywords is not listed more than once.
        match = next(
            (text for text, lowered in candidates
             if keyword in lowered and text not in features),
            None,
        )
        if match is not None:
            features.append(match)

        if len(features) >= 3:
            break

    return features if features else ["Solid overall performance"]

In [None]:
def extract_complaints(category_data: pd.DataFrame, random_state: int = 42) -> List[str]:
    """
    Extract common complaints from the negative reviews of a category.

    Reviews rated 2 or lower are sampled (at most 50) and split into
    sentences on '.'. For each complaint category the keywords are tried in
    order, and the first sentence of 16-99 characters containing one of them
    is reported as "<Category>: <sentence>".

    Args:
        category_data: Reviews with the columns 'rating' and 'clean_review'.
        random_state: Seed for the review sample, so repeated runs return the
            same complaints.

    Returns:
        One entry per matched complaint category (at most five, one per
        pattern); ["No significant complaints"] when there are no negative
        reviews; ["General quality concerns"] when nothing matches.
    """
    negative_reviews = category_data[category_data['rating'] <= 2]

    if len(negative_reviews) == 0:
        return ["No significant complaints"]

    # Limit text for processing efficiency (seeded for reproducible output)
    sample_size = min(50, len(negative_reviews))
    sampled_reviews = negative_reviews['clean_review'].sample(
        sample_size, random_state=random_state
    )

    complaint_patterns = [
        ('battery', ['battery', 'charging', 'power']),
        ('quality', ['cheap', 'flimsy', 'poor quality', 'broke']),
        ('performance', ['slow', 'lag', 'freeze', 'crash']),
        ('connectivity', ['wifi', 'connection', 'network']),
        ('customer service', ['support', 'service', 'warranty'])
    ]

    # Split every review on its own so sentences from different reviews are
    # never fused, and lowercase each candidate once instead of lowercasing
    # the whole text inside the keyword loops.
    candidates = []
    for review in sampled_reviews:
        for sentence in str(review).split('.'):
            sentence = sentence.strip()
            if 15 < len(sentence) < 100:
                candidates.append((sentence, sentence.lower()))

    complaints = []
    for category, keywords in complaint_patterns:
        for keyword in keywords:
            match = next((text for text, lowered in candidates if keyword in lowered), None)
            if match is not None:
                complaints.append(f"{category.title()}: {match}")
                break  # one complaint per category

        if len(complaints) >= 4:
            break

    return complaints if complaints else ["General quality concerns"]

In [None]:
def extract_main_issues(product_reviews: pd.DataFrame) -> List[str]:
    """
    Extract up to three specific issue sentences for a poorly rated product.

    The first 20 reviews rated 2 or lower are split into sentences on '.'.
    For each issue keyword the first not-yet-used sentence of 11-89
    characters containing it is collected.

    Args:
        product_reviews: Reviews of one product with the columns 'rating' and
            'clean_review'.

    Returns:
        Up to three distinct sentences; ["Below average performance"] when
        there are no negative reviews; ["Reliability and quality issues"]
        when no sentence matches a keyword.
    """
    negative_reviews = product_reviews[product_reviews['rating'] <= 2]

    if len(negative_reviews) == 0:
        return ["Below average performance"]

    review_texts = negative_reviews['clean_review'].tolist()[:20]

    # clean_text replaces apostrophes with spaces, so "won't charge" arrives
    # here as "won t charge"; both spellings are listed so the keyword can
    # match cleaned as well as raw text.
    issue_keywords = [
        'stopped working', 'broke', 'defective', 'waste of money',
        'poor quality', 'battery died', 'won\'t charge', 'won t charge', 'freezes'
    ]

    # Split every review on its own so the end of one review is never fused
    # with the start of the next when a trailing period is missing.
    candidates = []
    for review in review_texts:
        for sentence in str(review).split('.'):
            sentence = sentence.strip()
            if 10 < len(sentence) < 90:
                candidates.append((sentence, sentence.lower()))

    issues: List[str] = []
    for keyword in issue_keywords:
        # Skip sentences already collected so one sentence matching several
        # keywords is reported only once.
        match = next(
            (text for text, lowered in candidates
             if keyword in lowered and text not in issues),
            None,
        )
        if match is not None:
            issues.append(match)

        if len(issues) >= 3:
            break

    return issues if issues else ["Reliability and quality issues"]

In [None]:
def analyze_category(df: pd.DataFrame, category: str) -> Dict[str, Any]:
    """
    Build the analysis dict for one category.

    Selects the rows whose 'zero_shot_label' equals ``category`` and gathers
    headline statistics plus the top products, worst products and common
    complaints. Returns None when the category has no rows.
    """
    belongs = df['zero_shot_label'] == category
    category_data = df[belongs]

    if not len(category_data):
        return None

    # Headline numbers first, then the per-product breakdowns, in the same
    # order the result dict lists them.
    product_total = category_data['name'].nunique()
    review_total = len(category_data)
    mean_rating = category_data['rating'].mean()
    mean_confidence = category_data['zero_shot_score'].mean()
    best = get_top_products(category_data)
    worst = get_worst_products(category_data)
    complaints = extract_complaints(category_data)

    return {
        'category_name': category,
        'total_products': product_total,
        'total_reviews': review_total,
        'avg_rating': mean_rating,
        'avg_confidence': mean_confidence,
        'top_products': best,
        'worst_products': worst,
        'common_complaints': complaints
    }

In [None]:
def generate_article_with_bart(models: Dict, analysis: Dict[str, Any]) -> str:
    """
    Render the markdown buying guide for one category.

    The article is assembled from fixed templates filled with the values in
    ``analysis``; ``models`` is accepted for interface compatibility but is
    not used by this function.

    Args:
        models: Dict of loaded models (unused here).
        analysis: Dict with the keys 'category_name', 'total_reviews',
            'total_products', 'avg_rating', 'avg_confidence', 'top_products',
            'worst_products' and 'common_complaints'. May be None or empty.

    Returns:
        The article as one markdown string, or a fixed placeholder sentence
        when ``analysis`` is falsy.
    """
    if not analysis:
        return "No analysis available for this category."

    # Create structured article
    article_parts = []

    # Header
    article_parts.append(f"# {analysis['category_name']} - Complete Buying Guide")
    article_parts.append(f"*Based on {analysis['total_reviews']} reviews across {analysis['total_products']} products*")
    article_parts.append(f"**Overall Rating: {analysis['avg_rating']:.1f}/5.0** | **Category Confidence: {analysis['avg_confidence']:.1%}**")

    top_products = analysis['top_products']

    # Top products section
    if top_products:
        # Report how many products are actually listed; a category can have
        # fewer than three qualifying products.
        listed = len(top_products)
        plural = 's' if listed != 1 else ''
        article_parts.append(f"\n## 🏆 Top {listed} Recommended Product{plural}")

        for i, product in enumerate(top_products, 1):
            article_parts.append(f"\n### {i}. {product['name']}")
            article_parts.append(f"⭐ **{product['avg_rating']:.1f}/5.0** ({product['review_count']} reviews)")
            article_parts.append(f"✅ **{product['recommend_rate']:.1%}** recommend this product")

            article_parts.append("\n**Key Features:**")
            article_parts.extend(f"• {feature}" for feature in product['key_features'])

    # Common issues
    if analysis['common_complaints']:
        article_parts.append("\n## ⚠️ Common Issues Across Category")
        article_parts.extend(f"• {complaint}" for complaint in analysis['common_complaints'])

    # Worst products
    if analysis['worst_products']:
        article_parts.append("\n## ❌ Products to Avoid")
        for product in analysis['worst_products']:
            article_parts.append(f"\n### {product['name']}")
            article_parts.append(f"⭐ **{product['avg_rating']:.1f}/5.0** ({product['review_count']} reviews)")

            article_parts.append("\n**Main Issues:**")
            article_parts.extend(f"• {issue}" for issue in product['main_issues'])

    # Recommendation
    if top_products:
        best_product = top_products[0]
        article_parts.append("\n## 🎯 Bottom Line")
        article_parts.append(f"**Best Choice:** {best_product['name']} leads with {best_product['avg_rating']:.1f}/5.0 stars and {best_product['recommend_rate']:.1%} recommendation rate.")
        article_parts.append(f"This category shows {'strong' if analysis['avg_rating'] >= 4.0 else 'moderate'} overall customer satisfaction.")

    return '\n'.join(article_parts)

In [None]:
def process_all_categories(models: Dict, df: pd.DataFrame) -> Dict[str, str]:
    """
    Analyze every category label in ``df`` and render one article per category.

    Missing labels are skipped, as are categories for which no analysis is
    produced. Per-category and total timings are printed.
    Runtime: ~4-6 minutes for 6 categories on T4 GPU

    Returns:
        Dict mapping category label to its generated article text.
    """
    run_started = time.time()
    print("Generating articles for all categories...")

    categories = df['zero_shot_label'].unique()
    category_total = len(categories)
    articles = {}

    for position, category in enumerate(categories, start=1):
        # Guard clauses: nothing to do for missing labels or empty analyses
        if pd.isna(category):
            continue

        tick = time.time()
        analysis = analyze_category(df, category)
        if not analysis:
            continue

        articles[category] = generate_article_with_bart(models, analysis)

        elapsed = time.time() - tick
        print(f"[{position}/{category_total}] {category}: {elapsed:.1f}s")

    run_elapsed = time.time() - run_started
    print(f"All articles generated in {run_elapsed:.1f} seconds")

    return articles

In [None]:
def save_articles(articles: Dict[str, str], output_dir: str = "generated_articles/"):
    """
    Write each article to ``<output_dir>/<slug>_buying_guide.md``.

    The slug is the category name with characters other than word
    characters, whitespace and hyphens removed, runs of hyphens/whitespace
    turned into single underscores, and everything lowercased. The output
    directory is created if needed.
    """
    os.makedirs(output_dir, exist_ok=True)

    for category in articles:
        # Strip punctuation first, then collapse separators into underscores
        stripped = re.sub(r'[^\w\s-]', '', category).strip()
        slug = re.sub(r'[-\s]+', '_', stripped).lower()
        target = os.path.join(output_dir, f"{slug}_buying_guide.md")

        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(articles[category])

In [None]:
def create_summary_dashboard(df: pd.DataFrame, articles: Dict[str, str]):
    """
    Plot a 2x2 grid of per-category bar charts and return the underlying stats.

    The panels show average rating, average zero-shot score, review count
    and unique product count per category. ``articles`` is accepted but not
    read by this function.

    Returns:
        DataFrame with one row per non-missing category and the columns
        'category', 'avg_rating', 'avg_confidence', 'review_count' and
        'product_count'.
    """
    labels = [label for label in df['zero_shot_label'].unique() if not pd.isna(label)]

    records = []
    for label in labels:
        subset = df[df['zero_shot_label'] == label]
        records.append({
            'category': label,
            'avg_rating': subset['rating'].mean(),
            'avg_confidence': subset['zero_shot_score'].mean(),
            'review_count': len(subset),
            'product_count': subset['name'].nunique()
        })

    stats_df = pd.DataFrame(records)

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # (axis, stats column, bar color, title, x label) for each panel
    panels = [
        (axes[0, 0], 'avg_rating', 'skyblue', 'Average Rating by Category', 'Rating (1-5)'),
        (axes[0, 1], 'avg_confidence', 'lightgreen', 'Zero-Shot Classification Confidence', 'Confidence Score'),
        (axes[1, 0], 'review_count', 'orange', 'Number of Reviews', 'Review Count'),
        (axes[1, 1], 'product_count', 'purple', 'Number of Unique Products', 'Product Count'),
    ]

    for ax, column, color, title, xlabel in panels:
        ax.barh(stats_df['category'], stats_df[column], color=color)
        ax.set_title(title)
        ax.set_xlabel(xlabel)

    plt.tight_layout()
    plt.show()

    return stats_df

In [None]:
# Main execution functions
def run_summarization_pipeline(file_path: str = "/content/reranker_products.csv",
                               output_dir: str = "generated_articles/"):
    """
    Run the complete pipeline: load models, load data, generate, save, plot.

    Total Estimated Runtime: 8-12 minutes on T4 GPU

    Args:
        file_path: CSV file with the reviews. Defaults to the Colab location
            used by this notebook.
        output_dir: Directory the markdown buying guides are written to.

    Returns:
        Tuple ``(models, df, articles, summary_stats)``: the loaded models,
        the prepared review frame, the category -> article mapping and the
        per-category stats frame from the dashboard.
    """
    pipeline_start = time.time()

    # Step 1: Load models (2-3 minutes)
    models = load_models()

    # Step 2: Load data (~30 seconds)
    df = load_and_prepare_data(file_path)

    # Step 3: Generate articles (4-6 minutes)
    articles = process_all_categories(models, df)

    # Step 4: Save results (~10 seconds)
    save_articles(articles, output_dir)

    # Step 5: Create dashboard (~20 seconds)
    summary_stats = create_summary_dashboard(df, articles)

    total_time = time.time() - pipeline_start

    print(f"\n{'='*60}")
    print("PIPELINE COMPLETE!")
    print(f"{'='*60}")
    print(f"Total Runtime: {total_time/60:.1f} minutes")
    print(f"Generated {len(articles)} buying guides")
    # Report the directory actually written to rather than a fixed name
    print(f"Articles saved to '{output_dir}' directory")

    # Show sample article preview
    if articles:
        sample_category = next(iter(articles))
        print(f"\nSample preview ({sample_category}):")
        print("=" * 50)
        print(articles[sample_category][:400] + "...")

    return models, df, articles, summary_stats

# Execute the pipeline end to end. The four returned objects are bound at
# notebook (module) level so later cells can inspect them.
models, df, articles, stats = run_summarization_pipeline()