In [3]:
!pip install cohere

Collecting cohere
  Downloading cohere-5.11.0-py3-none-any.whl.metadata (3.4 kB)
Collecting boto3<2.0.0,>=1.34.0 (from cohere)
  Downloading boto3-1.35.35-py3-none-any.whl.metadata (6.6 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting parameterized<0.10.0,>=0.9.0 (from cohere)
  Downloading parameterized-0.9.0-py2.py3-none-any.whl.metadata (18 kB)
Collecting sagemaker<3.0.0,>=2.232.1 (from cohere)
  Downloading sagemaker-2.232.2-py3-none-any.whl.metadata (16 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20240914-py3-none-any.whl.metadata (1.9 kB)
Collecting botocore<1.36.0,>=1.35.35 (from boto3<2.0.0,>=1.34.0->cohere)
  Downloading botocore-1.35.35-py3-none-any.whl.metadata (5.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3<2.0.0,>=1.34.0->cohere)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Collecting cloudpickle==2.2.1 (from sagemaker<

In [8]:
import cohere
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string
import time
from typing import List, Dict, Tuple
import os
from collections import defaultdict

class LLMReviewGenerator:
    def __init__(self, cohere_api_key: str, products_df: pd.DataFrame, reviews_df: pd.DataFrame):
        self.co = cohere.Client(cohere_api_key)
        
        # Extract patterns from existing data
        self.category_chains = self._extract_category_chains(products_df)
        self.review_patterns = self._analyze_review_patterns(reviews_df)
        
        self.reviews_df = reviews_df
        
        print(f"Extracted {len(self.category_chains)} unique category chains")
        print(f"Analyzed review patterns across {len(reviews_df)} reviews")
        
    def _extract_category_chains(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Extract all unique category chains from the products dataframe"""
        category_columns = ['categories', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6']
        
        # Get unique category combinations
        unique_categories = df[category_columns].drop_duplicates()
        
        # Convert to list of dictionaries
        category_chains = unique_categories.to_dict('records')
        
        # Remove any None or NaN values
        cleaned_chains = []
        for chain in category_chains:
            cleaned_chain = {
                k: str(v) for k, v in chain.items() 
                if v is not None and pd.notna(v) and str(v).lower() != 'nan'
            }
            if cleaned_chain:
                cleaned_chains.append(cleaned_chain)
        
        return cleaned_chains

    def _analyze_review_patterns(self, reviews_df: pd.DataFrame) -> Dict:
        """Analyze patterns in existing reviews to inform generation"""
        patterns = {
            'rating_distribution': reviews_df['rating'].value_counts(normalize=True).to_dict(),
            'avg_helpful_votes': reviews_df['helpful_vote'].mean(),
            'verified_purchase_ratio': reviews_df['verified_purchase'].mean(),
            'review_length_stats': {
                'title_length': {
                    'mean': reviews_df['title'].str.len().mean(),
                    'std': reviews_df['title'].str.len().std()
                },
                'text_length': {
                    'mean': reviews_df['text'].str.len().mean(),
                    'std': reviews_df['text'].str.len().std()
                }
            },
            'temporal_patterns': {
                'hour_distribution': reviews_df['time'].dt.hour.value_counts(normalize=True).to_dict(),
                'weekday_distribution': reviews_df['date'].dt.dayofweek.value_counts(normalize=True).to_dict()
            },
            'reviews_per_product': reviews_df.groupby('parent_asin').size().agg(['mean', 'std']).to_dict()
        }
        
        return patterns

    def generate_product_prompt(self, category_chain: Dict[str, str]) -> str:
        category_path = []
        for cat in ['categories', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6']:
            if category_chain.get(cat):
                category_path.append(str(category_chain[cat]))
        
        category_str = " > ".join(category_path)
        
        return f"""Generate a realistic product title and description for an item in this category path: {category_str}
        Make it specific and detailed, matching the category exactly.
        Format: 
        Title: [Product Title]
        Description: [Product Description]"""

    def generate_review_prompt(self, product_title: str, category_chain: Dict[str, str], rating: int) -> str:
        sentiment = "positive" if rating > 3 else "negative" if rating < 3 else "neutral"

        # Get average review length for this rating
        target_length = int(self.review_patterns['review_length_stats']['text_length']['mean'])

        # Sample a few existing reviews with similar sentiment to guide the generated review
        existing_reviews = self._sample_existing_reviews(sentiment)
        example_reviews = "\n".join([f"- {review}" for review in existing_reviews])

        category_context = [str(category_chain['categories'])]
        if category_chain.get('cat1'):
            category_context.append(str(category_chain['cat1']))
        if category_chain.get('cat2'):
            category_context.append(str(category_chain['cat2']))

        category_str = " - ".join(category_context)

        return f"""Generate a realistic {sentiment} product review for: {product_title}
        Product Category: {category_str}
        The review should have a {rating} star rating out of 5 stars.
        Make the review approximately {target_length} characters long.
        Make it sound natural and include specific details about the product that make sense for its category.
        Use similar style and tone as these examples:
        {example_reviews}
        Format:
        Title: [Review Title]
        Review: [Review Text]"""
    
    def _sample_existing_reviews(self, sentiment: str, num_samples: int = 3) -> List[str]:
        """Sample existing reviews from the dataset based on sentiment."""
        if sentiment == "positive":
            filtered_reviews = self.reviews_df[self.reviews_df['rating'] > 3]['text']
        elif sentiment == "negative":
            filtered_reviews = self.reviews_df[self.reviews_df['rating'] < 3]['text']
        else:
            filtered_reviews = self.reviews_df[self.reviews_df['rating'] == 3]['text']

        # Randomly sample a few reviews from the filtered set
        sampled_reviews = filtered_reviews.sample(n=min(num_samples, len(filtered_reviews)), random_state=42).tolist()
        return sampled_reviews
    
    def parse_llm_review_response(self, response_text: str) -> Dict[str, str]:
        """Parse the review title and text from the LLM response."""
        try:
            lines = response_text.strip().split('\n')
            title_line = next((line for line in lines if line.startswith("Title:")), "Title: [Unknown Review]")
            review_line = next((line for line in lines if line.startswith("Review:")), "Review: [No Text]")

            title = title_line.replace("Title:", "").strip()
            review = review_line.replace("Review:", "").strip()

            return {'title': title, 'review': review}
        except Exception as e:
            print(f"Error parsing LLM review response: {e}")
            return {'title': '[Error: Unable to parse title]', 'review': '[Error: Unable to parse review]'}
    
    def generate_products(self, num_products: int) -> pd.DataFrame:
        """Generate synthetic product data based on category chains."""
        products = []

        for _ in range(num_products):
            # Randomly choose a category chain for the product
            category_chain = random.choice(self.category_chains)

            # Generate product title and description using Cohere
            prompt = self.generate_product_prompt(category_chain)

            try:
                response = self.co.generate(
                    prompt=prompt,
                    max_tokens=100,
                    temperature=0.7,
                    k=0,
                    stop_sequences=["\n\n"],
                    return_likelihoods='NONE'
                )

                product_info = self.parse_llm_product_response(response.generations[0].text)

                product = {
                    'title': product_info['title'],
                    'description': product_info['description'],
                    'categories': category_chain.get('categories', ''),
                    'cat1': category_chain.get('cat1', ''),
                    'cat2': category_chain.get('cat2', ''),
                    'cat3': category_chain.get('cat3', ''),
                    'cat4': category_chain.get('cat4', ''),
                    'cat5': category_chain.get('cat5', ''),
                    'cat6': category_chain.get('cat6', ''),
                    'parent_asin': ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
                }

                products.append(product)

                # Sleep to respect rate limits
                time.sleep(0.5)

            except Exception as e:
                print(f"Error generating product: {e}")
                continue

        return pd.DataFrame(products)

    def parse_llm_product_response(self, response_text: str) -> Dict[str, str]:
        """Parse the product title and description from the LLM response."""
        lines = response_text.strip().split('\n')
        title_line = next((line for line in lines if line.startswith("Title:")), "Title: [Unknown Product]")
        description_line = next((line for line in lines if line.startswith("Description:")), "Description: [No Description]")

        title = title_line.replace("Title:", "").strip()
        description = description_line.replace("Description:", "").strip()

        return {'title': title, 'description': description}

    def generate_reviews(self, products_df: pd.DataFrame, max_reviews: int = 1000) -> pd.DataFrame:
        data = []
        total_reviews = 0  # Counter for the total number of reviews generated

        for _, product in products_df.iterrows():
            if total_reviews >= max_reviews:  # Stop if we've reached the maximum number of reviews
                break

            # Determine number of reviews for the current product
            mean_reviews = self.review_patterns['reviews_per_product']['mean']
            std_reviews = self.review_patterns['reviews_per_product']['std']
            num_reviews = int(max(1, np.random.normal(mean_reviews, std_reviews)))

            # Limit the number of reviews for the current product if necessary
            num_reviews = min(num_reviews, max_reviews - total_reviews)

            category_chain = {
                col: product[col] for col in 
                ['categories', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6']
            }

            for _ in range(num_reviews):
                try:
                    # Generate rating based on learned distribution
                    rating = np.random.choice(
                        list(self.review_patterns['rating_distribution'].keys()),
                        p=list(self.review_patterns['rating_distribution'].values())
                    )

                    # Generate review using Cohere
                    response = self.co.generate(
                        prompt=self.generate_review_prompt(
                            product['title'],
                            category_chain,
                            rating
                        ),
                        max_tokens=200,
                        temperature=0.8,
                        k=0,
                        stop_sequences=["\n\n"],
                        return_likelihoods='NONE'
                    )

                    review_info = self.parse_llm_review_response(response.generations[0].text)

                    # Generate helpful votes based on learned patterns
                    helpful_votes = max(0, int(np.random.normal(
                        self.review_patterns['avg_helpful_votes'],
                        self.review_patterns['avg_helpful_votes'] * 0.5
                    )))

                    # Generate verified purchase based on learned ratio
                    verified_purchase = random.random() < self.review_patterns['verified_purchase_ratio']

                    review = {
                        'rating': rating,
                        'title': review_info['title'],
                        'text': review_info['review'],
                        'asin': product['X'],
                        'parent_asin': product['parent_asin'],
                        'user_id': ''.join(random.choices(string.ascii_lowercase + string.digits, k=8)),
                        'helpful_vote': helpful_votes,
                        'verified_purchase': verified_purchase
                    }

                    data.append(review)
                    total_reviews += 1  # Increment the total reviews counter

                    # Sleep to respect rate limits
                    time.sleep(0.5)

                    if total_reviews >= max_reviews:  # Stop if we've reached the maximum number of reviews
                        break

                except Exception as e:
                    print(f"Error generating review: {e}")
                    continue

        return pd.DataFrame(data)


    def _generate_date_based_on_patterns(self) -> datetime.date:
        """Generate a date based on learned weekday distribution"""
        weekday = np.random.choice(
            list(self.review_patterns['temporal_patterns']['weekday_distribution'].keys()),
            p=list(self.review_patterns['temporal_patterns']['weekday_distribution'].values())
        )
        
        # Generate a date that falls on the chosen weekday
        base_date = datetime(2020, 1, 1).date()
        days_ahead = weekday - base_date.weekday()
        if days_ahead <= 0:
            days_ahead += 7
        return base_date + timedelta(days=days_ahead)

    def _generate_time_based_on_patterns(self) -> datetime.time:
        """Generate a time based on learned hour distribution"""
        hour = np.random.choice(
            list(self.review_patterns['temporal_patterns']['hour_distribution'].keys()),
            p=list(self.review_patterns['temporal_patterns']['hour_distribution'].values())
        )
        minute = random.randint(0, 59)
        second = random.randint(0, 59)
        return datetime.strptime(f"{hour}:{minute}:{second}", "%H:%M:%S").time()

    # ... (rest of the methods remain the same)

def main():
    # Get Cohere API key from environment variable
    cohere_api_key = "1P0fk49DfMfiIiIxiMOdEOmE82crNyxBeeT2kG8H"
    if not cohere_api_key:
        raise ValueError("Please set the COHERE_API_KEY environment variable")
    
    # Load existing data
    print("Loading existing data...")
    existing_products_df = pd.read_csv('/kaggle/input/assignment-reviews-metadata/product_asin.csv')
    existing_reviews_df = pd.read_csv('/kaggle/input/assignment-reviews-metadata/reviews_supplements.csv', parse_dates=['date', 'time'])
    
    # Initialize generator with existing data
    generator = LLMReviewGenerator(cohere_api_key, existing_products_df, existing_reviews_df)
    
    # Generate synthetic products
#     print("Generating products...")
#     num_products = 10  # Start with a small number for testing
#     products_df = generator.generate_products(num_products)
#     print(products_df)
    
    # Generate synthetic reviews
    print("Generating reviews...")
    reviews_df = generator.generate_reviews(existing_products_df, max_reviews=100)
    print(reviews_df)
    
    # Save to CSV
#     products_df.to_csv('synthetic_products.csv', index=False)
    reviews_df.to_csv('synthetic_reviews.csv', index=False)
    
#     print(f"Generated {len(products_df)} products and {len(reviews_df)} reviews")
#     print("Files saved as 'synthetic_products.csv' and 'synthetic_reviews.csv'")

In [9]:
if __name__ == "__main__":
    main()

Loading existing data...


  existing_reviews_df = pd.read_csv('/kaggle/input/assignment-reviews-metadata/reviews_supplements.csv', parse_dates=['date', 'time'])


Extracted 700 unique category chains
Analyzed review patterns across 16671 reviews
Generating reviews...
    rating                                              title  \
0        5                                   [Unknown Review]   
1        4                              A Miracle for Rashes!   
2        5                                   [Unknown Review]   
3        5                                   [Unknown Review]   
4        5                                   [Unknown Review]   
..     ...                                                ...   
95       5  InvoSpa Shiatsu Back Shoulder and Neck Massage...   
96       5  InvoSpa Shiatsu Back Shoulder and Neck Massage...   
97       5  InvoSpa Shiatsu Back Shoulder and Neck Massage...   
98       1                                   [Unknown Review]   
99       1                                   [Unknown Review]   

                                                 text  asin parent_asin  \
0   Allegra Allergy is a miracle! I've 