In [50]:
import json
import pandas as pd
from tqdm import tqdm
from langchain_community.llms import Ollama

# Dataset

In [59]:
def read_json_array(path):
    """
    Load a JSON Lines file (one JSON document per line) into a list.

    Despite the name, the file is not parsed as a single JSON array: every
    non-blank line is decoded on its own with ``json.loads``.

    Parameters:
    path (str or path-like): Location of the file to read

    Returns:
    list: One decoded JSON value per non-blank line, in file order

    Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON
    """
    records = []
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank or whitespace-only lines (e.g. a trailing newline at
                # EOF) carry no record; json.loads('') would raise on them.
                continue
            records.append(json.loads(line))
    return records

# Where the Amazon reviews dump lives, relative to this notebook.
dataset_path = '../../datasets/amazon_reviews/'
data_path = f"{dataset_path}Musical_Instruments.json"

# Decode every line of the file, then tabulate the records.
data = read_json_array(data_path)
df = pd.DataFrame(data)

In [60]:
# Working frame: only the review text and the `overall` score, renamed,
# limited to the first 1000 rows.
amazon_reviews = (
    df[['reviewText', 'overall']]
    .rename(columns={'reviewText': 'Reviews', 'overall': 'Ratings'})
    .head(1000)
)

# Modeling

In [4]:
# Model handle shared by the topic-generation, merge and assignment steps below.
llm = Ollama(
    model="gemma2",
)

  llm = Ollama(model="gemma2")


### Step 1 - Generating topics for each batch of reviews

In [5]:
def _parse_csv_topics(raw_reply):
    """Split a comma/newline separated model reply into non-empty, stripped topic names."""
    return [
        topic.strip()
        for line in raw_reply.splitlines()
        for topic in line.split(',')
        if topic.strip()
    ]


def generate_topics_in_batches(amazon_reviews, batch_size=100, llm=None):
    """
    Generate topics from Amazon reviews in batches to handle large datasets.

    Each batch of reviews is embedded in one prompt; the model's reply is
    parsed as comma-separated topic names. Replies spread over several lines
    are handled, and empty fragments (trailing comma, ``,,``, blank lines)
    are discarded so no blank topic is returned.

    Parameters:
    amazon_reviews (pd.DataFrame): DataFrame with a 'Reviews' column
    batch_size (int): Number of reviews to process in each batch
    llm: The language model instance; must expose ``invoke(prompt, **kwargs)``
         returning the reply as a string

    Returns:
    list: List of lists containing the topics for each batch, in batch order

    Raises:
    ValueError: If no LLM instance is provided
    """
    if llm is None:
        raise ValueError("LLM instance must be provided")

    all_batch_topics = []
    total_reviews = len(amazon_reviews)

    # Process reviews in batches
    for start_idx in tqdm(range(0, total_reviews, batch_size)):
        end_idx = min(start_idx + batch_size, total_reviews)
        batch_reviews = amazon_reviews.iloc[start_idx:end_idx]

        # Generate prompt for current batch
        prompt_labels_generator = f'''You are provided with amazon reviews on the musical instruments and helping to cluster the reviews based on the topics.
Please create topics based on the reviews provided. Keep the topics general and not specific to the reviews.
Amazon Reviews: {batch_reviews['Reviews'].tolist()}
Please return only in CSV format with the following structure:
Topic1, Topic2, Topic3, Topic4, Topic5,...
Return only the topics in CSV format and nothing else.
'''

        # Get topics for current batch
        result = llm.invoke(prompt_labels_generator, temperature=0.0)

        # Convert the CSV reply to a clean list of topics
        all_batch_topics.append(_parse_csv_topics(result))

        print(f"Processed batch {len(all_batch_topics)}: reviews {start_idx} to {end_idx}")

    return all_batch_topics

In [6]:
# Step 1 run: one list of topics per 100-review batch.
topics_by_batch = generate_topics_in_batches(
    amazon_reviews,
    llm=llm,
    batch_size=100,
)

Processed batch 1: reviews 0 to 100
Processed batch 2: reviews 100 to 200
Processed batch 3: reviews 200 to 300
Processed batch 4: reviews 300 to 400
Processed batch 5: reviews 400 to 500
Processed batch 6: reviews 500 to 600
Processed batch 7: reviews 600 to 700
Processed batch 8: reviews 700 to 800
Processed batch 9: reviews 800 to 900
Processed batch 10: reviews 900 to 1000


In [10]:
# Flatten the per-batch topic lists and drop duplicates. dict.fromkeys keeps
# first-seen order, so this list (and the merge prompt built from it) is the
# same on every run; iterating a set of strings is not stable across
# interpreter sessions because str hashes are randomized. Empty entries
# (e.g. produced by a trailing comma in a model reply) are skipped.
all_unique_topics = list(dict.fromkeys(
    topic for batch in topics_by_batch for topic in batch if topic
))

### Step 2 - Merging all topics into a smaller, more general set

In [12]:
# Step 2 prompt: ask the model to collapse every topic collected in Step 1
# into a shorter, more general list, answered in CSV form.
# Note: all_unique_topics is interpolated as its Python list repr.
prompt_merge_topics = f'''You are provided with topics generated from Amazon reviews on musical instruments.
Please merge the topics into a smaller number of topics. The topics should be general and not specific to the reviews.
Topics: {all_unique_topics}
Please return only in CSV format with the following structure:
MergedTopic1, MergedTopic2, MergedTopic3, MergedTopic4, MergedTopic5,...
Return only the merged topics in CSV format and nothing else.
'''

# Get merged topics as the raw reply string; it is split into a list in the next cell.
proper_topics_str = llm.invoke(prompt_merge_topics, temperature=0.0)

In [15]:
# Split the model's CSV reply into clean topic names. Line breaks are treated
# as separators too, and empty fragments (trailing comma, blank line) are
# dropped so no blank topic is offered in the assignment step.
proper_topics = [
    topic.strip()
    for line in proper_topics_str.splitlines()
    for topic in line.split(',')
    if topic.strip()
]
print(proper_topics)

['Sound Quality', 'Ease of Use', 'Value for Money', 'Learning Resources', 'Aesthetics', 'Durability', 'Overall Satisfaction']


In [25]:
# Re-print the merged topic list (same variable as printed in the previous cell).
print(proper_topics)


['Sound Quality', 'Ease of Use', 'Value for Money', 'Learning Resources', 'Aesthetics', 'Durability', 'Overall Satisfaction']


### Step 3 - Assigning each review, batch by batch, a topic from Step 2 and a sentiment

In [112]:
def _strip_code_fence(reply):
    """Return the reply without a surrounding Markdown code fence (``` or ```json), if any."""
    cleaned = reply.strip()
    if cleaned.startswith("```"):
        lines = cleaned.splitlines()[1:]  # drop the opening fence line (may carry a language tag)
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]  # drop the closing fence line
        cleaned = "\n".join(lines).strip()
    return cleaned


def assign_topics_in_batches(input_df, topics, batch_size, llm=None):
    """
    Assign a topic and a sentiment to Amazon reviews in batches.

    Reviews are sent to the model ``batch_size`` at a time. The reply is
    expected to be a JSON list with one ``{"topic": ..., "sentiment": ...}``
    object per review, in the same order as the reviews. Results are written
    by row position, so the input may carry any index (filtered, shuffled,
    non-numeric). Rows for which no usable assignment comes back keep the
    value 'Unknown'.

    Parameters:
    input_df (pd.DataFrame): DataFrame with a 'Reviews' column; not modified
    topics (str): String of topics to assign from (inserted into the prompt as-is)
    batch_size (int): Number of reviews to process in each batch
    llm: The language model instance; must expose ``invoke(prompt, **kwargs)``
         returning the reply as a string

    Returns:
    pd.DataFrame: Copy of ``input_df`` with 'Topic' and 'Sentiment' columns added

    Raises:
    ValueError: If no LLM instance is provided
    """
    if llm is None:
        raise ValueError("LLM instance must be provided")

    # Create a copy of the DataFrame to avoid modifying the original
    df = input_df.copy()

    # Initialize Topic and Sentiment columns with 'Unknown'
    df['Topic'] = 'Unknown'
    df['Sentiment'] = 'Unknown'

    total_reviews = len(df)

    # Column positions for positional writes. Label-based .at[row_number, ...]
    # would hit the wrong row, or append new rows, whenever the index is not
    # the default 0..n-1 range.
    topic_col = df.columns.get_loc('Topic')
    sentiment_col = df.columns.get_loc('Sentiment')

    # Process reviews in batches with progress bar
    for start_idx in tqdm(range(0, total_reviews, batch_size), desc="Assigning topics"):
        end_idx = min(start_idx + batch_size, total_reviews)
        batch_reviews_list = df.iloc[start_idx:end_idx]
        batch_reviews = " ".join([f"Comment {i + 1}: {review}," for i, review in enumerate(batch_reviews_list['Reviews'])])

        # Generate prompt for current batch. The example lists as many inputs
        # as outputs so it does not suggest dropping a review.
        prompt_assigning_prompt = f'''You are provided with amazon reviews on the musical instruments and helping to cluster the reviews based on the topics.
Please assign the reviews to the topics provided. Return only the name of the topic and sentiment for the respective reviews. Sentiment can be only Positive, Negative or Neutral.
Amazon Reviews: {batch_reviews}
Topics: {topics}
Please return in JSON format only topics and sentiment for respective reviews and nothing else. Do not use triple backtick blocks. Only output exactly as on the example below:
Example: Having an input of Review1, Review2, Review3
Output: [{{"topic": "Topic1", "sentiment": "Sentiment1"}}, {{"topic": "Topic2", "sentiment": "Sentiment2"}}, {{"topic": "Topic3", "sentiment": "Sentiment3"}}]
'''

        # Get assignments for current batch
        result = llm.invoke(prompt_assigning_prompt, temperature=0.0)

        try:
            # Parse JSON response; tolerate a fenced reply even though the
            # prompt asks for none.
            assignments = json.loads(_strip_code_fence(result))
            if not isinstance(assignments, list):
                raise TypeError(f"expected a JSON list, got {type(assignments).__name__}")

            expected = end_idx - start_idx
            if len(assignments) != expected:
                print(f"\nWarning: batch starting at index {start_idx} returned "
                      f"{len(assignments)} assignments for {expected} reviews")

            # zip() bounds the writes to this batch's rows, so surplus items
            # in the reply can never spill into the next batch.
            for row_pos, assignment in zip(range(start_idx, end_idx), assignments):
                if not isinstance(assignment, dict):
                    # Malformed entry: leave this row as 'Unknown'.
                    continue
                df.iat[row_pos, topic_col] = assignment.get('topic', 'Unknown')
                df.iat[row_pos, sentiment_col] = assignment.get('sentiment', 'Unknown')

        except json.JSONDecodeError as e:
            print(f"\nError parsing JSON for batch starting at index {start_idx}: {str(e)}")
            print(f"Raw response: {result}")
            continue
        except Exception as e:
            # Best effort: report and move on so one bad batch does not
            # discard the work already done.
            print(f"\nUnexpected error processing batch starting at index {start_idx}: {str(e)}")
            continue

    return df


In [113]:
# Trial run of Step 3 on the first 11 reviews, five per prompt
# (three batches, the last one partial).
amazon_reviews_with_topics = assign_topics_in_batches(
    input_df=amazon_reviews.iloc[:11],
    topics=repr(proper_topics),
    llm=llm,
    batch_size=5,
)

Assigning topics: 100%|██████████| 3/3 [00:12<00:00,  4.18s/it]


In [114]:
# Show the labelled sample (a bare last expression renders the DataFrame).
amazon_reviews_with_topics

Unnamed: 0,Reviews,Ratings,Topic,Sentiment
0,Crocheting for Dummies by Karen Manthey & Susa...,5.0,Learning Resources,Positive
1,Very helpful...,4.0,Ease of Use,Positive
2,EASY TO UNDERSTAND AND A PROMPT SERVICE TOO,5.0,Overall Satisfaction,Positive
3,My girlfriend use quite often,4.0,Value for Money,Positive
4,Arrived as described. Very happy.,5.0,Overall Satisfaction,Positive
5,Love the Dummies Series. Never fails.,5.0,Learning Resources,Positive
6,Good book.,5.0,Learning Resources,Positive
7,Just started reading it. Love the charts & cau...,4.0,Learning Resources,Positive
8,GREAT book,4.0,Learning Resources,Positive
9,this is a very helpful book.,5.0,Learning Resources,Positive


In [31]:
# Save the labelled reviews as CSV in the working directory, without the index column.
amazon_reviews_with_topics.to_csv(
    'amazon_reviews_with_topics.csv',
    index=False,
)

In [115]:
# Number of reviews assigned to each topic.
amazon_reviews_with_topics.loc[:, 'Topic'].value_counts()

Topic
Learning Resources      7
Overall Satisfaction    2
Ease of Use             1
Value for Money         1
Name: count, dtype: int64