In [1]:
import json
import pandas as pd
from tqdm import tqdm
from langchain_community.llms import Ollama

# Dataset

In [4]:
def read_json_array(path):
  data = []
  with open(path, 'r') as file:
    for line in file:
      json_object = json.loads(line.strip())
      data.append(json_object)
  return data

dataset_path = '../../datasets/amazon_reviews/'
data_path = dataset_path + "Musical_Instruments.json"

data = read_json_array(data_path)
df = pd.DataFrame(data)

In [5]:
amazon_reviews = pd.DataFrame()
amazon_reviews['Reviews'] = df['reviewText'].copy()
amazon_reviews['Ratings'] = df['overall'].copy()
amazon_reviews = amazon_reviews.sample(n=10000, random_state=42).reset_index(drop=True)

# Modeling

In [6]:
llm = Ollama(model="gemma2")

  llm = Ollama(model="gemma2")


### Step 1 - Creating Topics for batches of dataset

In [37]:
def generate_topics_in_batches(amazon_reviews, batch_size=100, llm=None):
    """
    Generate topics from Amazon reviews in batches to handle large datasets.
    
    Parameters:
    amazon_reviews (pd.DataFrame): DataFrame containing Amazon reviews
    batch_size (int): Number of reviews to process in each batch
    llm: The language model instance to use for generating topics
    
    Returns:
    list: List of lists containing topics for each batch
    """
    if llm is None:
        raise ValueError("LLM instance must be provided")
    
    all_batch_topics = []
    total_reviews = len(amazon_reviews)
    
    # Process reviews in batches
    for start_idx in tqdm(range(0, total_reviews, batch_size)):
        end_idx = min(start_idx + batch_size, total_reviews)
        batch_reviews = amazon_reviews.iloc[start_idx:end_idx]
        
        # Generate prompt for current batch
        prompt_labels_generator = f'''You are provided with amazon reviews and helping to cluster the reviews based on the topics.
Please create topics based on the reviews provided. Keep the topics general and not specific to the reviews. 
DO NOT include any specific details. NO TOPICS LIKE  DJ Equipment, Guitar, Piano, etc. Topics should be general and helpful for business steakholders to understand what people are thinking about the products,
Amazon Reviews: {batch_reviews['Reviews'].tolist()}
Please return only in CSV format with the following structure:
Topic1, Topic2, Topic3, Topic4, Topic5,...
Return only the topics in CSV format and nothing else.
'''

        # Get topics for current batch
        result = llm.invoke(prompt_labels_generator, temperature=0.0)
        
        # Convert CSV string to list of topics
        batch_topics = [topic.strip() for topic in result.split(',')]
        all_batch_topics.append(batch_topics)
        
        print(f"Processed batch {len(all_batch_topics)}: reviews {start_idx} to {end_idx}")
    
    return all_batch_topics

In [38]:
topics_by_batch = generate_topics_in_batches(amazon_reviews.sample(1000, random_state=42), batch_size=10, llm=llm)

  1%|          | 1/100 [00:04<08:06,  4.92s/it]

Processed batch 1: reviews 0 to 10


  2%|▏         | 2/100 [00:09<07:51,  4.81s/it]

Processed batch 2: reviews 10 to 20


  3%|▎         | 3/100 [00:13<07:23,  4.57s/it]

Processed batch 3: reviews 20 to 30


  4%|▍         | 4/100 [00:19<08:05,  5.05s/it]

Processed batch 4: reviews 30 to 40


  5%|▌         | 5/100 [00:23<07:03,  4.46s/it]

Processed batch 5: reviews 40 to 50


  6%|▌         | 6/100 [00:25<05:58,  3.81s/it]

Processed batch 6: reviews 50 to 60


  7%|▋         | 7/100 [00:28<05:15,  3.39s/it]

Processed batch 7: reviews 60 to 70


  8%|▊         | 8/100 [00:30<04:44,  3.09s/it]

Processed batch 8: reviews 70 to 80


  9%|▉         | 9/100 [00:34<05:16,  3.48s/it]

Processed batch 9: reviews 80 to 90


 10%|█         | 10/100 [00:39<05:37,  3.75s/it]

Processed batch 10: reviews 90 to 100


 11%|█         | 11/100 [00:43<05:41,  3.83s/it]

Processed batch 11: reviews 100 to 110


 12%|█▏        | 12/100 [00:46<05:14,  3.57s/it]

Processed batch 12: reviews 110 to 120


 13%|█▎        | 13/100 [00:49<05:00,  3.46s/it]

Processed batch 13: reviews 120 to 130


 14%|█▍        | 14/100 [00:52<04:43,  3.30s/it]

Processed batch 14: reviews 130 to 140


 15%|█▌        | 15/100 [00:57<05:28,  3.87s/it]

Processed batch 15: reviews 140 to 150


 16%|█▌        | 16/100 [01:03<06:02,  4.32s/it]

Processed batch 16: reviews 150 to 160


 17%|█▋        | 17/100 [01:08<06:21,  4.59s/it]

Processed batch 17: reviews 160 to 170


 18%|█▊        | 18/100 [01:11<05:32,  4.06s/it]

Processed batch 18: reviews 170 to 180


 19%|█▉        | 19/100 [01:18<06:44,  4.99s/it]

Processed batch 19: reviews 180 to 190


 20%|██        | 20/100 [01:20<05:42,  4.29s/it]

Processed batch 20: reviews 190 to 200


 21%|██        | 21/100 [01:24<05:20,  4.06s/it]

Processed batch 21: reviews 200 to 210


 22%|██▏       | 22/100 [01:27<04:46,  3.67s/it]

Processed batch 22: reviews 210 to 220


 23%|██▎       | 23/100 [01:31<04:53,  3.81s/it]

Processed batch 23: reviews 220 to 230


 24%|██▍       | 24/100 [01:34<04:27,  3.51s/it]

Processed batch 24: reviews 230 to 240


 25%|██▌       | 25/100 [01:40<05:27,  4.37s/it]

Processed batch 25: reviews 240 to 250


 26%|██▌       | 26/100 [01:47<06:24,  5.19s/it]

Processed batch 26: reviews 250 to 260


 27%|██▋       | 27/100 [01:50<05:27,  4.48s/it]

Processed batch 27: reviews 260 to 270


 28%|██▊       | 28/100 [01:54<05:24,  4.50s/it]

Processed batch 28: reviews 270 to 280


 29%|██▉       | 29/100 [01:59<05:10,  4.38s/it]

Processed batch 29: reviews 280 to 290


 30%|███       | 30/100 [02:03<05:12,  4.46s/it]

Processed batch 30: reviews 290 to 300


 31%|███       | 31/100 [02:06<04:31,  3.94s/it]

Processed batch 31: reviews 300 to 310


 32%|███▏      | 32/100 [02:09<04:13,  3.73s/it]

Processed batch 32: reviews 310 to 320


 33%|███▎      | 33/100 [02:13<04:13,  3.78s/it]

Processed batch 33: reviews 320 to 330


 34%|███▍      | 34/100 [02:15<03:32,  3.22s/it]

Processed batch 34: reviews 330 to 340


 35%|███▌      | 35/100 [02:19<03:39,  3.38s/it]

Processed batch 35: reviews 340 to 350


 36%|███▌      | 36/100 [02:22<03:32,  3.32s/it]

Processed batch 36: reviews 350 to 360


 37%|███▋      | 37/100 [02:27<03:55,  3.75s/it]

Processed batch 37: reviews 360 to 370


 38%|███▊      | 38/100 [02:31<03:59,  3.86s/it]

Processed batch 38: reviews 370 to 380


 39%|███▉      | 39/100 [02:37<04:34,  4.50s/it]

Processed batch 39: reviews 380 to 390


 40%|████      | 40/100 [02:45<05:30,  5.52s/it]

Processed batch 40: reviews 390 to 400


 41%|████      | 41/100 [02:49<05:11,  5.28s/it]

Processed batch 41: reviews 400 to 410


 42%|████▏     | 42/100 [02:54<04:45,  4.93s/it]

Processed batch 42: reviews 410 to 420


 43%|████▎     | 43/100 [02:58<04:29,  4.73s/it]

Processed batch 43: reviews 420 to 430


 44%|████▍     | 44/100 [03:00<03:50,  4.12s/it]

Processed batch 44: reviews 430 to 440


 45%|████▌     | 45/100 [03:05<03:58,  4.34s/it]

Processed batch 45: reviews 440 to 450


 46%|████▌     | 46/100 [03:07<03:09,  3.51s/it]

Processed batch 46: reviews 450 to 460


 47%|████▋     | 47/100 [03:18<05:05,  5.77s/it]

Processed batch 47: reviews 460 to 470


 48%|████▊     | 48/100 [03:24<05:07,  5.91s/it]

Processed batch 48: reviews 470 to 480


 49%|████▉     | 49/100 [03:29<04:37,  5.44s/it]

Processed batch 49: reviews 480 to 490


 50%|█████     | 50/100 [03:33<04:24,  5.29s/it]

Processed batch 50: reviews 490 to 500


 51%|█████     | 51/100 [03:36<03:45,  4.60s/it]

Processed batch 51: reviews 500 to 510


 52%|█████▏    | 52/100 [03:40<03:21,  4.21s/it]

Processed batch 52: reviews 510 to 520


 53%|█████▎    | 53/100 [03:43<02:57,  3.78s/it]

Processed batch 53: reviews 520 to 530


 54%|█████▍    | 54/100 [03:50<03:39,  4.76s/it]

Processed batch 54: reviews 530 to 540


 55%|█████▌    | 55/100 [03:54<03:35,  4.78s/it]

Processed batch 55: reviews 540 to 550


 56%|█████▌    | 56/100 [03:59<03:30,  4.77s/it]

Processed batch 56: reviews 550 to 560


 57%|█████▋    | 57/100 [04:03<03:12,  4.48s/it]

Processed batch 57: reviews 560 to 570


 58%|█████▊    | 58/100 [04:05<02:43,  3.89s/it]

Processed batch 58: reviews 570 to 580


 59%|█████▉    | 59/100 [04:11<02:56,  4.29s/it]

Processed batch 59: reviews 580 to 590


 60%|██████    | 60/100 [04:15<02:46,  4.15s/it]

Processed batch 60: reviews 590 to 600


 61%|██████    | 61/100 [04:17<02:17,  3.54s/it]

Processed batch 61: reviews 600 to 610


 62%|██████▏   | 62/100 [04:20<02:16,  3.61s/it]

Processed batch 62: reviews 610 to 620


 63%|██████▎   | 63/100 [04:25<02:19,  3.76s/it]

Processed batch 63: reviews 620 to 630


 64%|██████▍   | 64/100 [04:33<03:02,  5.08s/it]

Processed batch 64: reviews 630 to 640


 65%|██████▌   | 65/100 [04:36<02:36,  4.48s/it]

Processed batch 65: reviews 640 to 650


 66%|██████▌   | 66/100 [04:47<03:38,  6.42s/it]

Processed batch 66: reviews 650 to 660


 67%|██████▋   | 67/100 [04:50<03:05,  5.62s/it]

Processed batch 67: reviews 660 to 670


 68%|██████▊   | 68/100 [04:54<02:41,  5.06s/it]

Processed batch 68: reviews 670 to 680


 69%|██████▉   | 69/100 [04:58<02:23,  4.64s/it]

Processed batch 69: reviews 680 to 690


 70%|███████   | 70/100 [05:02<02:15,  4.51s/it]

Processed batch 70: reviews 690 to 700


 71%|███████   | 71/100 [05:07<02:14,  4.64s/it]

Processed batch 71: reviews 700 to 710


 72%|███████▏  | 72/100 [05:13<02:17,  4.90s/it]

Processed batch 72: reviews 710 to 720


 73%|███████▎  | 73/100 [05:15<01:50,  4.08s/it]

Processed batch 73: reviews 720 to 730


 74%|███████▍  | 74/100 [05:17<01:28,  3.42s/it]

Processed batch 74: reviews 730 to 740


 75%|███████▌  | 75/100 [05:20<01:28,  3.53s/it]

Processed batch 75: reviews 740 to 750


 76%|███████▌  | 76/100 [05:22<01:13,  3.07s/it]

Processed batch 76: reviews 750 to 760


 77%|███████▋  | 77/100 [05:29<01:34,  4.12s/it]

Processed batch 77: reviews 760 to 770


 78%|███████▊  | 78/100 [05:33<01:31,  4.17s/it]

Processed batch 78: reviews 770 to 780


 79%|███████▉  | 79/100 [05:36<01:19,  3.79s/it]

Processed batch 79: reviews 780 to 790


 80%|████████  | 80/100 [05:39<01:09,  3.47s/it]

Processed batch 80: reviews 790 to 800


 81%|████████  | 81/100 [05:41<01:01,  3.23s/it]

Processed batch 81: reviews 800 to 810


 82%|████████▏ | 82/100 [05:45<00:57,  3.19s/it]

Processed batch 82: reviews 810 to 820


 83%|████████▎ | 83/100 [05:48<00:56,  3.31s/it]

Processed batch 83: reviews 820 to 830


 84%|████████▍ | 84/100 [05:51<00:52,  3.26s/it]

Processed batch 84: reviews 830 to 840


 85%|████████▌ | 85/100 [05:55<00:49,  3.31s/it]

Processed batch 85: reviews 840 to 850


 86%|████████▌ | 86/100 [06:00<00:52,  3.75s/it]

Processed batch 86: reviews 850 to 860


 87%|████████▋ | 87/100 [06:04<00:49,  3.83s/it]

Processed batch 87: reviews 860 to 870


 88%|████████▊ | 88/100 [06:11<01:00,  5.00s/it]

Processed batch 88: reviews 870 to 880


 89%|████████▉ | 89/100 [06:19<01:05,  5.96s/it]

Processed batch 89: reviews 880 to 890


 90%|█████████ | 90/100 [06:23<00:52,  5.29s/it]

Processed batch 90: reviews 890 to 900


 91%|█████████ | 91/100 [06:26<00:40,  4.49s/it]

Processed batch 91: reviews 900 to 910


 92%|█████████▏| 92/100 [06:30<00:34,  4.29s/it]

Processed batch 92: reviews 910 to 920


 93%|█████████▎| 93/100 [06:36<00:33,  4.78s/it]

Processed batch 93: reviews 920 to 930


 94%|█████████▍| 94/100 [06:38<00:24,  4.02s/it]

Processed batch 94: reviews 930 to 940


 95%|█████████▌| 95/100 [06:41<00:19,  3.80s/it]

Processed batch 95: reviews 940 to 950


 96%|█████████▌| 96/100 [06:47<00:17,  4.31s/it]

Processed batch 96: reviews 950 to 960


 97%|█████████▋| 97/100 [06:51<00:13,  4.44s/it]

Processed batch 97: reviews 960 to 970


 98%|█████████▊| 98/100 [06:55<00:08,  4.30s/it]

Processed batch 98: reviews 970 to 980


 99%|█████████▉| 99/100 [06:57<00:03,  3.62s/it]

Processed batch 99: reviews 980 to 990


100%|██████████| 100/100 [07:01<00:00,  4.21s/it]

Processed batch 100: reviews 990 to 1000





In [39]:
all_unique_topics = list(set([topic for batch in topics_by_batch for topic in batch]))

In [41]:
all_unique_topics

['Technical Aspects',
 'User Experience',
 'Sound & Performance',
 'Guitar Straps',
 'Sound/Performance',
 'Aesthetics',
 'Guitar Effects Pedals',
 'Impedance Matching',
 'Customer Experience',
 'Product Durability',
 'Performance & Functionality',
 'Price Value',
 'Durability & Performance',
 'Features',
 'Value',
 'Comfort',
 'Shipping & Delivery',
 'Microphone Sensitivity',
 'Quality',
 'Learning Curve',
 'Size',
 'Usage Experience',
 'Vocal Harmonizers',
 'Product Quality',
 'Size & Portability',
 'Value for Money',
 'Price',
 'Performance Issues',
 'Guitar Amplifiers',
 'Usage',
 'Aesthetics & Comfort',
 'Performance',
 'Delivery Experience',
 'Customer Satisfaction',
 'Installation & Setup',
 'Target Audience',
 'Comfort/Design',
 'Fit & Functionality',
 'Brand Perception',
 'Experience',
 'Cello Pickup Installation',
 'Shipping Experience',
 'Compatibility',
 'Customer Service',
 'Product Performance',
 'Usability',
 'Quality & Durability',
 'Performance Quality',
 'Shipping & P

### Step 2 - Combining all topics to a smaller more general subset

In [40]:
prompt_merge_topics = f'''You are provided with topics generated from Amazon reviews.
Please merge the topics into a smaller number of topics. The topics should be VERY GENERAL and NOT specific. Do not output more than 15 topics. 
Merge/combine topics that are similar or related.
Topics: {all_unique_topics}
Please return only in CSV format with the following structure:
MergedTopic1, MergedTopic2, MergedTopic3, MergedTopic4, MergedTopic5,...
Return only the merged topics in CSV format and nothing else.
'''

# Get merged topics
proper_topics_str = llm.invoke(prompt_merge_topics, temperature=0.0)

In [43]:
proper_topics = [topic.strip() for topic in proper_topics_str.split(',')]
print(proper_topics)

['Technical Aspects', 'User Experience', 'Sound & Performance', 'Aesthetics', 'Durability', 'Price & Value', 'Customer Experience', 'Size & Portability', 'Functionality & Usability', 'Shipping & Delivery', 'Brand Perception', 'Product Quality', 'Comfort & Design', 'Learning Curve', 'Target Audience', 'Usage Experience', 'Gift Giving', 'Emotional Response', 'Compatibility', 'Customer Service']


### Step 3 - Assigning batches of dataset to Topics Generated in Step 2 and sentiment of the review

In [44]:
def assign_topics_in_batches(input_df, topics, batch_size, llm=None):
    """
    Assign topics and sentiment to Amazon reviews in batches and update DataFrame directly.

    Parameters:
    input_df (pd.DataFrame): DataFrame containing Amazon reviews
    topics (str): String of topics to assign from
    batch_size (int): Number of reviews to process in each batch
    llm: The language model instance to use for assigning topics

    Returns:
    pd.DataFrame: Updated DataFrame with topic and sentiment assignments
    """
    if llm is None:
        raise ValueError("LLM instance must be provided")

    # Create a copy of the DataFrame to avoid modifying the original
    df = input_df.copy()

    # Initialize Topic and Sentiment columns with 'Unknown'
    df['Topic'] = 'Unknown'
    df['Sentiment'] = 'Unknown'

    total_reviews = len(df)

    # Process reviews in batches with progress bar
    for start_idx in tqdm(range(0, total_reviews, batch_size), desc="Assigning topics"):
        end_idx = min(start_idx + batch_size, total_reviews)
        batch_reviews_list = df.iloc[start_idx:end_idx]
        batch_reviews = " ".join([f"Comment {i + 1}: {review}," for i, review in enumerate(batch_reviews_list['Reviews'])])
        
        # Generate prompt for current batch
        prompt_assigning_prompt = f'''You are provided with amazon reviews and helping to cluster the reviews based on the topics.
Please assign the review to the topics provided. Return only the name of the topic and sentiment for the review. Sentiment can be only Positive, Negative or Neutral.
Amazon Review: {batch_reviews}
Topics: {topics}
Please return in JSON format only topics and sentiment for respective reviews and nothing else. Do not use triple backtick blocks. Only output exactly as on the example below:
Example: Having an input of Review1, Review2, Review3, Review4
Output: [{{"topic": "Topic1", "sentiment": "Sentiment1"}}, {{"topic": "Topic2", "sentiment": "Sentiment2"}}, {{"topic": "Topic3", "sentiment": "Sentiment3"}}]
'''

        # Get assignments for current batch
        result = llm.invoke(prompt_assigning_prompt, temperature=0.0)

        try:
            # Parse JSON response
            assignments = json.loads(result)

            # Update DataFrame directly using indices
            for idx, assignment in enumerate(assignments):
                current_idx = start_idx + idx
                if current_idx < total_reviews:
                    topic = assignment.get('topic', 'Unknown')
                    sentiment = assignment.get('sentiment', 'Unknown')

                    df.at[current_idx, 'Topic'] = topic
                    df.at[current_idx, 'Sentiment'] = sentiment

        except json.JSONDecodeError as e:
            print(f"\nError parsing JSON for batch starting at index {start_idx}: {str(e)}")
            print(f"Raw response: {result}")
            continue
        except Exception as e:
            print(f"\nUnexpected error processing batch starting at index {start_idx}: {str(e)}")
            continue

    return df


In [45]:
amazon_reviews_with_topics = assign_topics_in_batches(
    amazon_reviews,
    topics=str(proper_topics),
    batch_size=5,
    llm=llm
)

Assigning topics:   0%|          | 1/2000 [00:06<3:22:40,  6.08s/it]


Error parsing JSON for batch starting at index 0: Expecting value: line 1 column 299 (char 298)
Raw response: [{"topic": "User Experience", "sentiment": "Negative"}, {"topic": "Product Quality", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Neutral"},] 



Assigning topics:   1%|          | 23/2000 [02:11<3:00:02,  5.46s/it]


Error parsing JSON for batch starting at index 110: Expecting value: line 1 column 284 (char 283)
Raw response: [{"topic": "Durability", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Positive"}, {"topic": "Learning Curve", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Positive"},] 



Assigning topics:   7%|▋         | 135/2000 [13:54<2:57:49,  5.72s/it]


Error parsing JSON for batch starting at index 670: Expecting value: line 1 column 240 (char 239)
Raw response: [{"topic": "Customer Experience", "sentiment": "Positive"}, {"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Neutral"}, {"topic": "Sound & Performance", "sentiment": "Positive"},  ] 



Assigning topics:   9%|▉         | 175/2000 [18:09<2:49:06,  5.56s/it]


Error parsing JSON for batch starting at index 870: Expecting value: line 1 column 220 (char 219)
Raw response: [{"topic": "Technical Aspects", "sentiment": "Negative"}, {"topic": "Price & Value", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Negative"}, {"topic": "Aesthetics", "sentiment": "Positive"},] 



Assigning topics:   9%|▉         | 184/2000 [19:06<3:08:14,  6.22s/it]


Error parsing JSON for batch starting at index 915: Expecting value: line 1 column 226 (char 225)
Raw response: [{"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Durability", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Positive"}, {"topic": "Customer Experience", "sentiment": "Negative"},  ] 



Assigning topics:   9%|▉         | 185/2000 [19:13<3:11:17,  6.32s/it]


Error parsing JSON for batch starting at index 920: Expecting value: line 1 column 283 (char 282)
Raw response: [{"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Aesthetics", "sentiment": "Positive"}, {"topic": "Durability", "sentiment": "Neutral"},] 



Assigning topics:  10%|▉         | 191/2000 [19:50<3:05:12,  6.14s/it]


Error parsing JSON for batch starting at index 950: Expecting value: line 1 column 232 (char 231)
Raw response: [{"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Durability", "sentiment": "Negative"}, {"topic": "Size & Portability", "sentiment": "Neutral"}, {"topic": "Functionality & Usability", "sentiment": "Positive"},] 



Assigning topics:  11%|█▏        | 225/2000 [23:36<3:00:32,  6.10s/it]


Error parsing JSON for batch starting at index 1120: Expecting value: line 1 column 237 (char 236)
Raw response: [{"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Negative"},  ] 



Assigning topics:  12%|█▏        | 238/2000 [24:55<2:56:37,  6.01s/it]


Error parsing JSON for batch starting at index 1185: Expecting value: line 1 column 241 (char 240)
Raw response: [{"topic": "Technical Aspects", "sentiment": "Negative"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Neutral"}, {"topic": "Customer Experience", "sentiment": "Positive"}, ] 



Assigning topics:  13%|█▎        | 251/2000 [26:05<2:32:28,  5.23s/it]


Error parsing JSON for batch starting at index 1250: Expecting value: line 1 column 226 (char 225)
Raw response: [{"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Durability", "sentiment": "Negative"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Positive"},  ] 



Assigning topics:  14%|█▍        | 284/2000 [29:24<2:42:19,  5.68s/it]


Error parsing JSON for batch starting at index 1415: Expecting value: line 1 column 229 (char 228)
Raw response: [{"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Aesthetics", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Neutral"}, {"topic": "Product Quality", "sentiment": "Negative"},] 



Assigning topics:  19%|█▉        | 387/2000 [39:48<2:50:41,  6.35s/it]


Error parsing JSON for batch starting at index 1930: Expecting value: line 1 column 225 (char 224)
Raw response: [{"topic": "Aesthetics", "sentiment": "Positive"}, {"topic": "Durability", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Negative"}, {"topic": "Functionality & Usability", "sentiment": "Positive"},] 



Assigning topics:  20%|█▉        | 399/2000 [41:02<2:36:44,  5.87s/it]


Error parsing JSON for batch starting at index 1990: Expecting value: line 1 column 231 (char 230)
Raw response: [{"topic": "Product Quality", "sentiment": "Positive"}, {"topic": "Price & Value", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "User Experience", "sentiment": "Negative"},  ] 



Assigning topics:  21%|██        | 419/2000 [43:20<2:31:32,  5.75s/it]


Error parsing JSON for batch starting at index 2090: Expecting value: line 1 column 226 (char 225)
Raw response: [{"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Positive"}, {"topic": "Price & Value", "sentiment": "Positive"}, {"topic": "Durability", "sentiment": "Negative"},  ] 



Assigning topics:  25%|██▍       | 498/2000 [52:04<2:50:12,  6.80s/it]


Error parsing JSON for batch starting at index 2485: Expecting value: line 1 column 293 (char 292)
Raw response: [{"topic": "Technical Aspects", "sentiment": "Negative"}, {"topic": "Functionality & Usability", "sentiment": "Negative"}, {"topic": "Size & Portability", "sentiment": "Negative"}, {"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Positive"},  ] 



Assigning topics:  26%|██▌       | 511/2000 [53:27<2:42:53,  6.56s/it]


Error parsing JSON for batch starting at index 2550: Expecting value: line 1 column 237 (char 236)
Raw response: [{"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Customer Experience", "sentiment": "Negative"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Price & Value", "sentiment": "Positive"},] 



Assigning topics:  30%|███       | 601/2000 [1:02:46<2:24:20,  6.19s/it]


Error parsing JSON for batch starting at index 3000: Expecting value: line 1 column 290 (char 289)
Raw response: [{"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Aesthetics", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Negative"}, {"topic": "Functionality & Usability", "sentiment": "Neutral"}, {"topic": "Sound & Performance", "sentiment": "Negative"},  ] 



Assigning topics:  32%|███▏      | 635/2000 [1:06:19<2:20:28,  6.17s/it]


Error parsing JSON for batch starting at index 3170: Expecting value: line 1 column 277 (char 276)
Raw response: [{"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Aesthetics", "sentiment": "Positive"}, {"topic": "Price & Value", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Positive"},] 



Assigning topics:  36%|███▋      | 728/2000 [1:15:47<1:57:00,  5.52s/it]


Error parsing JSON for batch starting at index 3635: Expecting value: line 1 column 228 (char 227)
Raw response: [{"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Positive"}, {"topic": "Size & Portability", "sentiment": "Positive"},] 



Assigning topics:  40%|████      | 802/2000 [1:23:51<1:58:52,  5.95s/it]


Error parsing JSON for batch starting at index 4005: Expecting value: line 1 column 238 (char 237)
Raw response: [{"topic": "Size & Portability", "sentiment": "Negative"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Negative"},] 



Assigning topics:  49%|████▊     | 973/2000 [1:41:04<1:30:12,  5.27s/it]


Error parsing JSON for batch starting at index 4860: Expecting value: line 1 column 226 (char 225)
Raw response: [{"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Durability", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Negative"},] 



Assigning topics:  50%|█████     | 1000/2000 [1:43:51<1:47:18,  6.44s/it]


Error parsing JSON for batch starting at index 4995: Expecting value: line 1 column 288 (char 287)
Raw response: [{"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Positive"}, {"topic": "Price & Value", "sentiment": "Negative"},] 



Assigning topics:  51%|█████     | 1023/2000 [1:46:24<1:36:17,  5.91s/it]


Error parsing JSON for batch starting at index 5110: Expecting value: line 1 column 278 (char 277)
Raw response: [{"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Learning Curve", "sentiment": "Neutral"}, {"topic": "Compatibility", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Negative"},  ] 



Assigning topics:  52%|█████▏    | 1045/2000 [1:48:44<1:28:06,  5.54s/it]


Error parsing JSON for batch starting at index 5220: Expecting value: line 1 column 238 (char 237)
Raw response: [{"topic": "Sound & Performance", "sentiment": "Negative"}, {"topic": "Technical Aspects", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Comfort & Design", "sentiment": "Negative"},] 



Assigning topics:  53%|█████▎    | 1066/2000 [1:51:10<1:40:36,  6.46s/it]


Error parsing JSON for batch starting at index 5325: Expecting value: line 1 column 295 (char 294)
Raw response: [{"topic": "Customer Experience", "sentiment": "Negative"}, {"topic": "Product Quality", "sentiment": "Negative"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Technical Aspects", "sentiment": "Negative"}, {"topic": "Functionality & Usability", "sentiment": "Neutral"},] 



Assigning topics:  61%|██████    | 1218/2000 [2:07:04<1:16:27,  5.87s/it]


Error parsing JSON for batch starting at index 6085: Expecting ':' delimiter: line 1 column 316 (char 315)
Raw response: [{"topic": "Brand Perception", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Neutral"}, {"topic": "Customer Experience", "sentiment": "Negative"}, {"topic": "Shipping & Delivery", "sentiment": "Positive", "Usage Experience", "sentiment": "Positive"}] 



Assigning topics:  64%|██████▍   | 1282/2000 [2:13:39<1:05:48,  5.50s/it]


Error parsing JSON for batch starting at index 6405: Expecting value: line 1 column 292 (char 291)
Raw response: [{"topic": "User Experience", "sentiment": "Positive"}, {"topic": "Price & Value", "sentiment": "Positive"}, {"topic": "Sound & Performance", "sentiment": "Positive"}, {"topic": "Functionality & Usability", "sentiment": "Positive"}, {"topic": "Customer Experience", "sentiment": "Negative"},] 



Assigning topics:  69%|██████▉   | 1387/2000 [2:24:55<1:03:04,  6.17s/it]


Error parsing JSON for batch starting at index 6930: Expecting value: line 1 column 271 (char 270)
Raw response: [{"topic": "Technical Aspects", "sentiment": "Positive"}, {"topic": "Product Quality", "sentiment": "Negative"}, {"topic": "Durability", "sentiment": "Negative"}, {"topic": "Sound & Performance", "sentiment": "Negative"}, {"topic": "Aesthetics", "sentiment": "Neutral"},] 



Assigning topics:  73%|███████▎  | 1463/2000 [2:33:05<56:11,  6.28s/it]  


KeyboardInterrupt: 

In [46]:
amazon_reviews_with_topics

NameError: name 'amazon_reviews_with_topics' is not defined

In [14]:
amazon_reviews_with_topics.to_csv('outputs/amazon_reviews_with_topics_10000.csv', index=False)

In [15]:
amazon_reviews_with_topics['Topic'].value_counts()

Topic
Learning Experience    44
User Satisfaction      19
Product Quality         6
Unknown                 1
Name: count, dtype: int64

In [19]:
amazon_reviews_with_topics.to_excel('outputs/amazon_reviews_with_topics.xlsx', index=False)

In [20]:
amazon_reviews_with_topics['Sentiment'].value_counts()

Sentiment
Positive    58
Negative     7
Neutral      4
Unknown      1
Name: count, dtype: int64

In [21]:
amazon_reviews_with_topics[amazon_reviews_with_topics['Sentiment'] != 'Positive']


Unnamed: 0,Reviews,Ratings,Topic,Sentiment
11,Not really current.,3.0,Product Quality,Negative
14,Good reference book,5.0,Product Quality,Neutral
17,I'm only giving this book three stars because ...,3.0,Learning Experience,Negative
23,still can't crochet to save my life,4.0,Learning Experience,Negative
26,:-),5.0,Learning Experience,Neutral
45,Great book I just wish the pics were more in c...,5.0,Product Quality,Negative
47,I TRIED BUT I AM BETTER AT KNITTING,5.0,Learning Experience,Negative
52,still learning but having fun doing it,5.0,Learning Experience,Neutral
62,Thanks,5.0,User Satisfaction,Neutral
67,I have never really used it. I don't really ge...,2.0,User Satisfaction,Negative
