In [1]:
from huggingface_hub import login
import pandas as pd
from transformers import LlamaTokenizer, AutoModelForCausalLM
import torch
import ipywidgets as widgets
import re
import random
from IPython.display import display

# Log in to Hugging Face with your token
huggingface_token = "Add your token here"
login(token=huggingface_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
# install ipwidgets if needed
#! pip install ipywidgets


[0m

In [3]:
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Use LlamaTokenizer for Mistral models, as Mistral is compatible with LLaMA tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure the model is loaded onto the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32768, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

In [4]:
torch.cuda.empty_cache()

In [5]:
# Load the CSV file and drop unnecessary columns
csv_file = "filtered_data_predictions_clusters.csv"
df = pd.read_csv(csv_file)
df = df.drop(columns=["brand", "categories", "primaryCategories", "reviews.date"])

In [6]:
# Define cluster names for user-friendly selection
cluster_names = {
    0: "Smart Speakers",
    1: "Pet Supplies & AAA Batteries",
    2: "Fire Tablets & Streaming Devices with Alexa",
    3: "Fire Tablets - Standard Editions",
    4: "Fire Tablets - Kids Editions",
    5: "Fire Tablets with Alexa",
    6: "Alexa Devices & Accessories",
    7: "AA Batteries",
    8: "Kindle E-Readers & Accessories",
    9: "Echo Devices - Various Generations"
}

# Add cluster name column
df['category_name'] = df['cluster'].map(cluster_names)

In [8]:
#specific promptreview summaries
review_prompt_template = "Provide a two-sentence summary of the main strengths and weaknesses for this product based on user reviews: {}"

In [36]:
def top_product_summaries(results):
    print("\n Here are the top products of the category:")
    print("=" * 60)

    for i, entry in enumerate(results, 1):
        print(f"\nProduct Name: {entry['product_name']}")
        print(f"Title Summary:\n  {entry['title_summary']}")
        print(f"Review Summary:\n  {entry['review_summary']}")
        print("=" * 60)

In [10]:
def generate_summary(prompt, max_new_tokens=20, temperature=0.5, repetition_penalty=1.8, is_title=False):
    # Adjust parameters for titles
    if is_title:
        max_new_tokens = 50  # Increase slightly for more complete titles
        repetition_penalty = 1.8  # Reduce redundancy
        temperature=5

    # Generate title or summary
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_beams=3,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and return output
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()

In [11]:
# Synonyms for variety
synonyms = {
    "great": ["excellent", "outstanding", "impressive"],
    "good": ["satisfactory", "decent", "suitable"],
    "easy": ["straightforward", "simple", "user-friendly"],
    "awesome": ["fantastic", "remarkable", "incredible"],
    "price": ["cost", "value", "affordability"],
}
def post_process(text):
    # Patterns for prompt artifacts
    prompt_patterns = [
        r"^Create a one-sentence title summarizing.*:?\s*",
        r"^Provide a two-sentence summary.*:?\s*",
        r"^Focus on unique qualities only.*:?\s*",
        r"^Generate a 5-word title that summarizes the key.*:?\s",
        r"^.*summarizing.*:\s*",  # Remove 'summarizing' patterns
        r"^.*summary.*:\s*",      # Remove 'summary' patterns
        r"^.*qualities only.*:\s*",  # Remove 'qualities only' patterns
        r"^.*summarizes the key.*:\s*",  # Remove 'summarizes the key' patterns
        r"^Strengths:\s*",  # Remove 'Strengths:' leading label
        r"^Weaknesses:\s*",  # Remove 'Weaknesses:' leading label
        r"\s*Strengths:\s*",  # Remove 'Strengths:' inline label
        r"\s*Example:\s*"  
        r"\s*Review:\s*"
        r"^Examples?:\s*",          # Lines starting with "Example:" or "Examples:"
        r"^- Review\s*:\s*",        # Lines starting with "- Review:"
        r"^Main Weaknesses?:\s*",   # Lines starting with "Main Weaknesses:"
        r"^\d+\.",                  # Removes any numbered lists at the start of a line (e.g., "1.")
        r"^-",                      # Removes lines starting with a dash or hyphen
        r"^\s*"                     # Remove extra whitespace-only lines    
    ]
    
    # Remove initial prompt text and track changes
    for pattern in prompt_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE).strip()
    
    # Ensure text ends at the last complete sentence
    last_period_index = text.rfind(".")
    if last_period_index != -1:
        text = text[:last_period_index + 1]   
        
    # Split into sentences and filter out fragments
    sentences = re.split(r'(?<=\w[.!?])\s+', text)
    sentences = [s for s in sentences if s and len(s.split()) > 3]
    
    # Synonym replacement for variety
    for i, sentence in enumerate(sentences):
        for word, syn_list in synonyms.items():
            if word in sentence:
                synonym = random.choice(syn_list)
                sentence = re.sub(r'\b{}\b'.format(word), synonym, sentence, flags=re.IGNORECASE)
        sentences[i] = sentence
    
    # Add contextual phrases if applicable
    final_sentences = []
    for sentence in sentences:
        if "kids" in sentence.lower() or "children" in sentence.lower():
            sentence += " Ideal for young users due to its durable and easy-to-use design."
        elif "waterproof" in sentence.lower() or "beach" in sentence.lower():
            sentence += " Perfect for reading outdoors or by the pool."
        final_sentences.append(sentence)
    
    # Recombine the cleaned and enhanced sentences
    processed_text = " ".join(final_sentences)
    
    return processed_text

In [12]:
def post_process_title(text):
    # Locate the last occurrence of "Title:" and keep everything that follows
    last_title_index = text.rfind("Title:")
    if last_title_index != -1:
        text = text[last_title_index + len("Title:"):].strip()

    # Split text into lines for more precise line-by-line processing
    lines = text.splitlines()
    
    # Patterns to remove unwanted artifacts from the title section
    prompt_patterns = [
        r"^Examples?:\s*$",              # Removes "Example:" or "Examples:" lines
        r"^- Review\s*:\s*$",            # Removes "- Review:" lines
        r"^Main Weaknesses?:\s*$",       # Removes "Main Weaknesses:" lines
        r"^Strengths?:\s*$",             # Removes "Strengths:" lines
        r"^\d+\.\s*$",                   # Removes any numbered lists (e.g., "1. ")
        r"^-\s*$",                       # Removes lines starting with "-"
        r"^\s*$"                         # Removes empty lines
    ]

    # Remove lines that match any pattern
    cleaned_lines = []
    for line in lines:
        match = False
        for pattern in prompt_patterns:
            if re.match(pattern, line.strip(), re.IGNORECASE):
                match = True
                break
        if not match:
            cleaned_lines.append(line.strip())

    # Join the cleaned lines back into a single text block
    processed_text = " ".join(cleaned_lines).strip()

    # Remove any quotes and make sure to end with a complete sentence
    processed_text = processed_text.strip('\'"')
    processed_text = processed_text.rsplit('.', 1)[0] + '.'  # Ensure it ends with a complete sentence.

    return processed_text.strip()


In [13]:
def generate_summary_again(review_text, max_new_tokens=80):
    # Set different generation parameters for more diversity and flexibility
    temperature = 1.0  # Higher temperature for more varied responses
    repetition_penalty = 1.2  # Lower penalty to allow a bit of repetition for emphasis

    inputs = tokenizer(review_text, return_tensors="pt", truncation=True).to(device)
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        num_beams=4,  # Increase beams for more options
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()

In [37]:
# Create a dropdown widget for category selection
dropdown = widgets.Dropdown(
    options=[(name, idx) for idx, name in cluster_names.items()],
    description='Category:',
    style={'description_width': 'initial'},
    layout={'width': 'max-content'}
)

# Display the dropdown and retrieve the selected category
display(dropdown)

# Function to get selected category after user selects from dropdown
def on_category_selected(change):
    selected_category = change['new']
    selected_category_name = cluster_names[selected_category]
    print("Dropdown selected value:", change['new'])  # Debugging line
    
    # Filter data for selected category
    category_df = df[df['category_name'] == selected_category_name]
    
    # Implement filtering logic for top products based on your criteria
    filtered_df = category_df[
        (category_df['reviews.rating'].isin([1, 2, 3, 4, 5])) &
        (category_df['predicted_sentiment'] == category_df['sentiment']) &
        (
            ((category_df['reviews.rating'].isin([1, 2])) & (category_df['sentiment'] == 'negative')) |
            ((category_df['reviews.rating'] == 3) & (category_df['sentiment'] == 'neutral')) |
            ((category_df['reviews.rating'].isin([4, 5])) & (category_df['sentiment'] == 'positive'))
        )
    ]
    # Sort by appearances and rating, then select top 3 unique products by name
    top_products = filtered_df.sort_values(
        by=['appearances', 'reviews.rating'], 
        ascending=[False, False]
    ).drop_duplicates(subset='name').head(3)
    
    # Generate summaries for the top products
    results = []
    for _, row in top_products.iterrows():
        product_name = row["name"]
        combined_text = row["reviews.title"] + " " + row["reviews.text"]
        review_prompt = review_prompt_template.format(combined_text[:150])
        review_summary = generate_summary(review_prompt, max_new_tokens=60)
        review_summary = post_process(review_summary)

        # Check if summary is empty and regenerate if needed
        if not review_summary:
            # Use raw reviews text for regeneration to get a fresh perspective
            review_prompt_2 = review_prompt_template.format(combined_text[150:300])
            #review_text = row["reviews.text"][:500]  # Use a larger chunk of raw text
            review_summary = generate_summary(review_prompt_2, max_new_tokens=80)
            print(product_name)
            # Optional: Further post-process the regenerated summary if needed
            review_summary = post_process(review_summary)

        # Use an example-enhanced prompt for title generation
        example_titles = """
        Examples:
        - Review: "This tablet has amazing battery life and is very user-friendly."
        Title: "Long Battery Life and Easy to Use"
        - Review: "The Kindle Fire is perfect for kids, durable, and affordable."
        Title: "Kid-Friendly, Durable, and Affordable"
        - Review: "Echo Show's sound quality is fantastic, but privacy concerns remain."
        Title: "Great Sound, But Privacy Concerns"
        """

        title_prompt_from_text = f"""{example_titles}
        Create a concise and catchy title that captures the main idea of this review: {review_summary} Title:
        """
        # Generate title based on the review summary with examples in the prompt
        title_summary_raw = generate_summary(title_prompt_from_text, is_title=True)
        #print("Title Before post-processing:", title_summary_raw)

        title_summary = post_process_title(title_summary_raw)
        #print("Title after post-processing:", title_summary)

        # Store results
        results.append({
            "product_name": product_name,
            "title_summary": title_summary,
            "review_summary": review_summary
        })
    top_product_summaries(results)

# Link the dropdown selection to the function
dropdown.observe(on_category_selected, names='value')


Dropdown(description='Category:', layout=Layout(width='max-content'), options=(('Smart Speakers', 0), ('Pet Su…

Dropdown selected value: 4
fire kids edition tablet 7 display wifi 16 gb pink kidproof case

 Here are the top products of the category:

Product Name: fire kids edition tablet 7 display wifi 16 gb pink kidproof case
Title Summary:
  Travel-Friendly, Kid-Proof Tablet.
Review Summary:
  The main strength of this product, according to user reviews, is that it is loved by 2-year-olds, making it a excellent tablet for kids when traveling. Ideal for young users due to its durable and easy-to-use design.

Product Name: fire kids edition tablet 7 display wifi 16 gb pink kidproof case
Title Summary:
  User-Friendly and Durable: The Perfect E-Reader for Kids.
Review Summary:
  Based on user reviews, the main strength of the Kindle is its user-friendly interface, making it an ideal choice for children or those new to e-readers. Ideal for young users due to its durable and easy-to-use design.

Product Name: fire kids edition tablet 7 display wifi 16 gb pink kidproof case
Title Summary:
  A Perfec