**NLP - Extracting product info from texts with LLM**

- *Karina Tiurina*
- *Salveen Dutt*
- *Patryk Prusak*


Metrics used:

1. Smith-Waterman
2. Needleman-Wunsch
3. Levenshtein Distance
4. Cosine Similarity
5. Bert Score
6. Custom Metric

Models used:

1. Qwen2.5-7B-Instruct
2. Qwen2.5-1.5B
3. gemma-2-9b
4. Llama-3.2-3B-Instruct

#### Inputs & Consts

In [86]:
import json
from bert_score import score
from transformers import pipeline
import accelerate
import bitsandbytes
import torch
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(palette="cubehelix")

#### Define data

In [87]:
with open("products.json", "r") as file:
    products = json.load(file)

In [88]:
products["products"]["Electric Bike"]["reviews"][0]["review_content"]

'The main thing: on the battery itself, driving calmly, not exceeding 30 km/h, you can do 60-70km.'

#### Comparison metrics

In [89]:
def compare_product_review_similarity(review_data, product_data):
    """
    Calculates a similarity score between a review and a product based on
    categories, brands, and keywords, including a comparison of full product
    title with review information using BERTScore.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # String Comparison (BERTScore) between Product Title and Review Data
    review_info_string = " ".join(
        [
            review_data.get("product category"),
            review_data.get("brand"),
            " ".join(review_data.get("other keywords")),
        ]
    )

    product_info_string = " ".join(
        [
            product_data.get("product category"),
            product_data.get("brand"),
            " ".join(product_data.get("other keywords")),
        ]
    )

    P, R, F1 = score(
        [product_info_string],
        [review_info_string],
        lang="en",
        model_type="bert-base-uncased",
        verbose=False,
    )

    print(F1.mean().item())
    similarity_score += F1.mean().item() * 0.2

    return round(min(1, similarity_score) * 100)


def bert_score(review_data, product_data):
    """
    Calculates the BERTScore between a review and a product based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate BERTScore
    P, R, F1 = score(
        [review_keywords],
        [product_keywords],
        lang="en",
        model_type="bert-base-uncased",
        verbose=False,
    )

    similarity_score += F1.mean().item() * 0.2

    return round(min(1, similarity_score) * 100)


def cosine_similarity(review_data, product_data):
    """
    Calculates the cosine similarity between a review and a product based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Cosine Similarity
    similarity_score += (
        bitsandbytes.cosine_similarity(review_keywords, product_keywords) * 0.2
    )

    return round(min(1, similarity_score) * 100)


def levenshtein_distance(review_data, product_data):
    """
    Calculates the Levenshtein distance between a review and a product based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Levenshtein Distance
    similarity_score += (
        bitsandbytes.levenshtein_distance(review_keywords, product_keywords) * 0.2
    )

    return round(min(1, similarity_score) * 100)


def needleman_wunsch(review_data, product_data):
    """
    Calculates the Needleman-Wunsch similarity between a review and a product
    based on categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Needleman-Wunsch Similarity
    similarity_score += (
        bitsandbytes.needleman_wunsch(review_keywords, product_keywords) * 0.2
    )

    return round(min(1, similarity_score) * 100)


def smith_waterman(review_data, product_data):
    """
    Calculates the Smith-Waterman similarity between a review and a product
    based on categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Smith-Waterman Similarity
    similarity_score += (
        bitsandbytes.smith_waterman(review_keywords, product_keywords) * 0.2
    )

    return round(min(1, similarity_score) * 100)

In [90]:
similarity_metrics = [
    smith_waterman,
    needleman_wunsch,
    levenshtein_distance,
    cosine_similarity,
    bert_score,
    compare_product_review_similarity,
]

In [91]:
# define prompt template
prompt_template = """
You are an assistant, helping in understanding of reviews. Carefully read the review:
{content}

Return json format with the following JSON schema:

{{
        "product category": {{
            "type": "string",
            "enum": ["Electric bicycle", "Refrigirator", "The Blocks", "Others"]
        }},
        "brand": {{
            "type": "string" or N/A
        }},
        "other keywords": {{
            "type": "array",
            "items": {{
                "type": "string"
            }}
        }},

}}
"""

#### Execution

In [92]:
models = [
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-1.5B",
    "google/gemma-2-9b",
    "meta-llama/Llama-3.2-3B-Instruct",
]

In [93]:
responses = []

In [None]:
for model in models:
    try:
        nlp = pipeline(
            "text-generation",
            model=model,
            model_kwargs={
                "quantization_config": {
                    "load_in_4bit": True,
                },
            },
        )
    except Exception as e:
        print(f"Failed to load model {model}, error: {e}")
        continue
    for productType in products["products"].keys():
        print(f"Generating responses for {productType} reviews using {model}...")
        for review in products["products"][productType]["reviews"]:
            try:
                reviews_content = review["review_content"]

                prompt = prompt_template.format(content=reviews_content)
                messages = [
                    {"role": "user", "content": prompt},
                ]
                response = nlp(messages, max_length=1024, num_return_sequences=1)

                responses.append(
                    [response[0]["generated_text"][1]["content"], review, model]
                )

            except Exception as e:
                print(
                    f"Failed to generate response for {productType} review, error: {e}"
                )
                continue

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:30<00:00,  7.68s/it]


Generating responses for Electric Bike reviews using Qwen/Qwen2.5-7B-Instruct...
Generating responses for Refrigirator reviews using Qwen/Qwen2.5-7B-Instruct...
Generating responses for The LEGO reviews using Qwen/Qwen2.5-7B-Instruct...


`low_cpu_mem_usage` was None, now default to True since model is quantized.
Both `max_new_tokens` (=2048) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generating responses for Electric Bike reviews using Qwen/Qwen2.5-1.5B...


In [None]:
json.parse(responses[0].replace("```json\n", "").replace("\n```", ""))

['{\n    "product category": "Electric bicycle",\n    "brand": "N/A",\n    "other keywords": [\n        "battery performance",\n        "speed limit",\n        "distance range"\n    ]\n}',
 {'review_content': 'The main thing: on the battery itself, driving calmly, not exceeding 30 km/h, you can do 60-70km.',
  'golden_answer': {'product_category': 'Electic bicycle',
   'other_keywords': ['battery', 'calmly']}},
 'Qwen/Qwen2.5-7B-Instruct']

In [None]:
with open("responses.pkl", "wb") as f:
    pickle.dump(responses, f)

In [None]:
for response in responses:
    try:
        llm_response = response[0]
        llm_response = json.parse(llm_response.replace("```json\n", "").replace("\n```", ""))
        score =[metric(llm_response, response[1]) for metric in similarity_metrics]
        response = response + score
    except Exception as e:
        print(f"Failed to calculate similarity score, error: {e}")
        continue

#### Results

In [None]:
columns = ["response", "review", "model"] + [metric.__name__ for metric in similarity_metrics]
results_df = pd.DataFrame(responses, columns=columns)

In [None]:
# To plot the barplots, we need to transform the results, each row should contain a model name, a metric name, and the similarity score
# Then plot x-axis as model name, y-axis as similarity score, and hue as metric name


results_transformed = results_df.melt(
    id_vars=["response", "review", "model"],
    var_name="similarity_metric",
    value_name="similarity_score",
)