**Extracting product info from texts with LLM**

Karina Tiurina

Salveen Dutt

Patryk Prusak

#### Inputs & Consts

In [58]:
import json
from bert_score import score
from transformers import pipeline
import accelerate
import bitsandbytes
import torch
import pickle

#### Define data

In [13]:
with open("products.json", "r") as file:
    products = json.load(file)

In [3]:
products["products"]["Electric Bike"]["reviews"][0]["review_content"]

'The main thing: on the battery itself, driving calmly, not exceeding 30 km/h, you can do 60-70km.'

#### Comparison metrics

In [None]:
def compare_product_review_similarity(review_data, product_data):
    """
    Calculates a similarity score between a review and a product based on
    categories, brands, and keywords, including a comparison of full product
    title with review information using BERTScore.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # String Comparison (BERTScore) between Product Title and Review Data
    review_info_string = " ".join(
        [
            review_data.get("product category"),
            review_data.get("brand"),
            " ".join(review_data.get("other keywords")),
        ]
    )

    product_info_string = " ".join(
        [
            product_data.get("product category"),
            product_data.get("brand"),
            " ".join(product_data.get("other keywords")),
        ]
    )

    P, R, F1 = score(
        [product_info_string],
        [review_info_string],
        lang="en",
        model_type="bert-base-uncased",
        verbose=False,
    )

    print(F1.mean().item())
    similarity_score += F1.mean().item() * 0.2

    return round(min(1, similarity_score) * 100)


def bert_score(review_data, product_data):
    """
    Calculates the BERTScore between a review and a product based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate BERTScore
    P, R, F1 = score(
        [review_keywords],
        [product_keywords],
        lang="en",
        model_type="bert-base-uncased",
        verbose=False,
    )

    similarity_score += F1.mean().item() * 0.2

    return round(min(1, similarity_score) * 100)


def cosine_similarity(review_data, product_data):
    """
    Calculates the cosine similarity between a review and a product based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Cosine Similarity
    similarity_score += (
        bitsandbytes.cosine_similarity(review_keywords, product_keywords) * 0.2
    )

    return round(min(1, similarity_score) * 100)


def levenshtein_distance(review_data, product_data):
    """
    Calculates the Levenshtein distance between a review and a product based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Levenshtein Distance
    similarity_score += (
        bitsandbytes.levenshtein_distance(review_keywords, product_keywords) * 0.2
    )

    return round(min(1, similarity_score) * 100)


def needleman_wunsch(review_data, product_data):
    """
    Calculates the Needleman-Wunsch similarity between a review and a product
    based on categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Needleman-Wunsch Similarity
    similarity_score += (
        bitsandbytes.needleman_wunsch(review_keywords, product_keywords) * 0.2
    )

    return round(min(1, similarity_score) * 100)


def smith_waterman(review_data, product_data):
    """
    Calculates the Smith-Waterman similarity between a review and a product
    based on categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if review_data.get("product category") == product_data.get("product category"):
        similarity_score += 0.5

    # Brand Matching (Medium weight)
    if review_data.get("brand") == product_data.get("brand"):
        similarity_score += 0.3

    # Keyword Matching (Lowest weight)
    review_keywords = review_data.get("other keywords")
    product_keywords = product_data.get("other keywords")

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Smith-Waterman Similarity
    similarity_score += (
        bitsandbytes.smith_waterman(review_keywords, product_keywords) * 0.2
    )

    return round(min(1, similarity_score) * 100)

In [None]:
similarity_metrics = [
    smith_waterman,
    needleman_wunsch,
    levenshtein_distance,
    cosine_similarity,
    bert_score,
    compare_product_review_similarity,
]

In [21]:
# define prompt template
prompt_template = """
You are an assistant, helping in understanding of reviews. Carefully read the review:
{content}

Return json format with the following JSON schema:

{{
        "product category": {{
            "type": "string",
            "enum": ["Electric bicycle", "Refrigirator", "The Blocks", "Others"]
        }},
        "brand": {{
            "type": "string" or N/A
        }},
        "other keywords": {{
            "type": "array",
            "items": {{
                "type": "string"
            }}
        }},

}}
"""

#### Execution

In [None]:
models = [
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-1.5B",
    "google/gemma-2-9b",
    "meta-llama/Llama-3.2-3B-Instruct",
]

In [60]:
responses = []

In [None]:
for model in models:
    try:
        nlp = pipeline(
            "text-generation",
            model=model,
            model_kwargs={
                "quantization_config": {
                    "load_in_4bit": True,
                },
            },
        )
    except Exception as e:
        print(f"Failed to load model {model}, error: {e}")
        continue
    for productType in products["products"].keys():
        for review in products["products"][productType]["reviews"]:
            try:
                reviews_content = review["review_content"]

                prompt = prompt_template.format(content=reviews_content)
                messages = [
                    {"role": "user", "content": prompt},
                ]
                response = nlp(messages, max_length=1024, num_return_sequences=1)

                responses.append(
                    [response[0]["generated_text"][1]["content"], review, model]
                )

            except Exception as e:
                print(
                    f"Failed to generate response for {productType} review, error: {e}"
                )
                continue

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.33s/it]


In [None]:
responses[0].replace("```json\n", "").replace("\n```", "")

['{\n    "product category": "Electric bicycle",\n    "brand": "N/A",\n    "other keywords": [\n        "battery performance",\n        "speed limit",\n        "distance range"\n    ]\n}',
 {'review_content': 'The main thing: on the battery itself, driving calmly, not exceeding 30 km/h, you can do 60-70km.',
  'golden_answer': {'product_category': 'Electic bicycle',
   'other_keywords': ['battery', 'calmly']}},
 'Qwen/Qwen2.5-7B-Instruct']

In [71]:
with open("responses.pkl", "wb") as f:
    pickle.dump(responses, f)

In [None]:
# for metric in similarity_metrics:
#     response_json = json.loads(response[0]['generated_text'][1]['content'].replace('```json\n', '').replace('\n```', ''))
#     # similarity_score = metric(response_json, review)
#     # responses[-1].append(similarity_score)

#### Results

In [None]:
# create a table from responses, maybe a couple barplots
# create a pandas df from responses