**NLP - Extracting product info from texts with LLM**

- *Karina Tiurina*
- *Salveen Dutt*
- *Patryk Prusak*

Comparison of various NLP models in the task of classifying reviews to specific product types, extracting product keywords and attributes.


Metrics used:

1. Smith-Waterman
2. Needleman-Wunsch
3. Levenshtein Distance
4. Cosine Similarity
5. Bert Score
6. Custom Metric

Models used:

1. Qwen2.5-1.5B
2. gemma-2-2b
3. Llama-3.2-3B-Instruct

#### Inputs & Consts

In [1]:
import json
from bert_score import score
from transformers import pipeline
import accelerate
import bitsandbytes
import torch
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import Levenshtein
from Bio import pairwise2
from scipy.spatial.distance import cosine

from sklearn.metrics.pairwise import cosine_similarity

sns.set_theme(palette="cubehelix")

  from .autonotebook import tqdm as notebook_tqdm


#### Define data

In [2]:
with open("products.json", "r") as file:
    products = json.load(file)

In [3]:
products["products"]["Electric Bike"]["reviews"][0]["review_content"]

'The main thing: on the battery itself, driving calmly, not exceeding 30 km/h, you can do 60-70km.'

#### Comparison metrics

In [4]:
def compare_product_review_similarity(original_data, llm_answer):
    """
    Calculates a similarity score between provided data and a llm answer based on
    categories, brands, and keywords, including a comparison of full product
    title with review information using BERTScore.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if (
        original_data["golden_answer"]["product_category"]
        == llm_answer["product_category"]["type"]
    ):
        similarity_score += 0.5

    # String Comparison (BERTScore) between Product Title and Review Data
    review_info_string = " ".join(
        [
            original_data["golden_answer"]["product_category"],
            " ".join(original_data["golden_answer"]["other_keywords"]),
        ]
    )

    product_info_string = " ".join(
        [
            llm_answer["product_category"]["type"],
            " ".join(llm_answer["other_keywords"]),
        ]
    )

    P, R, F1 = score(
        [product_info_string],
        [review_info_string],
        lang="en",
        model_type="bert-base-uncased",
        verbose=False,
    )

    print(F1.mean().item())
    similarity_score += F1.mean().item() * 0.2

    return round(min(1, similarity_score) * 100)


def bert_score(original_data, llm_answer):
    """
    Calculates the BERTScore between provided data and a llm answer based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if (
        original_data["golden_answer"]["product_category"]
        == llm_answer["product_category"]["type"]
    ):
        similarity_score += 0.5

    # Keyword Matching (Lowest weight)
    review_keywords = original_data["golden_answer"]["other_keywords"]
    product_keywords = llm_answer["other_keywords"]

    if not review_keywords or not product_keywords:
        return 0

    # Calculate BERTScore
    P, R, F1 = score(
        ["".join(review_keywords)],
        ["".join(product_keywords)],
        lang="en",
        model_type="bert-base-uncased",
        verbose=False,
    )

    similarity_score += F1.mean().item() * 0.2

    return round(min(1, similarity_score) * 100)


def cosine_similarity_score(original_data, llm_answer):
    """
    Calculates the cosine similarity between provided data and a llm answer based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if (
        original_data["golden_answer"]["product_category"]
        == llm_answer["product_category"]["type"]
    ):
        similarity_score += 0.5

    # Keyword Matching (Lowest weight)
    review_keywords = original_data["golden_answer"]["other_keywords"]
    product_keywords = llm_answer["other_keywords"]

    # Attribute Matching (Medium weight)
    review_attributes = original_data["golden_answer"]["product_attributes"]
    product_attributes = llm_answer["product_attributes"]

    if not review_keywords or not product_keywords:
        return 0

    similarity_score += (
        1 - cosine(" ".join(review_keywords), " ".join(product_keywords))
    ) * 0.2 + (
        1 - cosine(" ".join(review_attributes), " ".join(product_attributes))
    ) * 0.3

    return round(min(1, similarity_score) * 100)


def levenshtein_distance(original_data, llm_answer):
    """
    Calculates the Levenshtein distance between provided data and a llm answer based on
    categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if (
        original_data["golden_answer"]["product_category"]
        == llm_answer["product_category"]["type"]
    ):
        similarity_score += 0.5

    # Keyword Matching (Lowest weight)
    review_keywords = original_data["golden_answer"]["other_keywords"]
    product_keywords = llm_answer["other_keywords"]

    # Attribute Matching (Medium weight)
    review_attributes = original_data["golden_answer"]["product_attributes"]
    product_attributes = llm_answer["product_attributes"]

    if not review_keywords or not product_keywords:
        return 0

    similarity_score += (
        Levenshtein.distance(" ".join(review_keywords), " ".join(product_keywords))
        * 0.2
    )
    +(
        Levenshtein.distance(" ".join(review_attributes), " ".join(product_attributes))
        * 0.3
    )

    return round(min(1, similarity_score) * 100)


def needleman_wunsch_similarity(seq1, seq2):
    """
    Calculates the Needleman-Wunsch similarity between two sequences.
    """
    alignments = pairwise2.align.globalxx(seq1, seq2)
    max_score = max(alignment.score for alignment in alignments)
    return max_score


def needleman_wunsch(original_data, llm_answer):
    """
    Calculates the Needleman-Wunsch similarity between provided data and a llm answer
    based on categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if (
        original_data["golden_answer"]["product_category"]
        == llm_answer["product_category"]["type"]
    ):
        similarity_score += 0.5

    # Keyword Matching (Lowest weight)
    review_keywords = original_data["golden_answer"]["other_keywords"]
    product_keywords = llm_answer["other_keywords"]

    review_attributes = original_data["golden_answer"]["product_attributes"]
    product_attributes = llm_answer["product_attributes"]

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Needleman-Wunsch Similarity
    similarity_score += (
        needleman_wunsch_similarity(
            " ".join(review_keywords), " ".join(product_keywords)
        )
        * 0.2
    )
    +(
        needleman_wunsch_similarity(
            " ".join(review_attributes), " ".join(product_attributes)
        )
        * 0.3
    )

    return round(min(1, similarity_score) * 100)


def smith_waterman_similarity(seq1, seq2):
    """
    Calculates the Smith-Waterman similarity between two sequences.
    """
    match = 2
    mismatch = -1
    gap = -1

    # Initialize the scoring matrix
    m, n = len(seq1), len(seq2)
    score_matrix = [[0] * (n + 1) for _ in range(m + 1)]
    max_score = 0

    # Fill the scoring matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if seq1[i - 1] == seq2[j - 1]:
                score = match
            else:
                score = mismatch
            score_matrix[i][j] = max(
                0,
                score_matrix[i - 1][j - 1] + score,
                score_matrix[i - 1][j] + gap,
                score_matrix[i][j - 1] + gap,
            )
            max_score = max(max_score, score_matrix[i][j])

    return max_score


def smith_waterman(original_data, llm_answer):
    """
    Calculates the Smith-Waterman similarity between provided data and a llm answer
    based on categories, brands, and keywords.
    """

    similarity_score = 0

    # Category Matching (Highest weight)
    if (
        original_data["golden_answer"]["product_category"]
        == llm_answer["product_category"]["type"]
    ):
        similarity_score += 0.5

    # Keyword Matching (Lowest weight)
    review_keywords = original_data["golden_answer"]["other_keywords"]
    product_keywords = llm_answer["other_keywords"]

    # Attribute Matching (Medium weight)
    review_attributes = original_data["golden_answer"]["product_attributes"]
    product_attributes = llm_answer["product_attributes"]

    if not review_keywords or not product_keywords:
        return 0

    # Calculate Smith-Waterman Similarity
    similarity_score += (
        smith_waterman_similarity(review_keywords, product_keywords) * 0.2
    )
    +(smith_waterman_similarity(review_attributes, product_attributes) * 0.3)

    return round(min(1, similarity_score) * 100)

#### Execution

In [5]:
# define prompt template
prompt_template = """
You are an assistant, helping in understanding of reviews. Carefully read the review:
{content}

Return json format with the following JSON schema:

{{
        "product_category": {{
            "type": "string",
            "enum": ["Electric bicycle", "Refrigirator", "The Blocks", "Others"]
        }},
        "product_attributes": {{
            "type": "array",
            "items": {{
                "type": "string"
            }}
        }},
        "other_keywords": {{
            "type": "array",
            "items": {{
                "type": "string"
            }}
        }},

}}
"""

In [6]:
# partly inspired by bioinformatics (https://en.wikipedia.org/wiki/Sequence_alignment)
similarity_metrics = [
    smith_waterman,
    needleman_wunsch,
    levenshtein_distance,
    cosine_similarity,
    bert_score,
    compare_product_review_similarity,
]

In [7]:
# Inspiration for model selection taken from https://huggingface.co/collections/open-llm-leaderboard/open-llm-leaderboard-best-models-652d6c7965a4619fb5c27a03

models = [
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-1.5B",
    "google/gemma-2-9b",
    "google/gemma-2-2b",
    "meta-llama/Llama-3.2-3B-Instruct",
]

In [20]:
responses = []
nlp = None

In [None]:
for model in models:
    try:

        # load in 4bit greatly reduces the memory usage
        nlp = pipeline(
            "text-generation",
            model=model,
            model_kwargs={
                "quantization_config": {
                    "load_in_4bit": True,
                },
            },
        )

    except Exception as e:
        print(f"Failed to load model {model}, error: {e}")
        continue
    for productType in products["products"].keys():
        print(f"Generating responses for {productType} reviews using {model}...")
        for review in products["products"][productType]["reviews"]:
            try:
                reviews_content = review["review_content"]

                prompt = prompt_template.format(content=reviews_content)
                messages = [
                    {"role": "user", "content": prompt},
                ]
                response = nlp(messages, max_new_tokens=1024, num_return_sequences=1)
                responses.append(
                    [response[0]["generated_text"][1]["content"], review, model]
                )

            except Exception as e:
                print(
                    f"Failed to generate response for {productType} review, error: {e}"
                )
                continue
        try:
            # the memory is probably freed up, but to be safe we delete the nlp object and empty the cache
            del nlp
            torch.cuda.empty_cache()
        except Exception as e:
            print(f"Failed to delete nlp object, error: {e}")
            continue

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.15s/it]


Generating responses for Electric Bike reviews using Qwen/Qwen2.5-7B-Instruct...




Generating responses for Refrigirator reviews using Qwen/Qwen2.5-7B-Instruct...
Generating responses for The LEGO reviews using Qwen/Qwen2.5-7B-Instruct...


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Generating responses for Electric Bike reviews using Qwen/Qwen2.5-1.5B...
Generating responses for Refrigirator reviews using Qwen/Qwen2.5-1.5B...
Generating responses for The LEGO reviews using Qwen/Qwen2.5-1.5B...
Failed to load model google/gemma-2-9b, error: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2-9b.
403 Client Error. (Request ID: Root=1-6727fb89-381b7ed509d6632e614f2d64;9c96183e-3242-4448-8473-f5ff1163e417)

Cannot access gated repo for url https://huggingface.co/google/gemma-2-9b/resolve/main/config.json.
Access to model google/gemma-2-9b is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2-9b to ask for access.
Failed to load model google/gemma-2-2b, error: name 'nlp' is not defined
Failed to load model meta-llama/Llama-3.2-3B-Instruct, error: name 'nlp' is not defined


In [22]:
with open("responses.pkl", "wb") as f:
    pickle.dump(responses, f)

In [9]:
for response in responses:
    # try:
    llm_response = response[0]
    if not isinstance(llm_response, dict):
        llm_response = json.loads(
            llm_response.replace("```json\n", "").replace("\n```", "")
        )
    score = [metric(response[1], llm_response) for metric in similarity_metrics]
    response.extend(score)
    # except Exception as e:
    #     print(f"Failed to calculate similarity score, error: {e}")
    #     continue

TypeError: float() argument must be a string or a real number, not 'dict'

#### Results

In [8]:
with open("responses.pkl", "rb") as f:
    responses = pickle.load(f)

In [25]:
columns = ["response", "review", "model"] + [
    metric.__name__ for metric in similarity_metrics
]
results_df = pd.DataFrame(responses, columns=columns)

ValueError: 9 columns passed, passed data had 3 columns

In [None]:
# To plot the barplots, we need to transform the results, each row should contain a model name, a metric name, and the similarity score
# Then plot x-axis as model name, y-axis as similarity score, and hue as metric name

# Transform the results#
results_transformed = results_df.melt(
    id_vars=["response", "review", "model"],
    var_name="similarity_metric",
    value_name="similarity_score",
)