In [1]:
import os
from configparser import ConfigParser
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
from pymystem3 import Mystem

# import spacy
from spacy import load
# import spacy.cli

# spacy.cli.download("ru_core_news_sm")

In [2]:
# Absolute path of the project config file; "../config.ini" is resolved against
# the current working directory of the kernel.
CONFIG_PATH = os.path.abspath("../config.ini")
# Sanity check shown as the cell output: does the config file exist?
os.path.isfile(CONFIG_PATH)

True

In [3]:
# Parse the INI file located at CONFIG_PATH.
config = ConfigParser()
config.read(CONFIG_PATH)

# Section whose values are later passed as `base_url` for each LLM vendor.
llm_vendors = config["llm_vendors"]
# Location of the .env file (read from the [env] section), normalised to forward slashes.
env_file = Path(config["env"]["env_file"]).as_posix()


# Load the variables from the .env file into the process environment
# (API keys are later read with os.getenv).
load_dotenv(env_file)


True

In [4]:
# Absolute, forward-slash paths of the data folders, resolved against the
# current working directory of the kernel.
DATA_PATH = Path(os.path.abspath("../data/raw")).as_posix()  # raw input sheets
PROCESSED_DATA_PATH = Path(os.path.abspath("../data/processed")).as_posix()  # final metric tables
EXTERNAL_PATH = Path(os.path.abspath("../data/external")).as_posix()
INTERIM_PATH = Path(os.path.abspath("../data/interim")).as_posix()  # cached LLM / lemmatization output

In [5]:
class Prompt(BaseModel):
    """A single chat message exchanged with the language model.

    Attributes:
        role (str): Role of the message sender (system, user or assistant).
        content (str): Text of the message.
    """

    role: str = Field(description="Роль в диалоге (system, user, assistant)")
    content: str = Field(description="Содержание сообщения")

In [6]:
# Much worse! (presumably refers to the single-name schema below — kept for reference)

# class Company(BaseModel):
#     name: Optional[str] = Field(description="Company name")


# Structured-output schema passed as `response_format` to the LLM: the list of
# company names found in one text (may be None).
# NOTE(review): documented with comments rather than a class docstring on purpose —
# pydantic copies a class docstring into the generated JSON schema description,
# which would change the schema sent to the model.
class Companies(BaseModel):
    names: Optional[list[str]] = Field(description="Companies names")

In [7]:
import requests


def get_service_models(url: str, api_key: str, timeout: float = 30.0) -> list:
    """List the model identifiers offered by a provider.

    Sends ``GET <url>/models`` with the key as a Bearer token and collects the
    ``id`` of every entry found under the ``data`` key of the JSON response.

    Args:
        url (str): Base URL of the provider API, with or without a trailing slash.
        api_key (str): API key sent in the ``Authorization`` header.
        timeout (float, optional): Seconds to wait for the response before
            giving up. Defaults to 30.0.

    Returns:
        list: Model ids; an empty list if the request fails or the body is not
        valid JSON (the error is printed).
    """
    # Normalise the slash so both "https://host/v1" and "https://host/v1/" work.
    endpoint = url.rstrip("/") + "/models"
    try:
        response = requests.get(
            endpoint,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}",
            },
            # Without a timeout a stalled connection would block the kernel forever.
            timeout=timeout,
        )
        response.raise_for_status()
        models = response.json().get("data", [])
        return [model["id"] for model in models if "id" in model]
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors raised by older requests versions.
        print(f"Ошибка при получении списка моделей: {e}")
        return []

In [29]:
def get_chat_completion_parse(
    base_url: str,
    api_key: str,
    model: str,
    temperature: float = 0.3,
    messages: Optional[list[Prompt]] = None,
    max_tokens: int = 4096,
) -> list[str]:
    """Ask a chat model for the companies mentioned in a dialogue.

    The request uses structured output: the response is parsed into the
    ``Companies`` schema and its ``names`` list is returned. The frequency
    penalty is fixed at 0.9.

    Args:
        base_url (str): Base URL of the API.
        api_key (str): API key.
        model (str): Model identifier.
        temperature (float, optional): Sampling temperature. Defaults to 0.3.
        messages (Optional[list[Prompt]], optional): Dialogue messages.
            Defaults to None, in which case no request is made.
        max_tokens (int, optional): Maximum number of tokens to generate.
            Defaults to 4096.

    Returns:
        list[str]: Extracted company names. An empty list is returned when
        there are no messages, when the model returns nothing parseable, or
        when the request fails (the failure is logged as a warning).
    """
    import logging

    # Nothing to send: skip the network round trip.
    if not messages:
        return []

    client = OpenAI(api_key=api_key, base_url=base_url)
    try:
        completion = client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            response_format=Companies,
            temperature=temperature,
            max_tokens=max_tokens,
            frequency_penalty=0.9,
        )
        parsed = completion.choices[0].message.parsed
        # `parsed` is None on a refusal and `names` is Optional in the schema;
        # normalise both to an empty list so callers always receive a list.
        if parsed is None or parsed.names is None:
            return []
        return parsed.names
    except Exception as error:
        # Best-effort by design (one bad text must not abort a long run), but
        # leave a trace so a model that fails on every request is noticeable.
        logging.getLogger(__name__).warning(
            "LLM request for model %s failed: %s", model, error
        )
        return []

In [30]:
from time import sleep

from openai import RateLimitError
from tqdm import tqdm


def llm_ner(
    base_url="http://127.0.0.1:11434/v1",
    api_key="lm-studio",
    model="deepseek-r1-distill-qwen-32b",
    texts: Optional[list[str]] = None,
    pause_seconds: float = 1.0,
) -> list[list[str]]:
    """Extract company names from every text with an LLM.

    Each text is sent as a separate request made of a fixed system prompt and
    the text as the user message.

    Args:
        base_url (str, optional): Base URL of the API.
        api_key (str, optional): API key.
        model (str, optional): Model identifier.
        texts (Optional[list[str]], optional): Texts to process. Defaults to
            None, which yields an empty result.
        pause_seconds (float, optional): Pause before each request, in
            seconds. Defaults to 1.0.

    Returns:
        list[list[str]]: One list of company names per input text, in input
        order; an empty list for texts that produced no result.
    """
    # tqdm(None) raises TypeError, so handle the default explicitly.
    if texts is None:
        return []

    # NOTE(review): the adjacent literals below are concatenated without any
    # separator (the "" pieces add nothing), so the model receives e.g.
    # "Instructions:You are given ..." on a single line, and the text mentions a
    # "Company schema" while the response format is `Companies`. Left byte-identical
    # because cached results were produced with this exact prompt — confirm before changing.
    system_prompt = Prompt(
        role="system",
        content="Instructions:"
        "You are given one or more user review texts. Extract all unique company names (brands, organizations, firms) and output a valid JSON following the Company schema."
        'If no companies are found, "names" should be an empty list.'
        ""
        "Output Format:"
        "{"
        '"names": ["Company1", "Company2", ...]'
        "}"
        ""
        "Example:"
        "Input: \"I visited 'GastroMania' restaurant and got a gadget from 'TechWorld'.\""
        "Output:"
        "{"
        '"names": ["GastroMania", "TechWorld"]'
        "}"
        ""
        "Do not include any extra text. Begin processing with the following review:",
    )
    names = []
    for text in tqdm(texts):
        try:
            # Throttle requests between calls.
            sleep(pause_seconds)
            feedback = Prompt(role="user", content=text)
            llm_companies = get_chat_completion_parse(
                base_url=base_url,
                api_key=api_key,
                model=model,
                messages=[system_prompt, feedback],
            )
            # Guard against a None result so every row is a list.
            names.append(llm_companies or [])
        except RateLimitError:
            names.append([])
    return names

In [10]:
def prepare_dataframe(sheet_path):
    """Load the labelled sheet and normalise its columns.

    Args:
        sheet_path (str | Path): Path to an ``.xlsx`` or ``.csv`` file (the
            extension check is case-insensitive).

    Returns:
        pd.DataFrame: The sheet without the "Unnamed: 2/3/4" columns (if
        present), with the text column renamed to ``feedback`` and the label
        column renamed to ``companies_name``; missing labels become "---".

    Raises:
        ValueError: If the file extension is neither ``.xlsx`` nor ``.csv``.
    """
    # Path(...) accepts both str and Path, unlike str.endswith.
    suffix = Path(sheet_path).suffix.lower()
    if suffix == ".xlsx":
        df = pd.read_excel(sheet_path)
    elif suffix == ".csv":
        df = pd.read_csv(sheet_path)
    else:
        raise ValueError("Unsupported file format")

    return (
        # errors="ignore": the unnamed filler columns are not present in every file.
        df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
        .rename(
            columns={
                "отзывы (2-100)посты(100-150) НОВОСТЬ (ТЕКСТ)": "feedback",
                "Название компании упоминаемой": "companies_name",
            }
        )
        # "---" marks rows without any labelled company.
        .fillna({"companies_name": "---"})
    )

In [11]:
def save_ner_results(ner_companies: list[list[str]], model_name: str):
    """Cache the raw LLM extraction for one model as ``<model_name>.csv``.

    Args:
        ner_companies (list[list[str]]): Company names per text; a ``None``
            row is treated like an empty list.
        model_name (str): Model identifier, used as the file name.

    The file is written into ``INTERIM_PATH`` with a single column
    ``llm_companies`` holding the names of each row joined by ", ".
    """
    result = pd.DataFrame(
        # `comp or []`: the schema allows a missing names list, and join(None) would raise.
        [", ".join(comp or []) for comp in ner_companies],
        columns=["llm_companies"],
    )
    saved_path = Path(INTERIM_PATH) / f"{model_name}.csv"
    # Make sure the target folder exists on a fresh checkout.
    saved_path.parent.mkdir(parents=True, exist_ok=True)
    result.to_csv(saved_path, index=False)


def load_ner_results(model_name: str):
    """Read the cached LLM extraction of one model from ``INTERIM_PATH``.

    Args:
        model_name (str): Model identifier, used as the file name.

    Returns:
        pd.DataFrame: Contents of ``<model_name>.csv`` where missing values in
        the ``llm_companies`` column are replaced by "---".
    """
    csv_file = Path(INTERIM_PATH) / f"{model_name}.csv"
    return pd.read_csv(csv_file).fillna({"llm_companies": "---"})


In [12]:
def save_cleaned_results(companies: list[list[str]], model_name: str, method: str):
    """Cache cleaned lemma tokens as ``<model_name>_<method>_cleaned.csv``.

    Args:
        companies (list[list[str]]): Tokens per text; each row is stored as
            one space-separated string in the ``llm_companies`` column.
        model_name (str): Model identifier.
        method (str): Lemmatization method tag used in the file name.

    Returns:
        bool: True if the file exists in ``INTERIM_PATH`` after writing.
    """
    rows = [" ".join(tokens) for tokens in companies]
    target = Path(INTERIM_PATH) / f"{model_name}_{method}_cleaned.csv"
    pd.DataFrame(rows, columns=["llm_companies"]).to_csv(target, index=False)
    return os.path.isfile(target)


def load_cleaned_results(model_name: str, method: str) -> list[list[str]]:
    """Read cached lemma tokens written by ``save_cleaned_results``.

    Args:
        model_name (str): Model identifier.
        method (str): Lemmatization method tag used in the file name.

    Returns:
        list[list[str]]: For every row of the file, the first column split on
        whitespace into tokens (an empty cell gives an empty list).
    """
    saved_path = Path(INTERIM_PATH) / f"{model_name}_{method}_cleaned.csv"
    # Read everything as plain text: with the default NA handling a row such as
    # "nan", "null" or "" becomes a float NaN (and "123" an int), and .split()
    # would then raise AttributeError.
    comps_df = pd.read_csv(saved_path, dtype=str, keep_default_na=False)
    return [row.split() for row in comps_df.iloc[:, 0]]

In [13]:
def clean_lemmatized(lemmatized_companies: list[list[str]]) -> list[list[str]]:
    """Drop non-word tokens from lemmatizer output.

    A token is kept when it is purely alphanumeric or when it is exactly the
    placeholder followed by a newline ("---\\n"); newlines are stripped from
    the kept tokens. A row that ends up empty is replaced by ``["---"]``.

    Args:
        lemmatized_companies (list[list[str]]): Lemma tokens per text.

    Returns:
        list[list[str]]: Cleaned tokens per text, never an empty row.
    """
    placeholder_with_newline = "---\n"
    result = []
    for tokens in lemmatized_companies:
        kept = []
        for token in tokens:
            if token.isalnum() or token == placeholder_with_newline:
                kept.append(token.strip("\n"))
        result.append(kept if kept else ["---"])
    return result


In [14]:
def spacy_lem_companies(feedbacks: list[list[str]]):
    """Lemmatize company strings with the ``ru_core_news_sm`` spaCy pipeline.

    Args:
        feedbacks (list[list[str]]): One list of strings per text. Every
            string is run through the pipeline, but only the lemmas of the
            first string of each row are returned.

    Returns:
        list[list[str]]: Lemma of every token of the first string in each row.
    """
    nlp = load("ru_core_news_sm")
    lemmas = []
    for companies in feedbacks:
        per_doc = [[token.lemma_ for token in doc] for doc in nlp.pipe(companies)]
        lemmas.append(per_doc[0])
    return lemmas

In [15]:
def stem_lem_companies(feedbacks: list[list[str]]):
    """Lemmatize company strings with Mystem.

    Args:
        feedbacks (list[list[str]]): One list of strings per text; the strings
            of a row are joined with a single space before lemmatization.

    Returns:
        list: The output of ``Mystem.lemmatize`` for every row, in input order.
    """
    analyzer = Mystem()
    joined_rows = [" ".join(row) for row in feedbacks]
    return [analyzer.lemmatize(text) for text in joined_rows]

In [16]:
def jaccard_similarity(doc1, doc2):
    """Jaccard similarity |A ∩ B| / |A ∪ B| of two token collections.

    Args:
        doc1: Iterable of hashable tokens (duplicates are ignored).
        doc2: Iterable of hashable tokens (duplicates are ignored).

    Returns:
        float: Value in [0, 1]. Two empty collections are treated as identical
        and score 1.0 instead of raising ZeroDivisionError.
    """
    words_doc1 = set(doc1)
    words_doc2 = set(doc2)
    union = words_doc1 | words_doc2
    # Both inputs empty: the ratio is 0/0, use the usual convention J = 1.
    if not union:
        return 1.0
    return len(words_doc1 & words_doc2) / len(union)


def jaccard_feedbacks(ner_llm: list[list[str]], labels: list[list[str]]) -> float:
    """Mean Jaccard similarity between predictions and labels.

    Args:
        ner_llm (list[list[str]]): Predicted tokens per text.
        labels (list[list[str]]): Reference tokens per text, same length.

    Returns:
        float: Mean of the row-wise similarities, rounded to two decimals.
    """
    assert len(ner_llm) == len(labels), "Длины списков должны совпадать"
    per_row = [
        jaccard_similarity(predicted, reference)
        for predicted, reference in zip(ner_llm, labels)
    ]
    return np.round(np.mean(per_row), 2)


In [43]:
def save_jaccard_results(llm_jaccard) -> bool:
    """Merge the given scores into ``llm_jaccard.csv`` in ``PROCESSED_DATA_PATH``.

    Scores already stored in the file are kept, and entries for the same model
    are overwritten, so calling this repeatedly with a growing dict does not
    duplicate rows. The file is written in the same layout as
    ``DataFrame.to_csv(index=True)``: model name in the unnamed first column
    and a ``jaccard_metric`` column.

    Args:
        llm_jaccard (dict): Mapping of model name to Jaccard score.

    Returns:
        bool: True if the results file exists after writing.
    """
    results_path = Path(PROCESSED_DATA_PATH) / "llm_jaccard.csv"

    merged = {}
    if results_path.is_file() and results_path.stat().st_size > 0:
        # Read without a header so both layouts are understood: files written by
        # to_csv (with a header line) and files made of bare "model,score" lines.
        previous = pd.read_csv(
            results_path, header=None, names=["model", "jaccard_metric"]
        )
        previous["jaccard_metric"] = pd.to_numeric(
            previous["jaccard_metric"], errors="coerce"
        )
        # A header line has no model name and a non-numeric score: drop it.
        previous = previous.dropna(subset=["model", "jaccard_metric"])
        merged.update(zip(previous["model"], previous["jaccard_metric"]))

    merged.update({model: float(score) for model, score in llm_jaccard.items()})

    results_path.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame.from_dict(merged, orient="index", columns=["jaccard_metric"]).to_csv(
        results_path, index=True
    )
    return results_path.is_file()


In [18]:
# Load the labelled sheet with normalised column names.
df = prepare_dataframe(os.path.join(DATA_PATH, "NewsList_1.csv"))
# Texts that are sent to the LLM, one per row.
texts = df["feedback"].tolist()
# Reference labels, lower-cased; each entry is still one string per row.
lowered_labels = [comps.lower() for comps in df["companies_name"].values.tolist()]
# Preview the first two texts.
texts[:2]

['Потому что абхазские мандарины на фуд сити или рынках, а не в пятёрочке 😂 \r\nЯ вообще после Краснодара удивляюсь, как люди в Москве покупают зелёную хурму например)) \r\nНо для меня, самые вкусны мандарины – Марокко\r',
 'Последняя в коллабе с пивоварней питерской Газ Брю Крутые ребята Я к ним ходил в бар-пивоварню на экскурсию \nСоветую для приезжих и живущих\n']

In [19]:
# Reference labels as lists: each label string is split on ", " into company names.
# NOTE(review): predictions loaded with load_cleaned_results are split on whitespace
# into single tokens, while a label here may be a multi-word name — such a label can
# never match a prediction token exactly; confirm this is the intended metric.
labels_no_lemma = [comps.split(", ") for comps in lowered_labels]

# LLM baseline

In [20]:
llm_jaccard = {}

## **Local**

## gemma-2-2b-it

In [25]:
model = "gemma-2-2b-it"

In [37]:
# llm_companies = llm_ner(base_url=llm_vendors['lm-studio'],
#                         api_key=os.getenv('LM_STUDIO'),
#                         model=model,
#                         texts=texts)


100%|██████████| 149/149 [11:17<00:00,  4.55s/it]


In [38]:
# save_ner_results(llm_companies, model)

In [39]:
# ner_results = load_ner_results(model)

### Lemmatization

#### Spacy

In [40]:
# ner_lemm = spacy_lem_companies(ner_results.values.tolist())
# ner_lemm_ = clean_lemmatized(ner_lemm)

In [41]:
# assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

**Jaccard similarity**

In [26]:
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)

In [27]:
llm_jaccard[model] = jaccard_metric.astype(float)

In [28]:
llm_jaccard

{'ministral-3b-latest': np.float64(0.43), 'gemma-2-2b-it': np.float64(0.28)}

#### pymystem3

In [None]:
h3b_results_lemm_stem = stem_lem_companies(h3b_results.values.tolist())
h3b_results_lemm_stem_ = clean_lemmatized(h3b_results_lemm_stem)


In [None]:
assert save_cleaned_results(h3b_results_lemm_stem_, "hermes-3-llama-3.2-3b", "stem"), (
    "File is not saved!"
)

**Jaccard similarity**

In [32]:
# Reload the Mystem-lemmatized predictions of hermes-3-llama-3.2-3b from the cache.
h3b_results_lemm_stem__ = load_cleaned_results("hermes-3-llama-3.2-3b", "stem")

# Row-wise Jaccard similarities between predictions and reference labels.
jaccard_h3b_stem = []

# Predictions and labels are paired by position, so the lengths must match.
assert len(h3b_results_lemm_stem__) == len(labels_no_lemma), (
    "Lengths are the different!"
)

for i, comps in enumerate(h3b_results_lemm_stem__):
    jaccard_h3b_stem.append(jaccard_similarity(comps, labels_no_lemma[i]))

# Unrounded mean over all rows, printed with two decimals.
jaccard_h3b_stem_metrics = np.mean(jaccard_h3b_stem)
print(f"{jaccard_h3b_stem_metrics:.2f}")

0.06


## ministral-3b-instruct

In [29]:
model = "ministral-3b-instruct"

In [30]:
# llm_companies = llm_ner(base_url=llm_vendors['lm-studio'],
#                         api_key=os.getenv('LM_STUDIO'),
#                         model=model,
#                         texts=texts)

In [64]:
# save_ner_results(llm_companies, model)

In [65]:
# ner_results = load_ner_results(model)

### Lemmatization

In [None]:
# ner_lemm = spacy_lem_companies(ner_results.values.tolist())
# ner_lemm_ = clean_lemmatized(ner_lemm)

In [67]:
# assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

**Jaccard similarity**

In [31]:
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)

In [32]:
llm_jaccard[model] = jaccard_metric.astype(float)

In [33]:
llm_jaccard[model]

np.float64(0.0)

In [34]:
llm_jaccard

{'ministral-3b-latest': np.float64(0.43),
 'gemma-2-2b-it': np.float64(0.28),
 'ministral-3b-instruct': np.float64(0.0)}

## deepseek-r1-distill-qwen-1.5b

In [35]:
model = "deepseek-r1-distill-qwen-1.5b"

In [36]:
# llm_companies = llm_ner(base_url=llm_vendors['lm-studio'],
#                         api_key=os.getenv('LM_STUDIO'),
#                         model=model,
#                         texts=texts)

In [146]:
# save_ner_results(llm_companies, model)

In [147]:
# ner_results = load_ner_results(model)

### Lemmatization

In [148]:
# ner_lemm = spacy_lem_companies(ner_results.values.tolist())
# ner_lemm_ = clean_lemmatized(ner_lemm)

In [149]:
# assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

**Jaccard similarity**

In [37]:
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)

In [38]:
llm_jaccard[model] = jaccard_metric.astype(float)

In [39]:
llm_jaccard

{'ministral-3b-latest': np.float64(0.43),
 'gemma-2-2b-it': np.float64(0.28),
 'ministral-3b-instruct': np.float64(0.0),
 'deepseek-r1-distill-qwen-1.5b': np.float64(0.13)}

## **Small**

## ministral-3b-latest

In [None]:
model = "ministral-3b-latest"

In [None]:
# llm_companies = llm_ner(base_url=llm_vendors['mistral'],
#                         api_key=os.getenv('MISTRAL_API_KEY'),
#                         model=model,
#                         texts=texts)


In [None]:
# save_ner_results(llm_companies, model)

In [None]:
# m3b_results = load_ner_results('ministral-3b-latest')

### Lemmatization

#### Spacy

In [None]:
# m3b_results_lemm_spacy = spacy_lem_companies(m3b_results.values.tolist())
# m3b_results_lemm_spacy_ = clean_lemmatized(m3b_results_lemm_spacy)

In [None]:
# assert save_cleaned_results(m3b_results_lemm_spacy_, 'ministral-3b-latest', "spacy"), "File is not saved!"

**Jaccard similarity**

In [None]:
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
jaccard_metric

np.float64(0.43)

In [None]:
llm_jaccard[model] = jaccard_metric.astype(float)

In [None]:
llm_jaccard

{'ministral-3b-latest': np.float64(0.43)}

#### pymystem3

In [None]:
m3b_results_lemm_stem = stem_lem_companies(m3b_results.values.tolist())
m3b_results_lemm_stem_ = clean_lemmatized(m3b_results_lemm_stem)


In [None]:
assert save_cleaned_results(m3b_results_lemm_stem_, "ministral-3b-latest", "stem"), (
    "File is not saved!"
)

**Jaccard similarity**

In [None]:
# Reload the Mystem-lemmatized predictions of ministral-3b-latest from the cache.
m3b_results_lemm_stem__ = load_cleaned_results("ministral-3b-latest", "stem")

# Row-wise Jaccard similarities between predictions and reference labels.
jaccard_m3b_stem = []

# Predictions and labels are paired by position, so the lengths must match.
assert len(m3b_results_lemm_stem__) == len(labels_no_lemma), (
    "Lengths are the different!"
)

for i, comps in enumerate(m3b_results_lemm_stem__):
    jaccard_m3b_stem.append(jaccard_similarity(comps, labels_no_lemma[i]))

# Unrounded mean over all rows, printed with two decimals.
jaccard_m3b_stem_metrics = np.mean(jaccard_m3b_stem)
print(f"{jaccard_m3b_stem_metrics:.2f}")

## llama-3.1-8b-instruct

In [132]:
# Fetch the model catalogue from the bothub.chat API.
response = requests.get(
    "https://bothub.chat/api/v2/model/list?children=1",
    headers={
        "Content-Type": "application/json",
        "Authorization": os.getenv("BOT_HUB_KEY"),
    },
    # Without a timeout a stalled connection would block the kernel forever.
    timeout=30,
)
# Fail here on an HTTP error instead of iterating over an error payload below.
response.raise_for_status()
models = response.json()

# Alphabetical list of the available model ids.
sorted([model.get("id") for model in models])

['claude-3-opus',
 'claude-3.5-haiku',
 'claude-3.5-haiku-20241022',
 'claude-3.5-sonnet',
 'claude-3.5-sonnet-20240620',
 'codestral-2501',
 'codestral-mamba',
 'command-r-plus-08-2024',
 'dall-e-3',
 'dbrx-instruct',
 'deepseek-chat',
 'deepseek-chat-v2.5',
 'deepseek-r1',
 'deepseek-r1-distill-llama-70b',
 'deepseek-r1-distill-qwen-14b',
 'deepseek-r1-distill-qwen-32b',
 'dolphin-mixtral-8x7b',
 'eva-llama-3.33-70b',
 'eva-qwen-2.5-32b',
 'eva-qwen-2.5-72b',
 'fimbulvetr-11b-v2',
 'flux',
 'flux-1.1-pro',
 'flux-1.1-pro-ultra',
 'flux-dev',
 'flux-dev-lora',
 'flux-dev-multi-lora',
 'flux-fill-dev',
 'flux-fill-pro',
 'flux-pro',
 'flux-pulid',
 'flux-schnell',
 'flux-schnell-lora',
 'gemini-2.0-flash-001',
 'gemini-2.0-flash-exp:free',
 'gemini-2.0-flash-lite-preview-02-05:free',
 'gemini-2.0-flash-thinking-exp-1219:free',
 'gemini-2.0-flash-thinking-exp:free',
 'gemini-2.0-pro-exp-02-05:free',
 'gemini-exp-1206:free',
 'gemini-flash-1.5',
 'gemini-flash-1.5-8b',
 'gemini-flash-1.5

In [40]:
# НЕ РАБОТАЕТ!
model = "llama-3.1-8b-instruct"
# llm_companies = llm_ner(base_url=llm_vendors['bothub'],
#                         api_key=os.getenv('BOT_HUB_KEY'),
#                         model=model,
#                         texts=texts)

In [163]:
# save_ner_results(llm_companies, model)
# llm_results = load_ner_results(model)

### Lemmatization

In [164]:
# ner_lemm = spacy_lem_companies(llm_results.values.tolist())
# ner_lemm_ = clean_lemmatized(ner_lemm)
# assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"


**Jaccard similarity**

In [41]:
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
jaccard_metric

np.float64(0.36)

In [42]:
llm_jaccard[model] = jaccard_metric

## qwen-2.5-7b-instruct

In [43]:
# The extraction and lemmatization steps are commented out; only the scoring of the
# cached "<model>_spacy_cleaned.csv" file is executed in this cell.
model = "qwen-2.5-7b-instruct"
# llm_companies = llm_ner(base_url=llm_vendors['bothub'],
#                         api_key=os.getenv('BOT_HUB_KEY'),
#                         model=model,
#                         texts=texts)

# save_ner_results(llm_companies, model)
# grok_results = load_ner_results(model)
# ### Lemmatization
# grok_lemm = spacy_lem_companies(grok_results.values.tolist())
# grok_lemm_ = clean_lemmatized(grok_lemm)
# assert save_cleaned_results(grok_lemm_, model, "spacy"), "File is not saved!"

### Jaccard similarity
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
# Record the score of this model in the session-wide results dict.
llm_jaccard[model] = jaccard_metric.astype(float)


In [44]:
llm_jaccard

{'ministral-3b-latest': np.float64(0.43),
 'gemma-2-2b-it': np.float64(0.28),
 'ministral-3b-instruct': np.float64(0.0),
 'deepseek-r1-distill-qwen-1.5b': np.float64(0.13),
 'llama-3.1-8b-instruct': np.float64(0.36),
 'qwen-2.5-7b-instruct': np.float64(0.49)}

In [58]:
pd.DataFrame.from_dict(llm_jaccard, orient="index", columns=["jaccard_metric"]).to_csv(
    Path(PROCESSED_DATA_PATH) / "llm_jaccard.csv", index=True
)

---

## **Middle**


## llama-3.3-70b-instruct

In [62]:
# Full run for one model: extract companies with the LLM, cache the raw output,
# lemmatize with spaCy, cache the cleaned tokens, then score against the labels.
model = "llama-3.3-70b-instruct"
llm_companies = llm_ner(
    base_url=llm_vendors["bothub"],
    api_key=os.getenv("BOT_HUB_KEY"),
    model=model,
    texts=texts,
)

# Persist the raw extraction, then reload it from disk.
save_ner_results(llm_companies, model)
ner_results = load_ner_results(model)
### Lemmatization
ner_lemm = spacy_lem_companies(ner_results.values.tolist())
ner_lemm_ = clean_lemmatized(ner_lemm)
assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

### Jaccard similarity
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
llm_jaccard[model] = jaccard_metric.astype(float)
# Overwrite the results table with every score collected in llm_jaccard so far.
pd.DataFrame.from_dict(llm_jaccard, orient="index", columns=["jaccard_metric"]).to_csv(
    Path(PROCESSED_DATA_PATH) / "llm_jaccard.csv", index=True
)

100%|██████████| 149/149 [07:42<00:00,  3.10s/it]


In [64]:
llm_jaccard

{'ministral-3b-latest': np.float64(0.43),
 'gemma-2-2b-it': np.float64(0.28),
 'ministral-3b-instruct': np.float64(0.0),
 'deepseek-r1-distill-qwen-1.5b': np.float64(0.13),
 'llama-3.1-8b-instruct': np.float64(0.36),
 'qwen-2.5-7b-instruct': np.float64(0.49),
 'llama-3.3-70b-instruct': np.float64(0.46)}

## Mistral Large

In [65]:
get_service_models(llm_vendors["mistral"], os.getenv("MISTRAL_API_KEY"))

['ministral-3b-2410',
 'ministral-3b-latest',
 'ministral-8b-2410',
 'ministral-8b-latest',
 'open-mistral-7b',
 'mistral-tiny',
 'mistral-tiny-2312',
 'open-mistral-nemo',
 'open-mistral-nemo-2407',
 'mistral-tiny-2407',
 'mistral-tiny-latest',
 'open-mixtral-8x7b',
 'mistral-small',
 'mistral-small-2312',
 'open-mixtral-8x22b',
 'open-mixtral-8x22b-2404',
 'mistral-small-2402',
 'mistral-small-2409',
 'mistral-medium-2312',
 'mistral-medium',
 'mistral-medium-latest',
 'mistral-large-2402',
 'mistral-large-2407',
 'mistral-large-2411',
 'mistral-large-latest',
 'pixtral-large-2411',
 'pixtral-large-latest',
 'mistral-large-2502-15-1-rc2',
 'mistral-large-pixtral-2411',
 'codestral-2405',
 'codestral-2501',
 'codestral-latest',
 'codestral-2412',
 'codestral-2411-rc5',
 'codestral-mamba-2407',
 'open-codestral-mamba',
 'codestral-mamba-latest',
 'pixtral-12b-2409',
 'pixtral-12b',
 'pixtral-12b-latest',
 'mistral-small-2501',
 'mistral-small-latest',
 'mistral-saba-2502',
 'mistral-sa

In [66]:
# Full run for one model: extract companies with the LLM, cache the raw output,
# lemmatize with spaCy, cache the cleaned tokens, then score against the labels.
model = "mistral-large-latest"
llm_companies = llm_ner(
    base_url=llm_vendors["mistral"],
    api_key=os.getenv("MISTRAL_API_KEY"),
    model=model,
    texts=texts,
)

# Persist the raw extraction, then reload it from disk.
save_ner_results(llm_companies, model)
ner_results = load_ner_results(model)
### Lemmatization
ner_lemm = spacy_lem_companies(ner_results.values.tolist())
ner_lemm_ = clean_lemmatized(ner_lemm)
assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

### Jaccard similarity
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
llm_jaccard[model] = jaccard_metric.astype(float)
# Overwrite the results table with every score collected in llm_jaccard so far.
pd.DataFrame.from_dict(llm_jaccard, orient="index", columns=["jaccard_metric"]).to_csv(
    Path(PROCESSED_DATA_PATH) / "llm_jaccard.csv", index=True
)

100%|██████████| 149/149 [05:08<00:00,  2.07s/it]


## llama-3.1-405b-instruct

In [70]:
# Full run for one model: extract companies with the LLM, cache the raw output,
# lemmatize with spaCy, cache the cleaned tokens, then score against the labels.
model = "llama-3.1-405b-instruct"
llm_companies = llm_ner(
    base_url=llm_vendors["bothub"],
    api_key=os.getenv("BOT_HUB_KEY"),
    model=model,
    texts=texts,
)

# Persist the raw extraction, then reload it from disk.
save_ner_results(llm_companies, model)
ner_results = load_ner_results(model)
### Lemmatization
ner_lemm = spacy_lem_companies(ner_results.values.tolist())
ner_lemm_ = clean_lemmatized(ner_lemm)
assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

### Jaccard similarity
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
llm_jaccard[model] = jaccard_metric.astype(float)
# Persist the collected scores to llm_jaccard.csv.
save_jaccard_results(llm_jaccard)


100%|██████████| 149/149 [09:06<00:00,  3.67s/it]


True

In [71]:
jaccard_metric

np.float64(0.52)

## **Large**

## claude-3.5-sonnet

In [72]:
# Full run for one model: extract companies with the LLM, cache the raw output,
# lemmatize with spaCy, cache the cleaned tokens, then score against the labels.
model = "claude-3.5-sonnet"
llm_companies = llm_ner(
    base_url=llm_vendors["bothub"],
    api_key=os.getenv("BOT_HUB_KEY"),
    model=model,
    texts=texts,
)

# Persist the raw extraction, then reload it from disk.
save_ner_results(llm_companies, model)
ner_results = load_ner_results(model)
### Lemmatization
ner_lemm = spacy_lem_companies(ner_results.values.tolist())
ner_lemm_ = clean_lemmatized(ner_lemm)
assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

### Jaccard similarity
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
llm_jaccard[model] = jaccard_metric.astype(float)
# Persist the collected scores to llm_jaccard.csv.
save_jaccard_results(llm_jaccard)

100%|██████████| 149/149 [10:55<00:00,  4.40s/it]


True

In [73]:
jaccard_metric

np.float64(0.63)

## DeepSeek-R1

In [None]:
# Full run for one model: extract companies with the LLM, cache the raw output,
# lemmatize with spaCy, cache the cleaned tokens, then score against the labels.
# NOTE(review): the saved progress bar below stops at 25/149, so this run looks
# unfinished — confirm before relying on any deepseek-r1 result.
model = "deepseek-r1"
llm_companies = llm_ner(
    base_url=llm_vendors["bothub"],
    api_key=os.getenv("BOT_HUB_KEY"),
    model=model,
    texts=texts,
)

# Persist the raw extraction, then reload it from disk.
save_ner_results(llm_companies, model)
ner_results = load_ner_results(model)
### Lemmatization
ner_lemm = spacy_lem_companies(ner_results.values.tolist())
ner_lemm_ = clean_lemmatized(ner_lemm)
assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

### Jaccard similarity
ner_lemm__ = load_cleaned_results(model, "spacy")
jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
llm_jaccard[model] = jaccard_metric.astype(float)
# Persist the collected scores to llm_jaccard.csv.
save_jaccard_results(llm_jaccard)


 17%|█▋        | 25/149 [14:21<1:14:25, 36.01s/it]

In [None]:
jaccard_metric

## o3-mini

In [41]:
llm_jaccard

{'o3-mini': np.float64(0.45)}

In [44]:
# model = "o3-mini"
# llm_companies = llm_ner(base_url=llm_vendors['bothub'],
#                         api_key=os.getenv('BOT_HUB_KEY'),
#                         model=model,
#                         texts=texts)

# save_ner_results(llm_companies, model)
# ner_results = load_ner_results(model)
# ### Lemmatizationb
# ner_lemm = spacy_lem_companies(ner_results.values.tolist())
# ner_lemm_ = clean_lemmatized(ner_lemm)
# assert save_cleaned_results(ner_lemm_, model, "spacy"), "File is not saved!"

# ### Jaccard distance
# ner_lemm__ = load_cleaned_results(model, "spacy")
# jaccard_metric = jaccard_feedbacks(ner_lemm__, labels_no_lemma)
# llm_jaccard[model] = jaccard_metric.astype(float)
save_jaccard_results(llm_jaccard)


True

In [33]:
jaccard_metric = pd.read_csv(Path(PROCESSED_DATA_PATH) / "llm_jaccard.csv")

In [None]:
jaccard_metric.rename(columns={"Unnamed: 0": "model"}, inplace=True)

In [40]:
jaccard_metric.to_dict()

{'model': {0: 'ministral-3b-latest',
  1: 'gemma-2-2b-it',
  2: 'ministral-3b-instruct',
  3: 'deepseek-r1-distill-qwen-1.5b',
  4: 'llama-3.1-8b-instruct',
  5: 'qwen-2.5-7b-instruct',
  6: 'llama-3.3-70b-instruct',
  7: 'mistral-large-latest',
  8: 'llama-3.1-405b-instruct',
  9: 'claude-3.5-sonnet',
  10: 'o3-mini'},
 'jaccard_metric': {0: 0.43,
  1: 0.28,
  2: 0.0,
  3: 0.13,
  4: 0.36,
  5: 0.49,
  6: 0.46,
  7: 0.47,
  8: 0.52,
  9: 0.63,
  10: 0.45}}