In [2]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim, pairwise_cos_sim
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [3]:
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

In [13]:
shet = ["молоток", "кирпич", "плинтус 10"]
zayavka = ["плинтус 10", "молоток с деревянной ручкой",  "плинтус 20", "кирпич силикатный", "плинтус 30"]

shet_instruction = (
    "Retrieve semantically similar names of building supplies and materials in Russian: "
)
zayavka_instruction = "Represent names of building supplies and materials in Russian for retrieval: "

zayavka_emb = model.encode(zayavka, prompt=zayavka_instruction)
shet_emb = model.encode(shet, prompt=shet_instruction)
# similarities = pairwise_cos_sim(zayavka_emb, shet_emb)
# print(similarities)
# => tensor([[0.8835, 0.7037, 0.6970]])

In [27]:
scores = []
for i,s_item_emb in enumerate(shet_emb):
    sc = cos_sim(s_item_emb, zayavka_emb)
    print(shet[i], zayavka[np.argmax(sc).item()])
    scores.append(sc)

молоток молоток с деревянной ручкой
кирпич кирпич силикатный
плинтус 10 плинтус 10


In [15]:
scores

[tensor([[0.9073, 0.9555, 0.9058, 0.9161, 0.9060]]),
 tensor([[0.9096, 0.9064, 0.9082, 0.9594, 0.9079]]),
 tensor([[0.9823, 0.9004, 0.9580, 0.9172, 0.9588]])]

In [5]:
from openai import OpenAI
client = OpenAI(api_key =API_KEY)


response = client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {
                "role": "system",
                "content": "You are a data extraction tool that extracts product attributes from their Russian-language descriptions."
            },
            {"role": "user", "content": query}
        ],
        temperature=0,
    )
print(response.choices[0].message.content)

[
    {
        "main_description": "Отвод 45 из оцинкованной стали",
        "brand_or_specification": null,
        "dimensional_parameters": "б=0,8мм 1200х400",
        "other": null
    },
    {
        "main_description": "Болгарка (УШМ)",
        "brand_or_specification": "Makita 9558 HN 13535954",
        "dimensional_parameters": null,
        "other": null
    }
]


In [1]:
from pprint import pprint
test = ['Отвод 45 из оцинкованной стали б=0,8мм 1200х400','Болгарка (УШМ) Makita 9558 HN 13535954']
query = f"""I will provide you with a list of strings containing descriptions of the building materials and supplies. Your goal is to extract structured information from these strings that matches the form described below:
{{
main_description // text containing the product name.
brand_or_specification // parameters describing the brand and/or specification, but not the dimensions.
dimensional_parameters // length, width, thickness, volume, etc.
other // parameters that do not match any of the previous classes.
}}
### Example:
Input: ['Труба стальная электросварная прямошовная оцинкованная Ø76 x 3.5мм ГОСТ 10704-91', 'Маслоотделитель FP-OS-5,0-138 ФРИГОПОИНТ (шильда прикреплена)']
Output: [{{'main_description': 'труба стальная электросварная прямошовная оцинкованная', 'brand_or_specification': 'ГОСТ 10704-91', 'dimensional_parameters':  'Ø76 x 3.5мм', 'other': None}}, {{'main_description': 'Маслоотделитель', 'brand_or_specification': 'FP-OS-5,0-138 ФРИГОПОИНТ', 'dimensional_parameters':  None, 'other': '(шильда прикреплена)'}}]

If there are no characters in the string that relate to a specific field, then fill it with the value "None". Please output the extracted information in JSON format. Do not output anything except for the extracted information. Do not add any clarifying information. Do not add any fields that are not in the schema. All output must be in JSON format and follow the schema specified above. 
###
Extract the structural information from the following strings:
{test}
"""

pprint(query)

('I will provide you with a list of strings containing descriptions of the '
 'building materials and supplies. Your goal is to extract structured '
 'information from these strings that matches the form described below:\n'
 '{\n'
 'main_description // text containing the product name.\n'
 'brand_or_specification // parameters describing the brand and/or '
 'specification, but not the dimensions.\n'
 'dimensional_parameters // length, width, thickness, volume, etc.\n'
 'other // parameters that do not match any of the previous classes.\n'
 '}\n'
 '### Example:\n'
 "Input: ['Труба стальная электросварная прямошовная оцинкованная Ø76 x 3.5мм "
 "ГОСТ 10704-91', 'Маслоотделитель FP-OS-5,0-138 ФРИГОПОИНТ (шильда "
 "прикреплена)']\n"
 "Output: [{'main_description': 'труба стальная электросварная прямошовная "
 "оцинкованная', 'brand_or_specification': 'ГОСТ 10704-91', "
 "'dimensional_parameters':  'Ø76 x 3.5мм', 'other': None}, "
 "{'main_description': 'Маслоотделитель', 'brand_or_specifi