In [18]:
# Load settings from a .env file into the process environment (python-dotenv).
# The recorded cell output `True` is the return value of this call.
from dotenv import load_dotenv
load_dotenv()

True

In [9]:
from docx import Document
import json

def extract_data(file_path):
    """Extract product records from every table of a .docx price offer.

    The first row of each table is treated as a header and skipped. For every
    remaining row the first four cells are read, in order, as name, image
    text, description and price.

    Args:
        file_path: Path of the .docx file to read. It is also stored in each
            record under "source".

    Returns:
        list[dict]: One dict per data row with the keys "id" (1-based running
        number across all tables), "name", "image_text", "description",
        "price", "currency", "text_for_embedding" and "source". "price" is
        the stripped cell text; it is NOT converted to a number.
    """
    doc = Document(file_path)
    data = []

    for table in doc.tables:
        for row in table.rows[1:]:  # skip the header row
            cells = row.cells
            # Every table in the document is visited, so a row with fewer than
            # four cells cannot be a product row. Skip it instead of failing
            # with an IndexError.
            if len(cells) < 4:
                continue

            # Strip each cell once and reuse the cleaned values below.
            name, image_text, description, price = (
                cell.text.strip() for cell in cells[:4]
            )
            data.append({
                "id": len(data) + 1,  # sequential ID over all accepted rows
                "name": name,
                "image_text": image_text,
                "description": description,
                "price": price,  # kept as text exactly as written in the table
                "currency": "руб.",
                # Combined text that is embedded into the vector store.
                "text_for_embedding": f"{name} {image_text} {description}",
                "source": file_path,
            })

    return data

# Extract the products of one offer document and save them as JSON.
data = extract_data("КП/КП артстрой.docx")
# Wrapper dict around the records; the file below receives the bare list `data`.
dataset = {"products": data}
with open("products.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Cyrillic text readable in the file.
    f.write(json.dumps(data, indent=2, ensure_ascii=False))


In [36]:
import os
import re  # needed by re.sub below; no visible cell imports it

# For every file in the "КП" folder, print what is left of the second
# paragraph after removing Cyrillic letters and whitespace.
for file in os.listdir("КП"):
    doc = Document(f"КП/{file}")
    paragraphs = doc.paragraphs
    # Documents with fewer than two paragraphs yield an empty string instead
    # of raising an IndexError.
    text = paragraphs[1].text.strip() if len(paragraphs) > 1 else ""
    # Strip Cyrillic letters and whitespace; digits and punctuation remain.
    result = re.sub(r'[а-яА-Я\s]', '', text)
    print(result)


04.09.2024.


17.01.2023



02.02.2023.
17.01.2023

13.04.2023
12.04.2023
19.06.2023
31.03.2023.
03.04.2023.
16.05.2023
17.05.2023.
23.11.2023.
23.11.2023.
12.01.2023
12.12.2022
2.08.2024.
23.1.2024.
23.11.2023.
21.12.2023.
01.06.2023.
24.03.2023.
27.01.2023.
13.01.2023
17.01.2024.
17.05.2023
20.06.2023
02.02.2023.
2.02.2024.
12.04.2023
21.04.2023.
29.01.2024.

26.01.2023.
13.04.2023


In [1]:
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.document_loaders import JSONLoader

def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the product's name, price and currency from a record into metadata.

    Args:
        record: One parsed JSON record.
        metadata: Metadata dict to extend; it is modified in place.

    Returns:
        The same ``metadata`` dict with "name", "price" and "currency" set.
        A key missing from ``record`` is stored as ``None``.
    """
    for field in ("name", "price", "currency"):
        metadata[field] = record.get(field)
    return metadata


# Load products.json: the jq expression ".[]" selects each element of the
# top-level array, "text_for_embedding" supplies the document text, and
# metadata_func adds name/price/currency to each document's metadata.
loader = JSONLoader(
    file_path="products.json",
    jq_schema=".[]",
    content_key="text_for_embedding",
    metadata_func=metadata_func
    #text_content=False
)

# One document per product record.
documents = loader.load()

# Embedding model created with the library's default settings.
# NOTE(review): the recorded output shows a warning pointing at this line,
# presumably a deprecation notice; confirm and update the import if so.
embedding_function = HuggingFaceEmbeddings()

# Embed all documents and index them in a Chroma vector store.
# No persistence arguments are passed here.
vector_db = Chroma.from_documents(documents, embedding=embedding_function)


  embedding_function = HuggingFaceEmbeddings()
  embedding_function = HuggingFaceEmbeddings()
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Expose the vector store as a retriever using the "mmr" search type.
retriever = vector_db.as_retriever(search_type="mmr")

In [33]:
# Free-text description of the new product; it is used as the retrieval query
# and as the question passed to the model.
description = "\n".join([
    "КУРТКА:",
    "- прямого силуэта",
    "- с центральной супатной застёжкой на пять пуговиц и две сквозные вверху борта",
    "- полочки и спинка с кокетками из отделочной ткани",
    "- по кокеткам проложена СОП шириной 50 мм",
    "- полочки с накладными карманами: одним верхними на левой полочке и двумя нижними",
    "- верхний накладной карман с клапаном, застёгивающимся на ленты контакт",
    "- нижние накладные карманы с наклонным входом",
    "- рукава втачные, с трикотажной манжетой",
])

In [34]:
def format_docs(docs):
    """Render retrieved documents as the text block given to the prompt.

    Each document becomes two lines: "Price: <price><currency>" followed by
    "Content: <page_content>". Entries are separated by one blank line.

    Args:
        docs: Iterable of documents exposing ``metadata`` (with "price" and
            "currency" entries that can be concatenated with ``+``) and
            ``page_content``.

    Returns:
        str: The formatted entries; an empty string when ``docs`` is empty.
    """
    entries = []
    for doc in docs:
        price_label = doc.metadata['price'] + doc.metadata['currency']
        entries.append(f"Price: {price_label}\nContent: {doc.page_content}")
    return "\n\n".join(entries)

In [35]:
# Retrieve the known products most relevant to the new product's description...
docs = retriever.invoke(description)
# ...and render them as the "Price / Content" text block used as prompt context.
formatted_docs = format_docs(docs)

In [37]:
from langchain_core.prompts import PromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain_core.output_parsers import StrOutputParser

# Ollama-served "deepseek-r1" model that produces the price estimate.
llm =  OllamaLLM(model="deepseek-r1")

# Prompt text. {context} and {question} are the placeholders filled in when
# the chain is invoked; the wording is sent to the model as written.
template = """
Calculate the price of the new product based on the context. User gives you the discription of the new product.
You find prices in the "Price" and description of the known products in the "Content". You need to clearly indicate the parameters
that influenced the cost and how(is it increase the price or reduce). You need to indicate the price as a specific number, if this is not possible, write why, 
what needs to be clarified, what parameters are missing.
If the context doesn't contain any relevant information to the question, don't make something up and just say "I don't know":

<context>
{context}
</context>

question: {question}

"""
# Declare the two variables the template expects.
prompt = PromptTemplate(input_variables=["context", "question"], template=template)

# Pipeline: fill the prompt -> call the model -> return the reply as a plain string.
# NOTE(review): the recorded output of a later cell starts with a <think> block,
# so the model's reasoning text is part of the returned string.
llm_chain = prompt | llm | StrOutputParser()

In [38]:
# Values for the prompt's two placeholders: the formatted retrieved products
# as context and the new product's description as the question.
input_data = dict(context=formatted_docs, question=description)

In [None]:
# Run the prompt -> model -> parser chain on the prepared inputs and print the reply.
print(llm_chain.invoke(input=input_data))

In [40]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

# End-to-end chain: the input string is sent to the retriever (whose results
# are formatted into the context) and also passed through unchanged as the
# question; both feed the prompt/model chain.
rag_chain = {
    "context": retriever | RunnableLambda(format_docs),
    "question": RunnablePassthrough(),
} | llm_chain

In [41]:
# Run retrieval and generation in one call for the new product's description and print the reply.
print(rag_chain.invoke(description))

<think>
Alright, so I need to calculate the price of a new product based on the given context. The user has provided some existing products with their prices and descriptions. My task is to use this information to determine a reasonable price for another product.

First, let me look at the context provided. There are four different products listed:

1. **Костюм сварщика Гефест-СОП 2 кл. защиты (тк. Хлопок-ОП,420) брюки, черный**  
   - Price: 7890 руб.

2. **Валенки обрезиненные**  
   - Price: 1870 руб.

3. **Костюм зимний Труженик-Ультра-2 (тк. Смесовая,210) п/к, т.синий/васильковый Ткань "Грета" 65% полиэфир / 35% хлопок (+/-5%),密度 210 гр/м², ВО-пропитка**  
   - Price: 4000 руб.

4. **Сапоги мужские Войлочные Ангора ЭВА Step САПОГИ**  
   - Price: 1085 руб.

The user's question is about a new product, which is another variant of a **куртка (coat)** with specific features. The description provided includes various parameters like the silhouette, central button placket with two side 