In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from the Colab filesystem.
from google.colab import drive
drive.mount('/content/drive')

# Project folder on the mounted drive.
# NOTE(review): not referenced by the cells visible here, which use relative paths.
root_path = "/content/drive/MyDrive/Diploma-mag"


In [None]:
import os
from dotenv import load_dotenv, find_dotenv

from langchain_gigachat import Gigachat
from langchain_gigachat.embeddings import GigaChatEmbeddings


load_dotenv(find_dotenv())

def init_gigachat():
    """Build the GigaChat chat model used by the notebook.

    The authorization key is read from the ``GIGACHAT_CREDENTIALS`` environment
    variable (which ``load_dotenv`` above can populate from a ``.env`` file).
    When the variable is not set, the original placeholder string is passed
    through unchanged, so existing behaviour is preserved.

    Returns:
        A ``Gigachat`` instance configured for the "GigaChat-Max" model.
    """
    # NOTE(review): langchain_gigachat documents this class as `GigaChat`;
    # confirm that the `Gigachat` name imported at the top of the cell resolves.
    return Gigachat(
        credentials=os.getenv("GIGACHAT_CREDENTIALS", "ключ_авторизации"),
        model="GigaChat-Max",
        # NOTE(review): TLS certificate verification is switched off here;
        # confirm this is intended outside of local experiments.
        verify_ssl_certs=False,
        # Near-zero temperature; presumably chosen to keep generations as
        # repeatable as possible — confirm.
        temperature=1e-15,
        timeout=100,
    )


def init_gigachat_embeddings():
    """Build the GigaChat embeddings client used to vectorize text chunks.

    The authorization key is read from the ``GIGACHAT_CREDENTIALS`` environment
    variable (which ``load_dotenv`` above can populate from a ``.env`` file),
    falling back to the original placeholder string when it is not set.

    Returns:
        A ``GigaChatEmbeddings`` instance.
    """
    return GigaChatEmbeddings(
        credentials=os.getenv("GIGACHAT_CREDENTIALS", "ключ_авторизации"),
        # Was `scope-"GIGACHAT_API_PER"`: a syntax error (`-` instead of `=`)
        # with a truncated value. "GIGACHAT_API_PERS" is the documented scope
        # for personal API access.
        scope="GIGACHAT_API_PERS",
        # NOTE(review): TLS certificate verification is switched off here;
        # confirm this is intended outside of local experiments.
        verify_ssl_certs=False,
    )

In [None]:
# Instantiate the chat model and the embeddings client via the factory
# functions defined above; later cells refer to these two names.
llm = init_gigachat()
embeddings = init_gigachat_embeddings()

In [None]:
from langchain.storage import InMemoryStore

# `llm` and `embeddings` are already created by the preceding cell, so they
# are not instantiated a second time here.

# Metadata key under which every indexed chunk records the id of its parent
# document; the retriever is constructed with the same key.
id_key = "doc_id"
doc_ids = []  # placeholder; reassigned after the source documents are loaded

# NOTE(review): docstore_dir is not referenced by the cells visible here.
docstore_dir = "./data/multimodal_rag_with_summaries/doc_store"
vectorstore_dir = "./data/multimodal_rag_with_summaries/vectorstore"


# Parent documents are kept in process memory only.
# NOTE(review): the vector store is given a persist directory while this
# store is not persisted, so chunks left on disk from an earlier session
# would reference parent ids that are absent after a kernel restart.
docstore = InMemoryStore()

In [None]:
from langchain.retrievers import MultiVectorRetriever

from langchain_chroma import Chroma
from chromadb.config import Settings


# Chroma collection holding the embedded chunks. It is given `vectorstore_dir`
# as its persist directory, and anonymized telemetry is turned off.
text_vectorstore = Chroma(
    collection_name="mm_rag_text_gigaembeddings",
    embedding_function=embeddings,
    persist_directory=vectorstore_dir,
    client_settings=Settings(anonymized_telemetry=False),
)

# Retriever that searches chunks in the vector store and resolves each hit to
# its parent entry in `docstore` through the `id_key` metadata field.
retriever = MultiVectorRetriever(vectorstore=text_vectorstore, docstore=docstore, id_key=id_key)

In [None]:
import json
# Source page texts; later cells read "text" and "metadata"/"page_number"
# from each entry. The encoding is explicit so decoding of non-ASCII text
# does not depend on the platform's default locale.
with open("texts.json", "r", encoding="utf-8") as f:
    documents = json.load(f)

# Image summaries; later cells read "image_summary", "page_number" and
# "source" from each entry.
with open("image_summary.json", "r", encoding="utf-8") as f:
    summaries = json.load(f)

In [None]:
import uuid

# Split the loaded records into two parallel lists: text body and page number.
documents_content = [record["text"] for record in documents]
documents_page = [record["metadata"]["page_number"] for record in documents]


# One random UUID string per source document; used as the parent id that links
# chunks in the vector store to entries in the docstore.
doc_ids = [str(uuid.uuid4()) for _ in range(len(documents_content))]

In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter producing chunks of at most 500 characters with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)

# Every chunk becomes a Document tagged with the page number and the id of
# the source document it was cut from.
prepared_text = []
for parent_id, page_number, document_content in zip(doc_ids, documents_page, documents_content):
    prepared_text.extend(
        Document(
            page_content=chunk,
            metadata={"page_number": page_number, "doc_id": parent_id},
        )
        for chunk in text_splitter.split_text(document_content)
    )

all_chunks = [doc.page_content for doc in prepared_text]

# Chunks go to the vector store; the full source texts go to the docstore,
# keyed by the same ids the chunks carry in their metadata.
text_vectorstore.add_documents(prepared_text)
retriever.docstore.mset(list(zip(doc_ids, documents_content)))

In [None]:
# Sample query (Russian): "How do I select a rectangular area of an image in
# Adobe Photoshop?" Run against the retriever after the text chunks were
# indexed; the bare expression makes the notebook display the result.
question = "Как выделить прямоугольную область на изображении в Adobe Photoshop?"
retriever.invoke(question)

In [None]:
# Split the loaded image-summary records into parallel lists.
summaries_content = []
summaries_page = []
summaries_source = []

for s in summaries:
    summaries_content.append(s["image_summary"])
    summaries_page.append(s["page_number"])
    summaries_source.append(s["source"])

# One random UUID string per image summary, used as its parent id.
summaries_ids = [str(uuid.uuid4()) for _ in summaries_content]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

prepared_text = []
for i, summary_content in enumerate(summaries_content):
    chunks = text_splitter.split_text(summary_content)
    prepared_text += [Document(
        page_content=chunk,
        metadata={
            # Fixed: this used to store the summary text (summaries_content[i])
            # under "page_number" instead of the page number itself.
            "page_number": summaries_page[i],
            "doc_id": summaries_ids[i],
            "source": summaries_source[i]
        })
    for chunk in chunks]

retriever.vectorstore.add_documents(prepared_text)
# Register a parent entry for every summary id. The retriever resolves each
# vector-store hit through the docstore and drops ids it cannot find, so
# without this step summary chunks could never be returned.
# NOTE(review): the full summary text is stored as the parent, mirroring how
# the page texts are stored; swap in the image payload if that is what the
# retriever should return for image hits.
retriever.docstore.mset(list(zip(summaries_ids, summaries_content)))

In [None]:
# Same sample query (Russian): "How do I select a rectangular area of an image
# in Adobe Photoshop?" Run again after the image-summary chunks were added to
# the vector store; the bare expression makes the notebook display the result.
question = "Как выделить прямоугольную область на изображении в Adobe Photoshop?"
retriever.invoke(question)