In [85]:
import os 
import pandas as pd

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

In [86]:
from dotenv import load_dotenv
# Load environment variables
load_dotenv('../../.env')

True

In [87]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens = 1000)

In [88]:
df_articles = pd.read_csv('../../data/df_articles.csv').dropna()
df_articles['content'] = df_articles['title'] + df_articles['content']

df_articles_analytics = pd.read_csv('../../data/df_articles_analytics.csv').dropna()
df_articles_analytics['content'] = df_articles_analytics['title'] + df_articles_analytics['content']

documents = df_articles['content'].to_list() + df_articles_analytics['content'].to_list()
with open('../../data/courses.txt', 'r') as f:
    documents += list(f.read().split('\n\n\n\n'))

In [89]:
documents = [doc.replace('\xa0', ' ').replace('\n', ' ') for doc in documents]

In [90]:
len(documents)

525

In [91]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_text(" ".join(documents))

In [None]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="DeepPavlov/rubert-base-cased-sentence", model_kwargs={'device': 'cpu'})

db = Chroma.from_texts(chunks, embedding_function, persist_directory="../../artifacts/chroma_db")

No sentence-transformers model found with name DeepPavlov/rubert-base-cased-sentence. Creating a new one with mean pooling.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [125]:
import pickle

# –°–æ—Ö—Ä–∞–Ω—è–µ–º chain –∫–∞–∫ pickle
with open("../../artifacts/embedding_function.pkl", "wb") as file:
    pickle.dump(embedding_function, file)

In [93]:
retriever = db.as_retriever()

In [94]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives 
    | llm
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [114]:
from langchain.load import dumps, loads, load

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

In [96]:
question = "–ö–∞–∫–∏–µ –∞–∫—Ü–∏–∏ IT –∫–æ–º–ø–∞–Ω–∏–π —Å–µ–π—á–∞—Å –Ω–∞–∏–±–æ–ª–µ–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã –¥–ª—è –ø—Ä–∏–æ–±—Ä–µ—Ç–µ–Ω–∏—è?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

7

In [97]:
docs

[Document(page_content='–ø–æ–µ—Ö–∞–ª–∏! –í —ç—Ç–æ–º —É—Ä–æ–∫–µ –º—ã –¥–∞–¥–∏–º –≤–∞–º —É–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω—ã–π —á–µ–∫-–ª–∏—Å—Ç. –° –Ω–∏–º –≤—ã –≤ –ª—é–±–æ–π –º–æ–º–µ–Ω—Ç –±—ã—Å—Ç—Ä–æ –æ—Ü–µ–Ω–∏—Ç–µ —Å–∏—Ç—É–∞—Ü–∏—é –≤–æ–∫—Ä—É–≥ –∏ –ø–æ–¥–±–µ—Ä–µ—Ç–µ —Å–µ–±–µ –∞–∫—Ç–∏–≤. ‚òùüèª –•–æ—Ä–æ—à–∞—è –Ω–æ–≤–æ—Å—Ç—å: —á–µ–∫-–ª–∏—Å—Ç –ø–æ–¥–æ–π–¥–µ—Ç –Ω–µ —Ç–æ–ª—å–∫–æ –¥–ª—è –≤—ã–±–æ—Ä–∞ –∞–∫—Ü–∏–π, –Ω–æ –∏ –µ—Å–ª–∏ –≤—ã –∑–∞—Ö–æ—Ç–∏—Ç–µ –ø—Ä–∏—Å–º–æ—Ç—Ä–µ—Ç—å—Å—è –∫ –æ–±–ª–∏–≥–∞—Ü–∏—è–º –∏–ª–∏ —Ñ–æ–Ω–¥–∞–º. –í–æ–æ–±—â–µ, –∫–æ–≥–¥–∞ –º—ã –≥–æ–≤–æ—Ä–∏–º –ø—Ä–æ –≤—ã–±–æ—Ä –∞–∫—Ç–∏–≤–æ–≤, –ø—Ä–∏–Ω—Ü–∏–ø –ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏ –æ–¥–∏–Ω–∞–∫–æ–≤—ã–π. –ü–æ—ç—Ç–æ–º—É –º—ã –∏ –Ω–∞–∑–≤–∞–ª–∏ —á–µ–∫-–ª–∏—Å—Ç —É–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω—ã–º üßê –ù—É —á—Ç–æ, –ø–æ–µ—Ö–∞–ª–∏! 1. –ù–æ–≤–æ—Å—Ç–∏ –ü—Ä–æ–≤–µ—Ä–∏—Ç—å —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–µ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏ –∫–æ–º–ø–∞–Ω–∏–∏ –ø–µ—Ä–µ–¥ –ø–æ–∫—É–ø–∫–æ–π ‚Äî –¥–µ–ª–æ –≤–∞–∂–Ω–æ–µ, –Ω–æ –Ω–µ –µ–¥–∏–Ω—Å—Ç–≤–µ–Ω–Ω–æ–µ. –ò–Ω–≤–µ—Å—Ç–æ—Ä –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å –≤ –∫—É—Ä—Å–µ 

In [98]:
from operator import itemgetter

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

res = final_rag_chain.invoke({"question":question})

print(res)

–ù–∞ –æ—Å–Ω–æ–≤–∞–Ω–∏–∏ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω–æ–≥–æ –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞, –º–æ–∂–Ω–æ –≤—ã–¥–µ–ª–∏—Ç—å –Ω–µ—Å–∫–æ–ª—å–∫–æ —Ñ–∞–∫—Ç–æ—Ä–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –¥–µ–ª–∞—é—Ç –∞–∫—Ü–∏–∏ IT-–∫–æ–º–ø–∞–Ω–∏–π –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–º–∏ –¥–ª—è –ø—Ä–∏–æ–±—Ä–µ—Ç–µ–Ω–∏—è:

1. **–†–∞–∑–≤–∏—Ç–∏–µ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤**: –£–ø–æ–º–∏–Ω–∞–µ—Ç—Å—è, —á—Ç–æ –∫–æ–º–ø–∞–Ω–∏–∏, —Ä–∞–∑–≤–∏–≤–∞—é—â–∏–µ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã–µ —Ä–µ—à–µ–Ω–∏—è, –ø–æ–∫–∞–∑—ã–≤–∞—é—Ç –≤—ã—Å–æ–∫—É—é —Ä–µ–Ω—Ç–∞–±–µ–ª—å–Ω–æ—Å—Ç—å. –ù–∞–ø—Ä–∏–º–µ—Ä, –≤–∞–ª–æ–≤–∞—è –ø—Ä–∏–±—ã–ª—å –æ—Ç —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –æ–∫–æ–ª–æ 60%, —á—Ç–æ –∑–Ω–∞—á–∏—Ç–µ–ª—å–Ω–æ –≤—ã—à–µ, —á–µ–º —É —Å—Ç–æ—Ä–æ–Ω–Ω–∏—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤ (–æ–∫–æ–ª–æ 13%). –≠—Ç–æ –¥–µ–ª–∞–µ—Ç —Ç–∞–∫–∏–µ –∫–æ–º–ø–∞–Ω–∏–∏ –±–æ–ª–µ–µ –ø—Ä–∏–≤–ª–µ–∫–∞—Ç–µ–ª—å–Ω—ã–º–∏ –¥–ª—è –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–π.

2. **–°—Ç—Ä—É–∫—Ç—É—Ä–Ω—ã–µ –∏–∑–º–µ–Ω–µ–Ω–∏—è –Ω–∞ —Ä—ã–Ω–∫–µ**: –ö–æ–º–ø–∞–Ω–∏–∏, –∫–æ—Ç–æ—Ä—ã–µ –≤—ã–∏–≥—Ä—ã–≤–∞—é—Ç –æ—Ç –ø–µ—Ä–µ—Å—Ç—

In [107]:
from langchain_core.load.serializable import to_json_not_implemented

repr = to_json_not_implemented(final_rag_chain)

In [108]:
repr

{'lc': 1,
 'type': 'not_implemented',
 'id': ['langchain_core', 'runnables', 'base', 'RunnableSequence'],
 'repr': "{\n  context: ChatPromptTemplate(input_variables=['question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='You are an AI language model assistant. Your task is to generate five \\ndifferent versions of the given user question to retrieve relevant documents from a vector \\ndatabase. By generating multiple perspectives on the user question, your goal is to help\\nthe user overcome some of the limitations of the distance-based similarity search. \\nProvide these alternative questions separated by newlines. Original question: {question}'))])\n           | ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x308f7a490>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x3424bcb90>, root_client=<openai.OpenAI object at 0x177459690>, root_async_client=<openai.AsyncOpenAI obj

In [104]:
string_representation = dumps(final_rag_chain, pretty=True)
print(string_representation[:500])

{
  "lc": 1,
  "type": "not_implemented",
  "id": [
    "langchain_core",
    "vectorstores",
    "base",
    "VectorStoreRetriever"
  ],
  "repr": "VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x111af9090>)",
  "name": "VectorStoreRetriever"
}


In [109]:
import json

with open("../../artifacts/chain.json", "w") as fp:
    json.dump(repr, fp)

In [120]:
with open("../../artifacts/chain.json", "r") as fp:
    chain_dict = json.load(fp)

In [121]:
chain_dict

{'lc': 1,
 'type': 'not_implemented',
 'id': ['langchain_core', 'runnables', 'base', 'RunnableSequence'],
 'repr': "{\n  context: ChatPromptTemplate(input_variables=['question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='You are an AI language model assistant. Your task is to generate five \\ndifferent versions of the given user question to retrieve relevant documents from a vector \\ndatabase. By generating multiple perspectives on the user question, your goal is to help\\nthe user overcome some of the limitations of the distance-based similarity search. \\nProvide these alternative questions separated by newlines. Original question: {question}'))])\n           | ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x308f7a490>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x3424bcb90>, root_client=<openai.OpenAI object at 0x177459690>, root_async_client=<openai.AsyncOpenAI obj