In [1]:
import os 
import pandas as pd

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

In [2]:
from dotenv import load_dotenv
# Load environment variables
load_dotenv('../../.env')

True

In [3]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens = 1000)

In [4]:
df_articles = pd.read_csv('../../data/df_articles.csv').dropna()
df_articles['content'] = df_articles['title'] + df_articles['content']

df_articles_analytics = pd.read_csv('../../data/df_articles_analytics.csv').dropna()
df_articles_analytics['content'] = df_articles_analytics['title'] + df_articles_analytics['content']

documents = df_articles['content'].to_list() + df_articles_analytics['content'].to_list()
with open('../../data/courses.txt', 'r') as f:
    documents += list(f.read().split('\n\n\n\n'))

In [5]:
import os
import PyPDF2

def load_pdfs_from_folder(folder_path="../../data/pdf/"):
    documents = []

    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)

            with open(file_path, "rb") as f:
                pdf_reader = PyPDF2.PdfReader(f)
                text = ""
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + " "

            documents.append(text)

    return documents

In [6]:
pdf_docs = load_pdfs_from_folder()

In [7]:
len(pdf_docs)

17

In [9]:
documents += pdf_docs

In [11]:
documents = [doc.replace('\xa0', ' ').replace('\n', ' ') for doc in documents]

In [12]:
len(documents)

559

In [13]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_text(" ".join(documents))

In [14]:
len(chunks)

37152

In [15]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="DeepPavlov/rubert-base-cased-sentence", model_kwargs={'device': 'cpu'})

db = Chroma.from_texts(chunks, embedding_function, persist_directory="../../artifacts/chroma_db")

  embedding_function = SentenceTransformerEmbeddings(model_name="DeepPavlov/rubert-base-cased-sentence", model_kwargs={'device': 'cpu'})
No sentence-transformers model found with name DeepPavlov/rubert-base-cased-sentence. Creating a new one with mean pooling.


In [16]:
import pickle


with open("../../artifacts/embedding_function.pkl", "wb") as file:
    pickle.dump(embedding_function, file)

In [17]:
retriever = db.as_retriever()

In [18]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives 
    | llm
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [19]:
from langchain.load import dumps, loads, load

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

In [20]:
question = "–ö–∞–∫–∏–µ –∞–∫—Ü–∏–∏ IT –∫–æ–º–ø–∞–Ω–∏–π —Å–µ–π—á–∞—Å –Ω–∞–∏–±–æ–ª–µ–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã –¥–ª—è –ø—Ä–∏–æ–±—Ä–µ—Ç–µ–Ω–∏—è?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  return [loads(doc) for doc in unique_docs]


12

In [21]:
docs

[Document(page_content='–ú—ã –Ω–µ –∏—Å–∫–ª—é—á–∞–µ–º, —á—Ç–æ —Ç–∞–∫–∞—è —Å—Ç—Ä–∞—Ç–µ–≥–∏—è —Ä–∞–∑–≤–∏—Ç–∏—è –ø—Ä–æ–¥–æ–ª–∂–∏—Ç—Å—è –≤ –¥–∞–ª—å–Ω–µ–π—à–µ–º –∏ –ø–æ–º–æ–∂–µ—Ç –≥—Ä—É–ø–ø–µ —É–≤–µ–ª–∏—á–∏—Ç—å —Å–ø–µ–∫—Ç—Ä –ø—Ä–µ–¥–ª–∞–≥–∞–µ–º—ã—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤ –∏ —É—Å–ª—É–≥. –ö–æ–º–ø–∞–Ω–∏—è –ø–ª–∞–Ω–∏—Ä—É–µ—Ç –¥–æ–±–∏—Ç—å—Å—è —ç—Ç–æ–≥–æ –∑–∞ —Å—á–µ—Ç: –ú–µ–Ω–µ–¥–∂–º–µ–Ω—Ç –ì—Ä—É–ø–ø—ã –ê—Ä–µ–Ω–∞–¥–∞—Ç—ã –ø—Ä–æ–≥–Ω–æ–∑–∏—Ä—É–µ—Ç —Ä–æ—Å—Ç —á–∏—Å–ª–∞ –∑–∞–∫–∞–∑—á–∏–∫–æ–≤ –≤ —à–µ—Å—Ç—å —Ä–∞–∑ –≤ —Å—Ä–µ–¥–Ω–µ—Å—Ä–æ—á–Ω–æ–π –ø–µ—Ä—Å–ø–µ–∫—Ç–∏–≤–µ. –î–ª—è –¥–æ—Å—Ç–∏–∂–µ–Ω–∏—è —ç—Ç–æ–π —Ü–µ–ª–∏ –∫–æ–º–ø–∞–Ω–∏—è –ø–ª–∞–Ω–∏—Ä—É–µ—Ç —Ä–∞—Å—à–∏—Ä–∏—Ç—å –ø–∞—Ä—Ç–Ω–µ—Ä—Å–∫—É—é —Å–µ—Ç—å –∏ –æ—Ç–¥–µ–ª –ø—Ä—è–º—ã—Ö –ø—Ä–æ–¥–∞–∂ –≤–¥–≤–æ–µ. –û—Ç–¥–µ–ª—å–Ω–æ–µ –≤–Ω–∏–º–∞–Ω–∏–µ –ø–æ–ª—É—á–∏—Ç —Å–µ–≥–º–µ–Ω—Ç –º–∞–ª–æ–≥–æ –∏ —Å—Ä–µ–¥–Ω–µ–≥–æ –±–∏–∑–Ω–µ—Å–∞, –¥–ª—è –∫–æ—Ç–æ—Ä–æ–≥–æ –∞–∫—Ç—É–∞–ª—å–Ω—ã –æ–±–ª–∞—á–Ω—ã–µ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∏. –ß—Ç–æ —Å —Ñ–∏–Ω–∞–Ω—Å–∞–º–∏ –∫–æ–º–ø–∞–Ω–∏–∏ –û—Å–Ω–æ–≤–Ω—É—é –≤—ã—

In [22]:
from operator import itemgetter

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

res = final_rag_chain.invoke({"question":question})

print(res)

–ù–∞ –æ—Å–Ω–æ–≤–∞–Ω–∏–∏ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω–æ–≥–æ –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞, –º–æ–∂–Ω–æ –≤—ã–¥–µ–ª–∏—Ç—å –Ω–µ—Å–∫–æ–ª—å–∫–æ —Ñ–∞–∫—Ç–æ—Ä–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –¥–µ–ª–∞—é—Ç –∞–∫—Ü–∏–∏ IT-–∫–æ–º–ø–∞–Ω–∏–π –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–º–∏ –¥–ª—è –ø—Ä–∏–æ–±—Ä–µ—Ç–µ–Ω–∏—è:

1. **–†–∞–∑–≤–∏—Ç–∏–µ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤**: –£–≤–µ–ª–∏—á–µ–Ω–∏–µ –¥–æ–ª–∏ —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤ –≤ –æ–±—â–µ–º –æ–±—ä–µ–º–µ –ø—Ä–æ–¥–∞–∂, —á—Ç–æ –ø–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω–æ —Å–∫–∞–∑—ã–≤–∞–µ—Ç—Å—è –Ω–∞ —Ä–µ–Ω—Ç–∞–±–µ–ª—å–Ω–æ—Å—Ç–∏. –ù–∞–ø—Ä–∏–º–µ—Ä, –≤ –ø–µ—Ä–≤–æ–º –ø–æ–ª—É–≥–æ–¥–∏–∏ 2024 –≥–æ–¥–∞ –¥–æ–ª—è —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤ –≤—ã—Ä–æ—Å–ª–∞ –¥–æ 30% —Å –º–µ–Ω–µ–µ 3% –≤ 2020 –≥–æ–¥—É.

2. **–†–æ—Å—Ç –≤–∞–ª–æ–≤–æ–π –º–∞—Ä–∂–∏**: –í–∞–ª–æ–≤–∞—è –ø—Ä–∏–±—ã–ª—å –æ—Ç —Å–æ–±—Å—Ç–≤–µ–Ω–Ω—ã—Ö –ø—Ä–æ–¥—É–∫—Ç–æ–≤ —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç –æ–∫–æ–ª–æ 60%, –≤ —Ç–æ –≤—Ä–µ–º—è –∫–∞–∫ –æ—Ç —Å—Ç–æ—Ä–æ–Ω–Ω–∏—Ö ‚Äî –≤—Å–µ–≥–æ –æ–∫–æ–ª–æ 13%. –≠—Ç–æ —É–∫–∞–∑—ã–≤–∞–µ—Ç –Ω–∞ –±–æ–ª–µ–µ –≤—ã—Å–æ–∫—É—é

In [107]:
from langchain_core.load.serializable import to_json_not_implemented

repr = to_json_not_implemented(final_rag_chain)

In [108]:
repr

{'lc': 1,
 'type': 'not_implemented',
 'id': ['langchain_core', 'runnables', 'base', 'RunnableSequence'],
 'repr': "{\n  context: ChatPromptTemplate(input_variables=['question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='You are an AI language model assistant. Your task is to generate five \\ndifferent versions of the given user question to retrieve relevant documents from a vector \\ndatabase. By generating multiple perspectives on the user question, your goal is to help\\nthe user overcome some of the limitations of the distance-based similarity search. \\nProvide these alternative questions separated by newlines. Original question: {question}'))])\n           | ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x308f7a490>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x3424bcb90>, root_client=<openai.OpenAI object at 0x177459690>, root_async_client=<openai.AsyncOpenAI obj

In [104]:
string_representation = dumps(final_rag_chain, pretty=True)
print(string_representation[:500])

{
  "lc": 1,
  "type": "not_implemented",
  "id": [
    "langchain_core",
    "vectorstores",
    "base",
    "VectorStoreRetriever"
  ],
  "repr": "VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x111af9090>)",
  "name": "VectorStoreRetriever"
}


In [109]:
import json

with open("../../artifacts/chain.json", "w") as fp:
    json.dump(repr, fp)

In [120]:
with open("../../artifacts/chain.json", "r") as fp:
    chain_dict = json.load(fp)

In [121]:
chain_dict

{'lc': 1,
 'type': 'not_implemented',
 'id': ['langchain_core', 'runnables', 'base', 'RunnableSequence'],
 'repr': "{\n  context: ChatPromptTemplate(input_variables=['question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='You are an AI language model assistant. Your task is to generate five \\ndifferent versions of the given user question to retrieve relevant documents from a vector \\ndatabase. By generating multiple perspectives on the user question, your goal is to help\\nthe user overcome some of the limitations of the distance-based similarity search. \\nProvide these alternative questions separated by newlines. Original question: {question}'))])\n           | ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x308f7a490>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x3424bcb90>, root_client=<openai.OpenAI object at 0x177459690>, root_async_client=<openai.AsyncOpenAI obj