## LangChain

In [None]:
%pip install -qU pypdf
%pip install langchain_community
%pip install -qU langchain-text-splitters
%pip install -qU langchain-huggingface
%pip install -qU "langchain-astradb>=0.3.3"
%pip install -qU langchain-mistralai

In [None]:
#import statements
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_mistralai import MistralAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from uuid import uuid4
from langchain_mistralai import ChatMistralAI
from langchain_astradb import AstraDBVectorStore
import getpass
import os

In [None]:
# Path of the single PDF that the rest of the notebook loads and indexes.
pdf_docs = "/content/Lecun2015.pdf"

In [None]:
# Service credentials are read from the process environment; each value is
# None when the corresponding variable is not set.
ASTRA_DB_API_ENDPOINT = os.environ.get("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

In [None]:
# Mistral chat model client (mistral-large-latest, temperature 0.7, max_retries=2).
model = ChatMistralAI(model="mistral-large-latest", temperature=0.7, max_retries=2)

In [None]:
# Embedding client handed to the vector store for indexing and querying.
embeddings_model = MistralAIEmbeddings(
    model="mistral-embed",
)

In [None]:
def get_pdf_text(pdf_docs):
    """Load a PDF file and return its contents as a list of documents.

    Args:
        pdf_docs: Path of the PDF file, passed straight to ``PyPDFLoader``.

    Returns:
        A new list containing every item produced by ``PyPDFLoader.load()``,
        in the order the loader returned them.
    """
    return list(PyPDFLoader(pdf_docs).load())

In [None]:
# Load the PDF once; `p` holds the loaded pages for the chunking step.
p = get_pdf_text(pdf_docs=pdf_docs)

In [None]:
def get_chunks(pages):
    """Split loaded pages into overlapping text chunks.

    Each page is split independently (chunks never span two pages) into
    pieces of at most 1000 characters with a 200-character overlap. The
    splitter tries paragraph breaks, then line breaks, then sentence ends,
    then spaces, and finally individual characters.

    Args:
        pages: Iterable of documents exposing ``page_content`` and
            ``metadata`` (e.g. the output of ``get_pdf_text``).

    Returns:
        A list of ``Document`` chunks. Each chunk carries a copy of the
        metadata of the page it was cut from, so the origin of a retrieved
        chunk stays traceable.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        # Raw string: "\." is an invalid escape sequence in a normal literal.
        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        # Without this flag every separator is escaped and matched literally,
        # so the sentence-boundary lookbehind would never match anything.
        is_separator_regex=True,
    )
    # split_documents() splits page by page and propagates page metadata,
    # which rebuilding Document(page_content=...) by hand would discard.
    return text_splitter.split_documents(pages)

In [None]:
# Chunk the loaded pages; the trailing expression shows the type of the first
# chunk as a quick sanity check (it raises IndexError if no chunks were produced).
s = get_chunks(pages=p)
type(s[0])

In [None]:
def get_vectorstore(embeddings_model, splitted_text):
    """Index the given chunks in the ``pdf_store`` Astra DB collection.

    Args:
        embeddings_model: Embedding client used by the vector store.
        splitted_text: List of documents to add to the collection.

    Returns:
        The ``AstraDBVectorStore`` instance after the documents were added.

    Note:
        Connection settings come from the module-level
        ``ASTRA_DB_API_ENDPOINT`` and ``ASTRA_DB_APPLICATION_TOKEN``. Every
        document gets a freshly generated random UUID as its id on each call.
    """
    store = AstraDBVectorStore(
        collection_name="pdf_store",
        embedding=embeddings_model,
        api_endpoint=ASTRA_DB_API_ENDPOINT,
        token=ASTRA_DB_APPLICATION_TOKEN,
    )

    # One random id per document, in document order.
    document_ids = [str(uuid4()) for _ in splitted_text]
    store.add_documents(documents=splitted_text, ids=document_ids)

    return store

In [None]:
def get_qa_chain(model, vector_store, user_query=None, session_store=None):
    """Build a conversational retrieval QA chain with per-session chat history.

    The chain first rewrites the incoming question into a standalone question
    using the chat history, retrieves context with that question, and then
    answers from the retrieved context.

    Args:
        model: Chat model used for both the question-rewriting step and the
            answer-generation step.
        vector_store: Vector store; its ``as_retriever()`` supplies the
            context documents.
        user_query: Unused. Accepted only so existing three-argument calls
            keep working; the question is supplied when the chain is invoked.
        session_store: Optional dict mapping session ids to chat histories.
            Pass the same dict to successive calls to keep conversation
            history when the chain is rebuilt. When omitted, a fresh dict
            private to the returned chain is used.

    Returns:
        A ``RunnableWithMessageHistory`` wrapping the retrieval chain. It
        reads the question from the ``"input"`` key, injects history under
        ``"chat_history"`` and records the ``"answer"`` output. The session is
        selected at invoke time via ``config={"configurable": {"session_id": ...}}``.
    """
    # `is None` rather than `or`: an empty dict supplied by the caller must be
    # used (and filled) as-is, not replaced by a new one.
    if session_store is None:
        session_store = {}

    retriever = vector_store.as_retriever()

    ### Contextualize question ###
    # NOTE: the indentation of the continuation lines is part of the prompt
    # text; it is kept exactly as it was.
    contextualize_q_system_prompt = """Given a chat history and the latest user question \
    which might reference context in the chat history, formulate a standalone question \
    which can be understood without the chat history. Do NOT answer the question, \
    just reformulate it if needed and otherwise return it as is."""
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    history_aware_retriever = create_history_aware_retriever(
        model, retriever, contextualize_q_prompt
    )

    ### Answer question ###
    qa_system_prompt = """You are an assistant for question-answering tasks. \
    Use the following pieces of retrieved context to answer the question. \
    If you don't know the answer, just say that you don't know. \
    Use three sentences maximum and keep the answer concise.\

    {context}"""
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", qa_system_prompt),
            MessagesPlaceholder("chat_history"),
            ("human", "{input}"),
        ]
    )
    question_answer_chain = create_stuff_documents_chain(model, qa_prompt)

    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    ### Statefully manage chat history ###
    def get_session_history(session_id: str) -> BaseChatMessageHistory:
        """Return the history for ``session_id``, creating it on first use."""
        if session_id not in session_store:
            session_store[session_id] = ChatMessageHistory()
        return session_store[session_id]

    conversational_rag_chain = RunnableWithMessageHistory(
        rag_chain,
        get_session_history,
        input_messages_key="input",
        history_messages_key="chat_history",
        output_messages_key="answer",
    )
    return conversational_rag_chain

In [None]:
# Embed and store the chunks; `v` is the populated vector store.
v = get_vectorstore(embeddings_model=embeddings_model, splitted_text=s)

In [None]:
# Question sent to the QA chain in the final cell.
user_query = 'define RNN'

In [None]:
# Build the conversational chain and ask the question under session "abc123";
# the trailing expression evaluates to the generated answer.
g = get_qa_chain(model=model, vector_store=v, user_query=user_query)
g.invoke(
    {"input": user_query}, config={"configurable": {"session_id": "abc123"}}
)["answer"]