In [1]:
# Import required modules
from langchain import hub
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.llms import Ollama
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
import streamlit as st
import os
import time

ModuleNotFoundError: No module named 'langchain'

In [2]:
# Notebook-wide settings, gathered in one place so they are easy to change.
FILEPATH = "sample.pdf"  # PDF handed to PyPDFLoader
LOCAL_MODEL = "llama2"  # model name passed to the Ollama LLM
EMBEDDING = "nomic-embed-text"  # model name passed to OllamaEmbeddings

In [3]:
# Load the PDF named by FILEPATH into a list of documents.
loader = PyPDFLoader(FILEPATH)
data = loader.load()

# Split the loaded documents into overlapping chunks
# (chunk_size=1500, chunk_overlap=100) for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1500,
)
all_splits = text_splitter.split_documents(data)

In [4]:
# Directory handed to Chroma as its on-disk location.
persist_directory = "data"

# Embed every chunk with the Ollama embedding model and index the result
# in a Chroma vector store.
vectorstore = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=OllamaEmbeddings(model=EMBEDDING),
    documents=all_splits,
)

# NOTE(review): no explicit persist() call is made here. Newer Chroma
# releases are expected to write to `persist_directory` automatically;
# confirm this for the installed version.

In [5]:
# LLM served by an Ollama instance at the given base URL.
llm = Ollama(
    base_url="http://localhost:11434",
    model=LOCAL_MODEL,
    verbose=True,
    # `callbacks` is the supported replacement for the deprecated
    # `callback_manager` argument; the handler prints generated tokens
    # to stdout while the model is responding.
    callbacks=[StreamingStdOutCallbackHandler()],
)

# Retriever view over the vector store; the QA chain uses it to fetch context.
retriever = vectorstore.as_retriever()

In [6]:
# Prompt text sent to the LLM. The {context}, {history} and {question}
# placeholders are filled in when the chain runs. All whitespace inside the
# literal (leading space, indentation, the whitespace-only line) is part of
# the prompt and is kept exactly as is.
template = """ You are a knowledgeable chatbot, here to help with questions of the user. Your tone should be professional and informative.
    
    Context: {context}
    History: {history}

    User: {question}
    Chatbot:
    """

# Wrap the raw text in a PromptTemplate that declares its three variables.
prompt = PromptTemplate(
    template=template,
    input_variables=["history", "context", "question"],
)

# Conversation memory that fills the prompt's {history} placeholder.
# `input_key="question"` tells the memory which chain input is the user's turn.
# `return_messages=False` makes the memory return the transcript as one string.
# The prompt is a plain string template, so a list of message objects
# (`return_messages=True`) would be interpolated as a Python list repr
# instead of readable conversation text.
memory = ConversationBufferMemory(
    memory_key="history",
    input_key="question",
    return_messages=False,
)


In [7]:
# Assemble the retrieval QA chain: the retriever supplies context documents
# and the "stuff" chain type is used to combine them for the LLM.
# `chain_type_kwargs` passes the custom prompt and the conversation memory
# to that inner chain.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
    chain_type_kwargs=dict(
        verbose=True,
        prompt=prompt,
        memory=memory,
    ),
)

In [8]:
# User question followed by the answer-style instructions; the two adjacent
# literals are joined into a single string.
query = (
    "What clustering method have been implemented"
    ". Only from this pdf. Keep it short"
)

In [9]:
# Run the chain on the question; as the last expression in the cell, the
# returned value is displayed as the cell output.
qa_chain.invoke(input={"query": query})



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m You are a knowledgeable chatbot, here to help with questions of the user. Your tone should be professional and informative.
    
    Context: 4.2
Hierarchical
Clustering:
Statistical
methods
such
as
hierarchical
clustering
analyze
data
by
building
clusters
in
a
hierarchy.
This
algorithm
employs
two
strategies,
agglomerative
and
divisive.
K
can
be
exactly
set
in
Hierarchical
Clustering
as
in
K-means,
and
n
is
the
number
of
data
points
such
that
n>K.
The
agglomerative
Hierarchical
Clustering
gathers
data
starting
with
n
clusters
and
continues
until
K
clusters
are
obtained.
The
divisive
begins
with
a
single
cluster
and
subsequently
splits
into
K
clusters
based
on
their
commonalities.
4.3
Affinity
Propagation
Clustering:
The
most
popular
clustering
technique
in
data
science
is
affinity
propagation.
Neither
the


{'query': 'What clustering method have been implemented. Only from this pdf. Keep it short',
 'result': 'Based on the provided PDF, the following clustering methods have been implemented:\n\n1. Hierarchical Clustering: This method builds a hierarchy of clusters by merging or splitting them repeatedly until a desired number of clusters is obtained.\n2. Affinity Propagation Clustering: This method propagates similarities between data points to group them into clusters. It does not require the number of clusters as input, and it benefits from not being dependent on the luck of the first cluster centroid pick.\n3. Responsibility Matrix (R): This method calculates a responsibility matrix for each cell using the following formula, and then updates off-diagonal elements using the following equation.'}