Import modules

In [1]:
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Weaviate
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

Load Documents

In [2]:
# Load the tutorial PDF from ./data with PyPDFLoader.
# `data` is whatever loader.load() returns; later cells iterate it and read `.page_content` from each item.
loader = PyPDFLoader('./data/Reading 2 Text Analytics for Beginners using NLTK_240116_161801.pdf')
data = loader.load()

In [3]:
# Number of items returned by the loader.
len(data)

5

In [4]:
# Concatenate the text of every loaded page into one string.
# str.join builds the result in a single pass instead of re-allocating the string on every `+=`.
# Pages are joined with an empty separator, so the resulting text (and the chunking that follows)
# is the same as with the incremental concatenation.
text_gen = ''.join(page.page_content for page in data)
text_gen

'Text Analytics for Beginners using NLTK \nReference: https://www.datacamp.com/community/tutor ials/text-analytics-beginners-nltk \nIn today\'s area of internet and online services, da ta is generating at incredible speed and amount. \nGenerally, Data analyst, engineer, and scientists a re handling relational or tabular data. These \ntabular data columns have either numerical or categ orical data. Generated data has a variety of \nstructures such as text, image, audio, and video. O nline activities such as articles, website text, \nblog posts, social media posts are generating unstr uctured textual data. Corporate and business \nneed to analyze textual data to understand customer  activities, opinion, and feedback to \nsuccessfully derive their business. To compete with  big textual data, text analytics is evolving at a \nfaster rate than ever before. \nIn this tutorial, you are going to cover the follow ing topics: \n\uf0b7 Text Analytics and NLP \n\uf0b7 Compare Text Analytics, NLP a

In [5]:
# Splitter used to break the concatenated PDF text into chunks before embedding.
# NOTE(review): chunk_overlap=5 is tiny relative to chunk_size=1024, so neighbouring chunks
# share almost no text — confirm this is intentional.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap = 5
)

In [6]:
# Split the full text into chunks and show how many were produced.
chunks_gen = text_splitter.split_text(text_gen)
len(chunks_gen)

8

In [7]:
# Wrap every text chunk in a Document (page_content only) so it can be indexed.
document_gen = [
    Document(page_content=chunk_text)
    for chunk_text in chunks_gen
]
# Display the first Document as a quick sanity check.
document_gen[0]

Document(page_content="Text Analytics for Beginners using NLTK \nReference: https://www.datacamp.com/community/tutor ials/text-analytics-beginners-nltk \nIn today's area of internet and online services, da ta is generating at incredible speed and amount. \nGenerally, Data analyst, engineer, and scientists a re handling relational or tabular data. These \ntabular data columns have either numerical or categ orical data. Generated data has a variety of \nstructures such as text, image, audio, and video. O nline activities such as articles, website text, \nblog posts, social media posts are generating unstr uctured textual data. Corporate and business \nneed to analyze textual data to understand customer  activities, opinion, and feedback to \nsuccessfully derive their business. To compete with  big textual data, text analytics is evolving at a \nfaster rate than ever before. \nIn this tutorial, you are going to cover the follow ing topics: \n\uf0b7 Text Analytics and NLP \n\uf0b7 Compare 

Store documents in the vector database

In [8]:
# BGE embedding model (BAAI/bge-small-en-v1.5); passed to the vector store below to embed the chunks.
embedding_model = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-small-en-v1.5')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [10]:
# Index the chunk Documents in Weaviate using the embedding model.
# Expects a Weaviate instance to be reachable at http://localhost:8080.
vector_store = Weaviate.from_documents(document_gen, 
                                    embedding_model, 
                                    weaviate_url = 'http://localhost:8080'
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [11]:
# Expose the vector store as a retriever configured for similarity search with k=2.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 2}
)

In [12]:
# Sample question reused by the search, LLM and RAG cells.
query = 'What is NLTK'

In [40]:
# Query the vector store directly (not through `retriever`) and print the text of the first hit.
docs = vector_store.similarity_search(query)
print(docs[0].page_content)

o Text Analysis Operations using NLTK 
o Tokenization 
o Stopwords 
o Lexicon Normalization such as Stemming and Lemmatiz ation  
o POS Tagging 
Text Analysis Operations using NLTK 
NLTK is a powerful Python package that provides a s et of diverse natural languages algorithms. 
It is free, opensource, easy to use, large communit y, and well documented. NLTK consists of the 
most common algorithms such as tokenizing, part-of- speech tagging, stemming, sentiment 
analysis, topic segmentation, and named entity reco gnition. NLTK helps the computer to 
analysis, preprocess, and understand the written te xt. 
Tokenization 
Tokenization is the first step in text analytics. T he process of breaking down a text paragraph into 
smaller chunks such as words or sentence is called Tokenization. Token is a single entity that is 
building blocks for sentence or paragraph. 
Sentence Tokenization 
Sentence tokenizer breaks text paragraph into sente nces. 
from nltk.tokenize import sent_tokenize


Create LLM Chain

In [14]:
# LLM handle for the Ollama model `mistral:7b-instruct-q4_K_M`, temperature 0.2.
llm = Ollama(model='mistral:7b-instruct-q4_K_M', temperature=0.2)

In [15]:
# Prompt text for the RAG chain: an instruction, the retrieved {context} and the user's {question},
# wrapped in [INST] ... [/INST] tags.
prompt_template = """
   ### [INST]
   Instruction: You are an expert at answering NLP questions.
   Here is context to help: {context}
   ##QUESTION:
   {question}
   [/INST]
"""

In [16]:
# Build the PromptTemplate, declaring the two placeholders used in prompt_template.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=prompt_template
)
# Echo the template object to inspect it.
prompt

PromptTemplate(input_variables=['context', 'question'], template='\n   ### [INST]\n   Instruction: You are an expert at answering NLP questions.\n   Here is context to help: {context}\n   ##QUESTION:\n   {question}\n   [/INST]\n')

In [17]:
# Send the raw question straight to the LLM, without any retrieved context.
llm.invoke(query)

'NLTK stands for Natural Language Toolkit. It is a popular Python library used for natural language processing (NLP) tasks such as tokenization, stemming, tagging, parsing, and classification. NLTK provides a wide range of tools and resources for working with text data, including pre-trained models, corpora, and utilities for data cleaning and preprocessing. It is widely used in academia and industry for tasks such as sentiment analysis, machine translation, and information extraction.'

RAG Chain

In [18]:
def format_docs(docs):
    """Join the text of the retrieved Documents into one context string.

    Args:
        docs: Documents returned by the retriever.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return '\n\n'.join(doc.page_content for doc in docs)


# The dict fans the incoming question out to two branches:
#   context  -> the retriever fetches matching chunks, then format_docs flattens them to plain text.
#               Without that step the list of Document objects would be interpolated into the
#               prompt as its Python repr rather than as readable text.
#   question -> passed through unchanged.
rag_chain = (
    {'context': retriever | format_docs, 'question': RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [19]:
# Ask the same question through the RAG chain (retrieval + prompt + LLM).
rag_chain.invoke(query)

' NLTK stands for Natural Language Toolkit. It is a powerful Python package that provides a set of diverse natural language algorithms. NLTK is free, opensource, easy to use, has a large community, and is well documented. NLTK consists of the most common algorithms such as tokenizing, part-of-speech tagging, stemming, sentiment analysis, topic segmentation, and named entity recognition. NLTK helps computers analyze, preprocess, and understand written text.'

Conversation Memory

In [26]:
from langchain.schema import format_document
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel, RunnableLambda
from langchain.memory import ConversationBufferMemory

from operator import itemgetter

In [21]:
# Two handles on the same Ollama model, differing only in temperature:
#   standalone_llm (temperature 0.0) is used to rewrite follow-up questions,
#   response_llm   (temperature 0.2) is used to produce the final answer.
standalone_llm = Ollama(model='mistral:7b-instruct-q4_K_M', temperature=0.0, repeat_penalty=1.1)
response_llm = Ollama(model='mistral:7b-instruct-q4_K_M', temperature=0.2, repeat_penalty=1.1)

In [22]:
from langchain_core.prompts.chat import ChatPromptTemplate
# Prompt that asks the model to rewrite a follow-up question as a standalone question.
# It takes {chat_history} and {question} and includes two worked examples
# (one with no history, one with history).
_template = """
[INST] 
Given the following conversation and a follow up question, 
rephrase the follow up question to be a standalone question, in its original language, 
that can be used to query a Weaviate index. This query will be used to retrieve documents with additional context.

Let me share a couple examples.

If you do not see any chat history, you MUST return the "Follow Up Input" as is:
```
Chat History:
Follow Up Input: How is Lawrence doing?
Standalone Question:
How is Lawrence doing?
```

If this is the second question onwards, you should properly rephrase the question like this:
```
Chat History:
Human: How is Lawrence doing?
AI: 
Lawrence is injured and out for the season.
Follow Up Input: What was his injury?
Standalone Question:
What was Lawrence's injury?
```

Now, with those examples, here is the actual chat history and input question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
[your response here]
[/INST] 
"""

# Prompt object built from the template string above.
STANDALONE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [24]:
# Instantiate ConversationBufferMemory.
# Turns are saved under input key 'question' and output key 'answer';
# return_messages=True is set so history comes back as messages (flattened later with get_buffer_string).
memory = ConversationBufferMemory(
    return_messages=True, output_key='answer', input_key='question'
)

In [27]:
# First load memory to access chat history:
# adds a `chat_history` key to the chain input, taken from the 'history' entry
# of memory.load_memory_variables(...).
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter('history')
)

In [28]:
# Standalone-question step (prompt only): pick the question, flatten the chat history
# to a string, and render STANDALONE_QUESTION_PROMPT. No LLM is attached in this variant,
# so the value under 'standalone_question' is the rendered prompt.
standalone_question = {
    'standalone_question': (
        {
            'question': itemgetter('question'),
            'chat_history': lambda inp: get_buffer_string(inp['chat_history']),
        }
        | STANDALONE_QUESTION_PROMPT
    )
}

In [45]:
# Finally, expose the result of the standalone-question step (the rendered
# STANDALONE_QUESTION_PROMPT) under the key 'standalone_question_prompt_result'.
output_prompt = {
    'standalone_question_prompt_result': itemgetter('standalone_question')
}

In [46]:
# Combine to a final chain: load memory -> render the standalone-question prompt -> select the result.
# This chain stops at the rendered prompt; it does not call an LLM.
standalone_query_generation_prompt = loaded_memory | standalone_question | output_prompt

In [47]:
# Seed the memory with one question/answer turn so the follow-up question has history to refer to.
inputs = {'question': 'What is NLTK'}
memory.save_context(inputs, {'answer': 'NLTK is a library that performs NLP tasks'})

In [48]:
# Ask a follow-up that depends on the stored history and display the rendered prompt.
inputs = {'question': 'What is it?'}
standalone_query_generation_prompt.invoke(inputs)['standalone_question_prompt_result']

StringPromptValue(text='\n[INST] \nGiven the following conversation and a follow up question, \nrephrase the follow up question to be a standalone question, in its original language, \nthat can be used to query a Weaviate index. This query will be used to retrieve documents with additional context.\n\nLet me share a couple examples.\n\nIf you do not see any chat history, you MUST return the "Follow Up Input" as is:\n```\nChat History:\nFollow Up Input: How is Lawrence doing?\nStandalone Question:\nHow is Lawrence doing?\n```\n\nIf this is the second question onwards, you should properly rephrase the question like this:\n```\nChat History:\nHuman: How is Lawrence doing?\nAI: \nLawrence is injured and out for the season.\nFollow Up Input: What was his injury?\nStandalone Question:\nWhat was Lawrence\'s injury?\n```\n\nNow, with those examples, here is the actual chat history and input question.\nChat History:\nHuman: What is NLTK\nAI: NLTK is a library that performs NLP tasks\nHuman: Wha

In [33]:
# End-to-end standalone-question chain:
# memory -> prompt inputs (question + flattened history) -> prompt -> standalone_llm.
standalone_query_generation_chain = (
    loaded_memory
    | {
        'question': itemgetter('question'),
        'chat_history': lambda inp: get_buffer_string(inp['chat_history']),
    }
    | STANDALONE_QUESTION_PROMPT
    | standalone_llm
)

In [35]:
# Run the follow-up through the full chain; the LLM returns the rewritten standalone question.
inputs = {'question': 'What is it?'}
standalone_query_generation_chain.invoke(inputs)

'What is NLTK?'

In [36]:
# Answer prompt text: instructs the model to answer {standalone_question} using only the supplied {context}.
template = """
    [INST] 
    Answer the question based only on the following context:
    {context}

    Question: {standalone_question}
    [/INST] 
"""

In [38]:
# Prompt for the final answer, built from `template`.
# NOTE(review): this is a ChatPromptTemplate, whereas the other prompts in this notebook are
# PromptTemplate and the models are created with `Ollama(...)` — confirm the chat wrapper is intended.
RESPONSE_PROMPT = ChatPromptTemplate.from_template(template=template)
# Per-document prompt: renders a document as just its page_content.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template='{page_content}')

In [42]:
def combine_documents(document_gen, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator='\n\n'):
    """Render each document with ``document_prompt`` and join the results.

    Args:
        document_gen: Iterable of documents to render. Inside the function this
            parameter shadows the notebook-level ``document_gen`` list.
        document_prompt: Prompt passed to ``format_document`` for every document.
        document_separator: String placed between the rendered documents.

    Returns:
        One string containing every rendered document, in input order.
    """
    rendered = []
    for doc in document_gen:
        rendered.append(format_document(doc, document_prompt))
    return document_separator.join(rendered)

In [43]:
# Re-binds `loaded_memory` with the same expression used when it was first defined:
# adds `chat_history` (the 'history' entry of memory.load_memory_variables) to the chain input.
# NOTE(review): duplicate definition — the earlier cell already provides this; consider removing one.
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter('history')
)

In [51]:
# Standalone-question step for the conversational chain:
# build the prompt inputs (question + chat history flattened to a string), render
# STANDALONE_QUESTION_PROMPT and let standalone_llm produce the rewritten question.
# The prompt is referenced by the name it is defined under in this notebook
# (STANDALONE_QUESTION_PROMPT); no CONDENSE_QUESTION_PROMPT exists, so that name raised NameError.
standalone_question = {
    'standalone_question': {
        'question': lambda x: x['question'],
        'chat_history': lambda x: get_buffer_string(x['chat_history'])
    }
    | STANDALONE_QUESTION_PROMPT
    | standalone_llm
}

In [55]:
# Retrieve documents for the rewritten question and carry the question forward.
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "standalone_question": itemgetter("standalone_question"),
}

# Now we construct the inputs for the final prompt:
# the retrieved docs are flattened to one context string, the question is passed along.
final_inputs = {
    "context": lambda x: combine_documents(x["docs"]),
    "standalone_question": itemgetter("standalone_question"),
}

# And finally, we do the part that returns the answers.
# The inputs must be rendered through RESPONSE_PROMPT (which consumes `context` and
# `standalone_question`) before reaching the LLM. Piping them into `output_prompt` — a plain
# dict belonging to the earlier prompt-inspection chain — never renders a prompt and hands
# the LLM a dict instead of text.
answer = {
    "answer": final_inputs | RESPONSE_PROMPT | response_llm,
    "standalone_question": itemgetter("standalone_question"),
    "context": final_inputs["context"]
}
# And now we put it all together!
# memory -> standalone question -> retrieval -> answer
final_chain = loaded_memory | standalone_question | retrieved_documents | answer