Import modules

In [1]:
from langchain_community.llms import Ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Weaviate
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

Load Documents

In [2]:
# Point a PDF loader at the source reading and pull its contents into `data`.
loader = PyPDFLoader(
    '../data/Reading 2 Text Analytics for Beginners using NLTK_240116_161801.pdf'
)
data = loader.load()

In [3]:
# Number of items the loader returned.
len(data)

5

In [4]:
# Concatenate the text of every loaded page into one string (no separator
# is inserted between pages).
text_gen = ''.join(page.page_content for page in data)

In [5]:
# Splitter configuration: chunk_size=1024 with a chunk_overlap of 5.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=5)

In [6]:
# Split the combined text into chunks and report how many were produced.
chunks_gen = text_splitter.split_text(text_gen)
len(chunks_gen)

8

In [7]:
# Wrap each text chunk in a Document so it can be handed to the vector store.
document_gen = [
    Document(page_content=chunk)
    for chunk in chunks_gen
]
# Preview the first wrapped chunk.
document_gen[0]

Document(page_content="Text Analytics for Beginners using NLTK \nReference: https://www.datacamp.com/community/tutor ials/text-analytics-beginners-nltk \nIn today's area of internet and online services, da ta is generating at incredible speed and amount. \nGenerally, Data analyst, engineer, and scientists a re handling relational or tabular data. These \ntabular data columns have either numerical or categ orical data. Generated data has a variety of \nstructures such as text, image, audio, and video. O nline activities such as articles, website text, \nblog posts, social media posts are generating unstr uctured textual data. Corporate and business \nneed to analyze textual data to understand customer  activities, opinion, and feedback to \nsuccessfully derive their business. To compete with  big textual data, text analytics is evolving at a \nfaster rate than ever before. \nIn this tutorial, you are going to cover the follow ing topics: \n\uf0b7 Text Analytics and NLP \n\uf0b7 Compare 

Store documents in the vector database

In [8]:
# Embedding model that is passed to the vector store when the chunks are indexed.
embedding_model = HuggingFaceBgeEmbeddings(model_name='BAAI/bge-small-en-v1.5')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [10]:
# Index the chunk documents in the Weaviate instance at localhost:8080,
# using `embedding_model` as the embedding argument.
vector_store = Weaviate.from_documents(
    document_gen,
    embedding_model,
    weaviate_url='http://localhost:8080',
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [11]:
# Expose the vector store as a retriever: similarity search with k=2.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 2})

In [12]:
# Question reused by the retrieval check, the plain LLM call, and the RAG chain.
query = 'What is NLTK'

In [40]:
# Sanity-check retrieval: search the vector store directly and print the first hit.
# NOTE(review): this call bypasses `retriever`, so its search_kwargs (k=2) do not
# apply here. This cell's execution count (40) is also out of sequence with its
# neighbours — re-run the notebook top to bottom to confirm it still works.
docs = vector_store.similarity_search(query)
print(docs[0].page_content)

o Text Analysis Operations using NLTK 
o Tokenization 
o Stopwords 
o Lexicon Normalization such as Stemming and Lemmatiz ation  
o POS Tagging 
Text Analysis Operations using NLTK 
NLTK is a powerful Python package that provides a s et of diverse natural languages algorithms. 
It is free, opensource, easy to use, large communit y, and well documented. NLTK consists of the 
most common algorithms such as tokenizing, part-of- speech tagging, stemming, sentiment 
analysis, topic segmentation, and named entity reco gnition. NLTK helps the computer to 
analysis, preprocess, and understand the written te xt. 
Tokenization 
Tokenization is the first step in text analytics. T he process of breaking down a text paragraph into 
smaller chunks such as words or sentence is called Tokenization. Token is a single entity that is 
building blocks for sentence or paragraph. 
Sentence Tokenization 
Sentence tokenizer breaks text paragraph into sente nces. 
from nltk.tokenize import sent_tokenize


Create LLM Chain

In [14]:
# LLM accessed through the Ollama wrapper; the model tag and temperature are set here.
llm = Ollama(model='mistral:7b-instruct-q4_K_M', temperature=0.2)

In [15]:
# Instruction prompt wrapped in [INST] tags. The {context} and {question}
# placeholders are filled in when the template is formatted.
prompt_template = (
    "\n"
    "   ### [INST]\n"
    "   Instruction: You are an expert at answering NLP questions.\n"
    "   Here is context to help: {context}\n"
    "   ##QUESTION:\n"
    "   {question}\n"
    "   [/INST]\n"
)

In [16]:
# Build the prompt object from the template string, declaring its two variables.
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
# Show the constructed prompt.
prompt

PromptTemplate(input_variables=['context', 'question'], template='\n   ### [INST]\n   Instruction: You are an expert at answering NLP questions.\n   Here is context to help: {context}\n   ##QUESTION:\n   {question}\n   [/INST]\n')

In [17]:
# Baseline: send the raw query straight to the LLM, with no retrieved context,
# for comparison with the RAG chain's answer.
llm.invoke(query)

'NLTK stands for Natural Language Toolkit. It is a popular Python library used for natural language processing (NLP) tasks such as tokenization, stemming, tagging, parsing, and classification. NLTK provides a wide range of tools and resources for working with text data, including pre-trained models, corpora, and utilities for data cleaning and preprocessing. It is widely used in academia and industry for tasks such as sentiment analysis, machine translation, and information extraction.'

RAG Chain

In [18]:
def format_docs(documents):
    """Join the text of retrieved documents into a single context string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string
            (the documents produced by ``retriever``).

    Returns:
        The page contents joined with blank lines between them.
    """
    return '\n\n'.join(doc.page_content for doc in documents)


# The prompt's {context} slot is filled by string formatting, so piping the
# retriever output in directly would insert the repr of a list of Document
# objects (metadata, escaped newlines) rather than readable text. Flatten the
# retrieved documents to plain text before they reach the prompt.
rag_chain = (
    {'context': retriever | format_docs, 'question': RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [19]:
# Run the full retrieval-augmented chain on the same query.
rag_chain.invoke(query)

' NLTK stands for Natural Language Toolkit. It is a powerful Python package that provides a set of diverse natural language algorithms. NLTK is free, opensource, easy to use, has a large community, and is well documented. NLTK consists of the most common algorithms such as tokenizing, part-of-speech tagging, stemming, sentiment analysis, topic segmentation, and named entity recognition. NLTK helps computers analyze, preprocess, and understand written text.'