In [1]:
# Name of the Ollama model; later cells pass it to both the LLM and the
# embeddings constructors.
MODEL = "llama2"
# Echo the chosen model so it is visible in the notebook output.
print(MODEL)

llama2


In [2]:
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings

# The same Ollama model name backs both embeddings and text generation.
embeddings = OllamaEmbeddings(
    model=MODEL,
)
model = Ollama(
    model=MODEL,
)


In [3]:
from langchain_core.output_parsers import StrOutputParser

# Final stage of the chains below: the model output is piped into this parser.
parser = StrOutputParser()

In [4]:
from langchain.prompts import PromptTemplate

# Prompt skeleton. `{context}` and `{question}` are the two placeholders that
# are filled in when the prompt is formatted.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

# Build the prompt object, then render it once with dummy values as a sanity
# check; the rendered string is the value this cell displays.
prompt = PromptTemplate.from_template(template)
prompt.format(context="Here is some context", question="Here is a question")

'\nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF (path is relative to the notebook's working directory) and
# split it into a list of Document objects.
loader = PyPDFLoader("iitg_dataset.pdf")
pages = loader.load_and_split()
# Bare expression: displays the whole list as the cell output.
pages

[Document(page_content="IITG was Established in 1994. Nestled along the picturesque northern banks of the Brahmaputra River in \nGuwahati, Assam, the institute offers a wide array of undergraduate, postgraduate, and doctoral \nprograms across engineering, science, and humanities disciplines. Its sprawling 700-acre campus boasts \nstate-of-the-art infrastructure, including academic buildings, research centers, hostels, sports facilities, \nand student activity centers, providing an enriching environment for learning and innovation.  \n  \nBeyond academics, IIT Guwahati fosters a vibrant student life with numerous clubs, societies, and cultural\nevents that cater to a diverse range of interests, from music and dance to entrepreneurship initiatives. \nThe institute's emphasis on research and innovation has led to significant contributions in fields such as \nnanotechnology, biotechnology, renewable energy, and environmental science, with a strong emphasis \non interdisciplinary collaborat

In [6]:
from langchain_community.vectorstores import DocArrayInMemorySearch
    
# Baseline store: index the documents currently in `pages` using the Ollama
# embeddings created earlier.
vectorstore_RAG = DocArrayInMemorySearch.from_documents(pages, embedding=embeddings)



In [7]:
# Retriever over the baseline store; the chain in the next cell pipes the
# question into it to obtain the prompt context.
retriever = vectorstore_RAG.as_retriever()

In [8]:
from operator import itemgetter

# Pull the "question" entry out of the input dict; reused for both prompt inputs.
pick_question = itemgetter("question")

# Inputs for the prompt: retrieved documents as context, plus the raw question.
prompt_inputs = {
    "context": pick_question | retriever,
    "question": pick_question,
}

# inputs -> prompt -> LLM -> plain string
chain = prompt_inputs | prompt | model | parser

In [9]:
# Questions to put to the baseline (PDF-only) chain.
questions = ["List all the hostels in IITG?"]

for question in questions:
    print(f"Question: {question}")
    # Invoke only after the question line has been printed, as before.
    answer = chain.invoke({"question": question})
    # One blank line after each answer.
    print(f"Answer: {answer}", end="\n\n")

Question: List all the hostels in IITG?
Answer: Based on the provided document, there are several mentions of hostels in IIT Guwahati. Here is a list of hostels mentioned:

1. Hostel-1 (Core1)
2. Hostel-2 (Core2)
3. Hostel-3 (Core3)
4. Hostel-4 (View Point)
5. Siang Hostel (restaurant located outside the library)
6. Dihing Hostel (winner of Spardha competition)
7. Manas Hostel (has the most visitors per day)
8. Khoka Hostel (location for student gatherings and celebrations)

Please note that this list is based on the information provided in the document and may not be exhaustive or up-to-date.



In [10]:
from typing import AsyncIterator, Iterator

import spacy
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document

# NOTE(review): machine-specific absolute path to the trained spaCy pipeline;
# consider moving it to a configuration cell or an environment variable.
NER_MODEL_PATH = "/home/raone/LLM Project/NER/model-best"
# File the extracted entities are written to and then loaded back from.
NER_TEXT_PATH = "./NER.txt"


class CustomDocumentLoader(BaseLoader):
    """Document loader that turns each line of a UTF-8 text file into a Document."""

    def __init__(self, file_path: str) -> None:
        """Remember which file to read.

        Args:
            file_path: The path to the file to load.
        """
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per line of the file.

        Each Document carries the raw line (newline included) as its
        ``page_content`` and a metadata dict with the zero-based
        ``line_number`` and the ``source`` file path.
        """
        with open(self.file_path, encoding="utf-8") as f:
            for line_number, line in enumerate(f):
                yield Document(
                    page_content=line,
                    metadata={"line_number": line_number, "source": self.file_path},
                )


nlp = spacy.load(NER_MODEL_PATH)

# Work only on documents that did not come from the NER file. On a first run
# this is every page; on a re-run it drops the NER documents appended earlier,
# so the cell neither duplicates them nor runs NER over its own output.
pdf_pages = [page for page in pages if page.metadata.get("source") != NER_TEXT_PATH]

# One single-element list per entity, formatted as "<entity text> is <label>".
all_data = [
    [" is ".join([ent.text, ent.label_])]
    for page in pdf_pages
    for ent in nlp(page.page_content).ents
]

# The whole entity list is written as a single Python-list repr, so the
# line-based loader below reads it back as one Document.
with open(NER_TEXT_PATH, "w", encoding="utf-8") as f:
    f.write(str(all_data))

# Use a separate name for the instance so the CustomDocumentLoader class is
# not shadowed, and load the file only once.
ner_loader = CustomDocumentLoader(NER_TEXT_PATH)
ner_documents = ner_loader.load()

# Rebind instead of extending in place: PDF pages followed by the NER documents.
pages = pdf_pages + ner_documents
pages


[Document(page_content="IITG was Established in 1994. Nestled along the picturesque northern banks of the Brahmaputra River in \nGuwahati, Assam, the institute offers a wide array of undergraduate, postgraduate, and doctoral \nprograms across engineering, science, and humanities disciplines. Its sprawling 700-acre campus boasts \nstate-of-the-art infrastructure, including academic buildings, research centers, hostels, sports facilities, \nand student activity centers, providing an enriching environment for learning and innovation.  \n  \nBeyond academics, IIT Guwahati fosters a vibrant student life with numerous clubs, societies, and cultural\nevents that cater to a diverse range of interests, from music and dance to entrepreneurship initiatives. \nThe institute's emphasis on research and innovation has led to significant contributions in fields such as \nnanotechnology, biotechnology, renewable energy, and environmental science, with a strong emphasis \non interdisciplinary collaborat

In [11]:
from langchain_community.vectorstores import DocArrayInMemorySearch
    
# Second store: same embeddings, but built from `pages` after the NER
# documents were added in the previous cell.
vectorstore_NER = DocArrayInMemorySearch.from_documents(pages, embedding=embeddings)

In [12]:
# NOTE: rebinds `retriever`, which previously pointed at the vectorstore_RAG
# retriever; any chain built after this cell searches the NER-augmented store.
retriever = vectorstore_NER.as_retriever()

In [13]:
from operator import itemgetter

# Same pipeline shape as the baseline chain, now reading from the retriever
# bound in the previous cell.
question_getter = itemgetter("question")

retrieval_step = {
    "context": question_getter | retriever,
    "question": question_getter,
}

# retrieval inputs -> prompt -> LLM -> plain string
chain = retrieval_step | prompt | model | parser

In [14]:


# Ask the same question again, this time through the NER-augmented chain.
questions = ["List all the hostels in IITG?"]

for question in questions:
    print(f"Question: {question}")
    # Invoke only after the question line has been printed, as before.
    answer = chain.invoke({"question": question})
    # One blank line after each answer.
    print(f"Answer: {answer}", end="\n\n")



Question: List all the hostels in IITG?
Answer: Based on the provided document, the following are the hostels in IIT Guwahati:

1. Lohit
2. Dihing.
3. Siang
4. Manas
5. Brahmaputra (mentioned as HOSTEL)

