<a href="https://colab.research.google.com/github/SJTheGreat06/anees-bot-rag/blob/main/anees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing all the requirements

In [None]:
!pip install -q langchain langchain-community langchain-core langchain-groq pypdf gpt4all faiss-cpu

# Importing APIs and their keys

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process
# environment, then read the Groq API key from it.
# NOTE(review): `api_key` is not referenced by any later cell shown here;
# presumably the Groq client reads GROQ_API_KEY from the environment itself.
load_dotenv()
api_key = os.getenv("GROQ_API_KEY")


# Reading TXT

In [3]:
# Ingest the plain-text dataset
from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="data/anees_dataset.txt")
text_docs = loader.load()

# Scraping Anees School Website

In [None]:
# Ingest pages from the Anees School website
from langchain_community.document_loaders import WebBaseLoader

# Every page lives directly under the site root, so only the slugs are listed
# and the full URLs ("<root>/<slug>/") are assembled below.
loader = WebBaseLoader(
    [
        f"https://aneesschool.com/{slug}/"
        for slug in (
            "life-at-anees",
            "aims-objectives",
            "vision-mission",
            "academic-building",
            "anees-core-team",
            "message-from-ceo-cum-chairman",
            "the-formative-years",
            "the-junior-school",
            "the-middle-school",
            "the-senior-school",
            "personality-development",
            "creative-skills",
            "important-tips",
            "student-participation",
            "digital-campus",
            "health-hygiene",
            "transport-safety",
            "auditorium",
            "computer-lab",
            "library",
            "art-room",
            "science-laboratories",
            "sports",
            "smart-spacious-classes",
        )
    ]
)
web_documents = loader.load()

# Loading PDF and its related data

In [7]:
# Ingest the school prospectus PDF
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(file_path="data/Prospectus.pdf")
docs = loader.load()

# Chunking all the data to fit within the context length of the LLM

In [8]:
# Split the text-dataset documents into overlapping chunks.
# The splitter is imported from `langchain_text_splitters`, the same package
# the PDF and web chunking cells use, instead of the legacy
# `langchain.text_splitter` path.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunks of up to 1000 characters, with 200 characters shared between neighbours
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text = text_splitter.split_documents(text_docs)

In [9]:
# Split the PDF pages into overlapping chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
documents = text_splitter.split_documents(docs)

In [10]:
# Split the scraped web pages into overlapping chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

web_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
web = web_splitter.split_documents(web_documents)

In [22]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GPT4AllEmbeddings

# Build the embedding model once and share it across all three indexes
# instead of constructing a separate GPT4AllEmbeddings instance per index.
embeddings = GPT4AllEmbeddings()

# One FAISS index per data source
dataBase1 = FAISS.from_documents(documents, embeddings)  # PDF chunks
dataBase2 = FAISS.from_documents(web, embeddings)  # web page chunks
dataBase3 = FAISS.from_documents(text, embeddings)  # text dataset chunks

In [None]:
# Quick sanity check: fetch the chunks closest to a sample query from the
# text-dataset index and display the content of the top hit.
query = "Fees"
result = dataBase3.similarity_search(query=query)
result[0].page_content

In [32]:
# Set up the chat model served through Groq
from langchain_groq import ChatGroq

# Gemma2 model on Groq.
# NOTE(review): no API key is passed here; presumably ChatGroq picks up
# GROQ_API_KEY from the environment - confirm.
llm = ChatGroq(
    model="gemma2-9b-it",
)

# Creating Master prompt

In [47]:
from langchain_core.prompts import ChatPromptTemplate

# Master prompt for AneesBot. It has two placeholders: {context} receives the
# retrieved documents and {input} receives the user's question.
ANEES_PROMPT_TEMPLATE = """
    Answer the question asked by the user only in the provided context.
    Think step by step before answering questions and provide a detailed answer.
    I will tip you $1000 if the user finds the answer helpful

    Character: AneesBot, a formal and polite AI assistant employed by Anees School.

    Task:
    Respond to user inquiries about Anees School and its facilities in a professional, informative, and helpful manner.
    Provide detailed and accurate information, adhering to the provided context.
    Maintain a polite and courteous tone throughout the interaction, addressing all interactions with prospective parents beginning with "Dear Parent," followed by the appropriate information.
    Aim to keep responses within 500 characters. If necessary to provide a comprehensive and informative response, the character limit may be exceeded, but only when absolutely required and by a reasonable margin.

    Guidelines:
    Professionalism: Maintain a professional and courteous demeanor, avoiding slang or informal language.
    Accuracy: Ensure the accuracy and relevance of all information provided.
    Conciseness: Deliver information in a clear, concise, and easy-to-understand manner.
    Context Awareness: If a user query falls outside the scope of school-related information, politely remind them of the context and request a rephrased question.
    For example: "Dear Parents, to best assist you, please limit your inquiries to questions about Anees School and its facilities. Could you please rephrase your question?"

    <context>
    {context}
    </context>
    Question: {input}
    """

prompt = ChatPromptTemplate.from_template(ANEES_PROMPT_TEMPLATE)

In [35]:
# Combine the LLM and the master prompt into a "stuff documents" chain
from langchain.chains.combine_documents import create_stuff_documents_chain

documentChain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [36]:
# Expose each FAISS index as a retriever, in the same order as the indexes
retriever1, retriever2, retriever3 = (
    index.as_retriever() for index in (dataBase1, dataBase2, dataBase3)
)

In [None]:
from langchain.retrievers import MergerRetriever

# Merge the three per-source retrievers into a single retriever
merger_retriever = MergerRetriever(
    retrievers=[
        retriever1,
        retriever2,
        retriever3,
    ]
)

In [38]:
# Wire the merged retriever to the document chain
from langchain.chains import create_retrieval_chain

retrievalChain = create_retrieval_chain(
    retriever=merger_retriever,
    combine_docs_chain=documentChain,
)

# Generating output with the LLM using RAG-based context

In [None]:
# Ask the RAG pipeline a question and print the generated answer.
# The question is kept in a variable and must be non-empty: an empty string
# gives the retrievers nothing to search for and the LLM nothing to answer.
question = "What facilities does Anees School offer?"
result = retrievalChain.invoke({"input": question})
print(result['answer'])