In [10]:
# %pip install -r ../requirements.txt
# %pip install --quiet --upgrade  langchain langchain-community langchainhub gpt4all chromadb bs4 torch transformers
# !pip freeze >> ../requirements.txt

from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OllamaEmbeddings, GPT4AllEmbeddings, BedrockEmbeddings, HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import LlamaCpp, Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnablePick

from chromadb.errors import InvalidDimensionException

In [46]:
import os

from langchain_community.document_loaders import TextLoader

BASE_DIR = '/home/raj/nlp/cmu-rag/data/documents/combined_txt_files_length_normalized/'
# relevant_files = ['l']

docs = []
for file in os.listdir(BASE_DIR):
    # print(file)
    if "schedule" in file:
        continue
    print("Using file :", file)
    loader = TextLoader(BASE_DIR + file)
    docs.extend(loader.load())

print("Splitting Text")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
all_splits = text_splitter.split_documents(docs)
print("Splitting Text Done")

try:
    print("Creating Chroma")   
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OllamaEmbeddings())
except InvalidDimensionException:
    print("Deleting Chroma")
    Chroma().delete_collection()
    print("Creating Chroma afresh")
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=OllamaEmbeddings())
print("Done Creating Chroma")

Using file : Buggy News_part_0
Using file : history_of_cmu_part_0
Using file : program_handbooks_part_5000
Using file : lti_papers_metadata_part_0
Using file : program_handbooks_part_0
Using file : academic_calendars_part_0
Using file : About Scottie_part_0
Using file : Tartan Facts_part_0
Using file : lti_faculty_part_0
Using file : history_of_scs_part_0
Using file : Kiltie Band_part_0
Using file : lti_programs_part_0
Splitting Text
Splitting Text Done
Creating Chroma
Done Creating Chroma


In [47]:
retriever = vectorstore.as_retriever()

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

prompt = PromptTemplate.from_template(
    "Answer questions on CMU from these documents"
)

llm = Ollama(model="llama2")

rag_prompt_llama = hub.pull("rlm/rag-prompt-llama")
prompt = rag_prompt_llama
# print(prompt.messages)

qa_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [48]:
questions = ['Whom should I contact for additional info on the MCDS program?', 'How many semesters does the MS AI program generally consist of ?']

ex = qa_chain.invoke(questions)

In [50]:
print(ex)
type(ex)

For the first question, "Whom should I contact for additional info on the MCDS program?", you can reach out to the McMaster University Graduate Studies office or visit their website for more information. They have a dedicated team for handling inquiries and can provide you with the most up-to-date and accurate information about the program.

For the second question, "How many semesters does the MS AI program generally consist of?", the answer is generally two to three semesters, depending on the student's academic background and the course load they take on. It's best to consult with the program advisor or check the university's website for the most accurate information.


str

In [None]:
db = Chroma(persist_directory="./chroma_db", embedding_function=OllamaEmbeddings())