SEP 775 Final Project Code (Implementation of AI Teaching Assistant)

Submitted By Group-4 :

Rutvik, Damjibhai Roy – 400490159

In [None]:
# Install necessary libraries

!pip install langchain==0.1.14
!pip install sentence-transformers==2.6.1
!pip install faiss-cpu==1.8.0
!pip install pdfminer.six==20231228
!pip install llama_cpp_python==0.2.58
!pip install tiktoken==0.6.0
!pip install gdown==5.1.0

# **1. Load Documents (PDF files of SEP 775 Course Material) and LLM**

In [None]:
# Download the quantized Llama-2 7B chat model (GGUF file) from Google Drive
# into the current working directory.

import gdown

# Google Drive file id of the model.
# NOTE(review): `id` shadows the Python builtin of the same name.
id='1fJPVfJssRO-PXaHKkxMe4BZ2PS4QwXBn'
url = f"https://drive.google.com/uc?id={id}"
# Local file name; the LlamaCpp loader further down reads the same path.
output = "llama-2-7b-chat.Q4_K_M.gguf"
gdown.download(url, output)

In [None]:
# Download all pdf files (SEP 775 Course Materials) from gdrive

import gdown

# Google Drive folder id of the course material.
# NOTE(review): `id` shadows the Python builtin of the same name.
id="1PuOPV-TYxcfxmqwcUqenTSuPlXFVRd-P"
# NOTE(review): the directory is created at the filesystem root, and a plain
# `mkdir` reports an error when it already exists (e.g. when the cell is re-run).
!mkdir '/All_lecture_pdfs'
output='/All_lecture_pdfs'
# `documents` is later passed to prepare_docs(), which iterates over it as PDF paths.
# NOTE(review): presumably gdown returns the list of downloaded file paths here —
# confirm, and confirm what is returned when the download fails.
documents=gdown.download_folder(id=id,output=output, quiet=True)

# **2. Extract Text and Metadata**

In [None]:
import os
import time

# Function to get last modified time of pdf files
def get_modified_time(file):
    """Return the last-modified time of ``file`` as a local-time string.

    Args:
        file: Path of the file to inspect.

    Returns:
        The modification time formatted as ``YYYY-MM-DD HH:MM:SS`` in the
        local time zone.

    Raises:
        OSError: If ``file`` does not exist or cannot be accessed.
    """
    modified_ts = os.path.getmtime(file)
    # Convert the epoch timestamp straight to a struct_time. The previous
    # ctime() -> strptime() round trip re-parsed English day/month names and
    # therefore depended on the active locale.
    local_time = time.localtime(modified_ts)
    return time.strftime("%Y-%m-%d %H:%M:%S", local_time)

def replace_newlines(text):
    """Replace line breaks and form feeds in ``text`` with single spaces.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every CRLF pair, line feed, carriage return and form
        feed (page break) replaced by one space each.
    """
    # The two-character CRLF sequence must be handled first so it collapses
    # into a single space; replacing '\n' first leaves a stray '\r' behind.
    for token in ('\r\n', '\n', '\r', '\x0c'):
        text = text.replace(token, ' ')
    return text

def fix_missing_spaces(text):
    """Insert the space that is missing after a sentence-ending period.

    Text extracted from PDFs can glue sentences together ("end.Next"). A
    period that directly follows a lowercase letter, a digit or a closing
    bracket and is directly followed by an uppercase letter is treated as a
    sentence boundary and gets a space after it. Correctly spaced text,
    decimals ("3.14") and lowercase dotted names ("file.pdf") are unchanged.

    The previous split/join implementation never added a missing space and
    instead inserted a space *before* every period ("A. B" -> "A . B").

    Args:
        text: The string to repair.

    Returns:
        ``text`` with a space added after glued sentence-ending periods.
    """
    # Local import: the cell defining this helper does not import `re`.
    import re

    # NOTE: dotted identifiers such as "nn.Module" also match this heuristic.
    return re.sub(r'(?<=[a-z0-9\)\]])\.(?=[A-Z])', '. ', text)

In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

# Function to extract text from pdf files
def prepare_docs(pdf_docs):
    """Extract per-page text and metadata from a collection of PDF files.

    Args:
        pdf_docs: Iterable of PDF file paths (strings).

    Returns:
        A ``(content, metadata)`` tuple of two parallel lists with one entry
        per PDF page. ``content`` holds the cleaned page text; ``metadata``
        holds dicts with the keys ``"Title"`` (file name plus page number),
        ``"Last_modified_time"`` and ``"Source"``.
    """
    content = []
    metadata = []

    for pdf in pdf_docs:
        # Keep only the file name in the title. Splitting on both separators
        # keeps Windows-style paths working and fixes POSIX paths, which
        # contain no backslash and previously ended up in the title in full.
        file_name = pdf.replace("\\", "/").split("/")[-1]
        # The modification time is identical for every page of a file, so it
        # is read once per file instead of once per page.
        modified_time = get_modified_time(pdf)

        for page_number, page_layout in enumerate(extract_pages(pdf), start=1):
            # Concatenate the text of all text containers on the page.
            text = "".join(
                element.get_text()
                for element in page_layout
                if isinstance(element, LTTextContainer)
            )
            text = fix_missing_spaces(replace_newlines(text))

            content.append(text)
            metadata.append({
                "Title": file_name + " Page No: " + str(page_number),
                "Last_modified_time": modified_time,
                "Source": "empty_url",
            })

    print("Content and metadata are extracted from the documents")

    return content, metadata

In [None]:
# Split extracted text into chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_text_chunks(content, metadata):
    """Split the extracted page texts into passages.

    Args:
        content: List of page texts.
        metadata: List of metadata dicts, parallel to ``content``.

    Returns:
        The documents produced by a ``RecursiveCharacterTextSplitter`` built
        with ``from_tiktoken_encoder`` (chunk_size=512, chunk_overlap=15).
    """
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512,
        chunk_overlap=15,
    )
    passages = splitter.create_documents(content, metadatas=metadata)
    print(f"Documents are split into {len(passages)} passages")
    return passages

# **3. Vector Database**

In [None]:
# Create Vector database

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# Embedding model
# Shared sentence-transformers model: used to embed the document passages when the
# vector store is built and, later in the notebook, to embed a test query.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

def ingest_into_vectordb(split_docs):
    """Build a FAISS vector store from the passages and save it locally.

    Args:
        split_docs: Documents to embed with the module-level ``embedding_model``.

    Returns:
        The FAISS vector store, after it has been saved under
        ``vectorstore/db_faiss``.
    """
    faiss_dir = 'vectorstore/db_faiss'
    vector_store = FAISS.from_documents(split_docs, embedding_model)
    vector_store.save_local(faiss_dir)
    print("Vector database is created")
    return vector_store

In [None]:
# Extract per-page text and metadata from the downloaded PDF files.
content, metadata = prepare_docs(documents)

In [None]:
# Split the page texts into passages, carrying each page's metadata along.
split_docs = get_text_chunks(content, metadata)

In [None]:
# Embed the passages, build the FAISS index and save it to disk.
vectordb=ingest_into_vectordb(split_docs)

# **4. Database Retriever**

In [None]:
# Set retriever

# Retriever over the vector store; search_kwargs requests k=3 results per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [None]:

# Test a user query

user_query="What is RNN?"
# Embed the query with the same model that was used for the passages.
query_embedding = embedding_model.embed_query(user_query)
# NOTE(review): this prints the whole embedding; consider showing only its length.
print(query_embedding)

In [None]:
# Check retriever

# Fetch the passages retrieved for the test query defined above.
docs = retriever.get_relevant_documents(user_query)

# Print each retrieved passage followed by its metadata fields.
for i,doc in enumerate(docs):
  print(f"Retrived Chunk - {i+1} :",docs[i].page_content)
  print(" ")
  print("Title:",doc.metadata["Title"])
  print("Last_modified_time:",doc.metadata["Last_modified_time"])
  print("Source:",doc.metadata["Source"])
  print("-"*40)
  print(" ")

In [None]:
import torch
# Select CUDA when available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced anywhere else in the visible notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **5. RAG Conversational Chain**

In [None]:
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Callback manager that streams the generated text to stdout as it is produced
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Load the Llama-2 model from the GGUF file downloaded earlier in the notebook

llama_llm = LlamaCpp(
model_path="llama-2-7b-chat.Q4_K_M.gguf",
n_gpu_layers=15000, 
# NOTE(review): n_gpu_layers is presumably far above the model's layer count, i.e. "offload everything" — confirm
n_threads=6,     # Number of CPU threads
n_batch=512,
temperature=0.7,
f16_kv=True,
max_tokens=512,
top_p=0.95,
callback_manager=callback_manager,
n_ctx=4096,
verbose=True,
streaming=True)

5.1. Prompt template and citation of retrieved documents (define a function that cites the retrieved chunks' sources alongside the LLM's response)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferWindowMemory

# Set up conversation memory to save chat history (window of k=2).
# The history is inserted into a plain-text prompt through {chat_history}, so it is
# requested as a formatted string. With return_messages=True the prompt would
# receive the repr of a list of message objects instead of readable dialogue.
memory=ConversationBufferWindowMemory(k=2,memory_key="chat_history", return_messages=False, input_key="question")

# Build the question-answering chain used for every user turn
def get_conversational_chain():
    """Create the "stuff" QA chain for the teaching assistant.

    The chain combines the module-level ``llama_llm`` and ``memory`` with a
    prompt that has three placeholders: ``context``, ``question`` and
    ``chat_history``.

    Returns:
        The chain object returned by ``load_qa_chain``.
    """
    template_text ="""
    You are a helpful Teaching Assistant of the McMaster University and your name is 'Mac AI Assistant'.\n
    This is the conversation between a student and and 'Mac AI Assistant". your job is to answer the student's question.\n
    The question can be a new question or follow up. So, you must check the chat histroy given below before you answer the question.\n
    You must answer student's the question based on only context given below.\n
    If the question can not be answered using the information provided in the context, must answer with I don't know, don't try to make up an answer.\n
    Give answers in natural form, without giving context as of what you're doing internally.\n
    Use three sentences maximum. Keep the answer as concise as possible.\n
    If user question is more general for eaxmple 'Hi', 'Hi there!, 'Thanks', or 'How are you!', then asnwer them like a personal assistant of an user\n
    Always begin your answer with this dialog format:\n 'Mac AI Assistant: <your_Answer>' \n\n

    context:\n{context}\n

    Student's Question: \n{question}\n

    Chat history: \n{chat_history}\n

    Answer: """

    # Bind the template to the variables it expects
    qa_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question","chat_history"],
    )

    # "stuff" chain: the retrieved documents are passed to the prompt as {context}
    return load_qa_chain(llama_llm, chain_type="stuff", prompt=qa_prompt, memory=memory)


In [None]:
# Function for user input

def user_input(user_question):
    """Answer ``user_question`` using the retrieved passages.

    Args:
        user_question: The question typed by the user.

    Returns:
        The response of the QA chain for the retrieved documents and question.
    """
    relevant_docs = retriever.get_relevant_documents(user_question)
    qa_chain = get_conversational_chain()
    chain_inputs = {"input_documents": relevant_docs, "question": user_question}
    return qa_chain.invoke(chain_inputs)

In [None]:
# Function to cite retrieved resources

def citation_function(response):
    """Print one citation line per retrieved document in ``response``.

    Args:
        response: Mapping whose "input_documents" entry is an iterable of
            documents; each document exposes a ``metadata`` mapping with the
            keys "Title" and "Source".
    """
    print('\nCitation:')
    for cited_doc in response["input_documents"]:
        meta = cited_doc.metadata
        print(meta['Title'], " ", "Source URL:", meta['Source'])

In [None]:
# Interactive chat loop: keep answering questions until the user enters 'exit' or 'quit'.

while True:
    user_query_=input("User: ")
    # Echo the question so it appears in the cell output
    print("User: ",user_query_)
    # Exit commands are matched case-insensitively
    if user_query_.lower() in ['exit', 'quit']:
        print("Exiting the program.")
        break
    # The answer is not printed here; presumably it reaches stdout through the
    # LLM's streaming callback handler.
    response=user_input(user_query_)
    # Append the sources of the retrieved passages after the answer
    citation_function(response)
    print("\n")

5.2. Include the citation within the prompt template (the citation is produced by the LLM according to the instructions inside the prompt)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.memory import ConversationBufferWindowMemory

# Set up conversation memory (window of k=2).
# The history is inserted into a plain-text prompt through {chat_history}, so it is
# requested as a formatted string. With return_messages=True the prompt would
# receive the repr of a list of message objects instead of readable dialogue.
memory=ConversationBufferWindowMemory(k=2,memory_key="chat_history", return_messages=False, input_key="question")

def get_conversational_chain():
    """Create the "stuff" QA chain whose prompt asks the LLM to cite its sources.

    The chain combines the module-level ``llama_llm`` and ``memory`` with a
    prompt that has three placeholders: ``context``, ``chat_history`` and
    ``question``. The prompt instructs the model to append a citation block
    built from the "Sources" list that follows the context text.

    Returns:
        The chain object returned by ``load_qa_chain``.
    """
    # The context / chat history / question sections used to be wrapped in
    # stray "" pairs (left over from adjacent string literals). Inside a
    # triple-quoted string those quotes are literal characters, so they were
    # sent to the model as part of the prompt; they are removed here.
    prompt_template ="""
    You are a helpful Teaching Assistant of the McMaster University and your name is 'Mac AI Assistant'.\n
    This is the conversation between a student and and 'Mac AI Assistant". your job is to answer the student's question.\n
    The question can be a new question or follow up. So, you must check the chat histroy given below before you answer the question.\n
    You must answer student's the question based on only context given below.\n
    If the question can not be answered using the information provided in the context, must answer with I don't know, don't try to make up an answer.\n
    Give answers in natural form, without giving context as of what you're doing internally.\n
    Use three sentences maximum. Keep the answer as concise as possible.\n
    If an answer to the question is provided uisng the context data, it must be annotated with a citation at the end of the answer. should use the following format to cite all three sources specified in the Sources after context data. "\nCitation: \nsource-1.pdf Page No: xx-1  Source URL: xxx-1 \nsource-2.pdf Page No: xx-2  Source URL: xxx-2 \nsource-3.pdf Page No: xx-3  Source URL: xxx-3".
    If user question is more general for eaxmple 'Hi', 'Hi there!, 'Thanks', or 'How are you!', then asnwer them like a personal assistant of an user and do not need citation in the asnwer\n
    Always begin your answer with this dialog format:\n'Mac AI Assistant: <your_Answer>' \n\n

    context:\n{context}\n\n

    Chat history: \n{chat_history}\n\n

    Student's Question: \n{question}\n\n

    Answer: """

    # Langchain Prompt template to configure prompt variable
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question","chat_history"])

    # "stuff" chain: the input documents are passed to the prompt as {context}
    chain = load_qa_chain(llama_llm, chain_type="stuff", prompt=prompt, memory=memory,verbose=False)

    return chain


In [None]:
from langchain.docstore.document import Document

# Function to merge the retrieved text and its metadata for the prompt
def Add_text_with_metadata(docs):
    """Merge the retrieved documents into one document that lists its sources.

    Args:
        docs: Retrieved documents; each has ``page_content`` and a ``metadata``
            mapping with the keys "Title" and "Source".

    Returns:
        A one-element list holding a ``Document`` whose content is every
        passage on its own line, then a "Sources:" header, then one
        "<Title> Source URL: <Source>" line per passage.
    """
    passages = []
    sources = []

    for retrieved in docs:
        passages.append(retrieved.page_content + "\n")
        sources.append(
            retrieved.metadata['Title'] + " Source URL: " + retrieved.metadata['Source'] + "\n"
        )

    combined = "".join(passages) + "Sources: \n" + "".join(sources)
    return [Document(page_content=combined)]

In [None]:
# Function for user input

def user_input(user_question):
    """Answer ``user_question`` with a source-annotated context.

    The retrieved passages are first merged into a single document that also
    lists their sources, and that document is given to the QA chain.

    Args:
        user_question: The question typed by the user.

    Returns:
        The response of the QA chain.
    """
    hits = retriever.get_relevant_documents(user_question)
    context_docs = Add_text_with_metadata(hits)
    qa_chain = get_conversational_chain()
    chain_inputs = {"input_documents": context_docs, "question": user_question}
    return qa_chain.invoke(chain_inputs)

In [None]:
# Interactive chat loop: keep answering questions until the user enters 'exit' or 'quit'.

while True:
    user_query_=input("User: ")
    # Echo the question so it appears in the cell output
    print("User: ",user_query_)
    # Exit commands are matched case-insensitively
    if user_query_.lower() in ['exit', 'quit']:
        print("Exiting the program.")
        break
    # The answer is not printed here; presumably it reaches stdout through the
    # LLM's streaming callback handler. The prompt asks the model to add the citation.
    response=user_input(user_query_)
    print("\n")