SEP 775 Final Project Code (Evaluation of AI Teaching Assistant)

Submitted By Group-4 :

Rutvik, Damjibhai Roy – 400490159

Meetkumar Patel - 400547066

Mridu - 400547058

Srey Patel - 400546143

In [None]:
# Install the packages this notebook depends on, each pinned to an exact version.
# Lines starting with `!` are executed as shell commands by the notebook.

!pip install langchain==0.1.14
!pip install sentence-transformers==2.6.1
!pip install faiss-cpu==1.8.0
!pip install pdfminer.six==20231228
!pip install llama_cpp_python==0.2.58
!pip install tiktoken==0.6.0
!pip install gdown==5.1.0

# **1. Load Documents (PDF files of SEP 775 Course Material)**

In [None]:
# Download the llama-2 model from Google Drive.
# The download is skipped when the model file is already present (for example
# after running the implementation notebook), since the file is large.

import os

import gdown

# Named `model_file_id` rather than `id` so the builtin `id()` is not shadowed.
model_file_id = '1fJPVfJssRO-PXaHKkxMe4BZ2PS4QwXBn'
url = f"https://drive.google.com/uc?id={model_file_id}"
output = "llama-2-7b-chat.Q4_K_M.gguf"
if not os.path.exists(output):
    gdown.download(url, output)

In [None]:
# Download all course PDF files from Google Drive

import os

import gdown

# Named `folder_id` rather than `id` so the builtin `id()` is not shadowed.
folder_id = "1SmvI9knJH6gKjeyNNnU9vcjFzsjOAYd7"

output = '/content/All_lecture_pdfs'
# exist_ok=True keeps the cell re-runnable; a plain `mkdir` errors when the
# folder already exists.
os.makedirs(output, exist_ok=True)

# `documents` is the list of downloaded file paths consumed by prepare_docs().
documents = gdown.download_folder(id=folder_id, output=output, quiet=True)
if documents is None:
    # gdown signals a failed folder download by returning None; fail here with
    # a clear message instead of a TypeError when the paths are iterated later.
    raise RuntimeError(f"Could not download the course PDFs into {output}")

# **2. Extract Text and Metadata**

In [None]:
# Define function to get last modified time
import os
import time

def get_modified_time(file):
    """Return the last-modified time of *file* as a 'YYYY-MM-DD HH:MM:SS' string.

    The timestamp is expressed in the machine's local time zone.

    Args:
        file: Path of the file to inspect.

    Returns:
        The formatted modification time.

    Raises:
        OSError: If the file does not exist or cannot be accessed.
    """
    # Format the struct_time directly. Going through the textual form produced
    # by time.ctime() and parsing it back with time.strptime() relies on
    # locale-specific day/month names and is an unnecessary round trip.
    modified_at = time.localtime(os.path.getmtime(file))
    return time.strftime("%Y-%m-%d %H:%M:%S", modified_at)

def replace_newlines(text):
    """Replace line breaks and form feeds in *text* with single spaces.

    Args:
        text: The string to flatten.

    Returns:
        A copy of *text* in which every '\\r\\n', '\\n' and form feed ('\\x0c')
        is replaced by one space.
    """
    # '\r\n' has to be handled before '\n'. Otherwise the '\n' pass consumes the
    # line feed first and the carriage return is left behind in the output.
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\x0c', ' ')

def fix_missing_spaces(text):
    """Insert the missing space after a period that is glued to the next sentence.

    A period is treated as a sentence boundary only when it sits directly
    between a lowercase letter and an uppercase letter (e.g. "used.The").
    Decimals ("3.14"), versions ("v1.5"), lowercase abbreviations ("e.g.") and
    text that is already correctly spaced are returned unchanged.

    Args:
        text: The string to repair.

    Returns:
        *text* with a space added after each such period.
    """
    import re

    # Lookbehind/lookahead keep the surrounding letters out of the match so
    # only the period itself is rewritten.
    return re.sub(r'(?<=[a-z])\.(?=[A-Z])', '. ', text)

In [None]:
# Extract text from course materials

from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer
from pdfminer.pdfpage import PDFPage

def prepare_docs(pdf_docs):
    """Extract per-page text and metadata from a collection of PDF files.

    Every page of every PDF yields one entry in each returned list. The page
    text is the concatenation of all text containers on the page, flattened
    with replace_newlines() and then passed through fix_missing_spaces().

    Args:
        pdf_docs: Iterable of PDF file paths.

    Returns:
        A ``(content, metadata)`` tuple of two parallel lists:
        ``content[i]`` is the cleaned text of a page and ``metadata[i]`` is a
        dict with the keys ``title`` ("<file name> page <n>"),
        ``last_modified_time`` and ``file_page`` (always "empty_url").
    """
    content = []
    metadata = []

    for pdf in pdf_docs:
        # These values are identical for every page of the file, so compute
        # them once per PDF instead of once per page.
        file_name = pdf.split("/")[-1]
        modified_time = get_modified_time(pdf)

        for page_number, page_layout in enumerate(extract_pages(pdf), start=1):
            # Only text containers carry text; other layout elements are skipped.
            page_text = "".join(
                element.get_text()
                for element in page_layout
                if isinstance(element, LTTextContainer)
            )
            page_text = fix_missing_spaces(replace_newlines(page_text))

            content.append(page_text)
            metadata.append({
                "title": f"{file_name} page {page_number}",
                "last_modified_time": modified_time,
                "file_page": "empty_url",
            })

    print("Content and metadata are extracted from the documents")

    return content, metadata

In [None]:
# split extracted text into chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_text_chunks(content, metadata, chunk_size=512, chunk_overlap=15):
    """Split page texts into token-bounded passages.

    Args:
        content: List of page texts.
        metadata: List of metadata dicts, parallel to *content*; each passage
            is created with the metadata of the page it was cut from.
        chunk_size: Maximum passage length, measured in tiktoken tokens.
        chunk_overlap: Number of tokens shared by consecutive passages.

    Returns:
        The list of document passages produced by the splitter.
    """
    # NOTE(review): the embedding model configured in this notebook
    # (all-MiniLM-L6-v2) appears to truncate long inputs, so passages near the
    # default 512-token limit may be only partially embedded -- confirm against
    # the model card before relying on the default chunk_size.
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_docs = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Documents are split into {len(split_docs)} passages")

    return split_docs

# **3. Vector Database**

In [None]:
# vector database

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Sentence-embedding model shared by the vector store and the query-embedding check; runs on CPU.
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})

def ingest_into_vectordb(split_docs, db_path='vectorstore/db_faiss'):
    """Embed the passages, build a FAISS index from them and save it to disk.

    Args:
        split_docs: Document passages to index.
        db_path: Directory the index is saved to.

    Returns:
        The in-memory FAISS vector store.
    """
    # The module-level embedding model is reused so that passages and queries
    # are embedded by the same model.
    db = FAISS.from_documents(split_docs, embedding_model)
    db.save_local(db_path)
    return db

In [None]:
# Extract the per-page text and metadata from the downloaded PDF files
content, metadata = prepare_docs(documents)

In [None]:
# Split the page texts into passages, keeping each page's metadata
split_docs = get_text_chunks(content, metadata)

In [None]:
# Embed the passages and build the FAISS vector store
vectordb=ingest_into_vectordb(split_docs)

# **4. Database Retriever**

In [None]:
# Set up the retriever: it is configured with k=3, i.e. three chunks per query

retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [None]:
# Sanity check: embed a sample question and print the resulting vector
user_query="What is RNN?"
query_embedding = embedding_model.embed_query(user_query)
print(query_embedding)

In [None]:
# Test the retriever: fetch the chunks for the sample query and show each
# chunk together with its source metadata.

docs = retriever.get_relevant_documents(user_query)

for i, doc in enumerate(docs):
    source = doc.metadata
    report_lines = [
        f"Retrived Chunk - {i+1} : {doc.page_content}",
        " ",
        f"Title: {source['title']}",
        f"Last_modified_time: {source['last_modified_time']}",
        f"URL: {source['file_page']}",
        "-" * 40,
        " ",
    ]
    print("\n".join(report_lines))

In [None]:
import torch
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced anywhere else in the visible code.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **5. RAG Conversational Chain**

In [None]:
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# The stdout handler is attached so generated text is echoed while the model runs.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Llama-2 chat model loaded from the local GGUF file through llama.cpp.
llama_llm = LlamaCpp(
    model_path="llama-2-7b-chat.Q4_K_M.gguf",
    n_ctx=4096,
    n_batch=512,
    n_threads=6,  # Number of CPU threads
    # Number of layers to put on the GPU.
    # NOTE(review): 15000 presumably means "offload every layer" -- confirm.
    n_gpu_layers=15000,
    temperature=0.8,
    top_p=0.95,
    max_tokens=512,
    streaming=True,
    verbose=True,
    callback_manager=callback_manager,
)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

def get_conversational_chain():
    """Build the question-answering chain used by the assistant.

    The chain is a "stuff" QA chain over the module-level ``llama_llm``; its
    prompt has two placeholders, ``context`` and ``question``.

    Returns:
        The chain object created by ``load_qa_chain``.
    """
    template_text ="""
    You are a helpful Teaching Assistant of the McMaster University and your name is 'Mac AI Assistant'\n
    You must answer the question based on context given below.\n
    Give answers in natural form, without giving context as of what you're doing internally.\n
    you must not mention that you are answering question based on the context in your asnwer
    If the question can not be answered using the information provided answer with I don't know, don't try to make up an answer.\n
    Use three sentences maximum. Keep the answer as concise as possible.\n
    If user question is more general for eaxmple 'Hi', 'Hi there!, 'Thanks', or 'How are you!', then asnwer them like a personal assistant of an user\n
    Always begin your answer with this dialog format:\n 'Mac AI Assistant: <your_Answer>' \n\n

    context:\n{context}\n

    User Question: \n{question}\n

    Answer: """

    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=template_text,
    )
    return load_qa_chain(llama_llm, chain_type="stuff", prompt=qa_prompt)


In [None]:
def user_input(user_question):
    """Answer *user_question* using the retrieved course passages.

    Args:
        user_question: The question to answer.

    Returns:
        The result of invoking the QA chain on the retrieved documents and the
        question.
    """
    relevant_docs = retriever.get_relevant_documents(user_question)
    payload = {"input_documents": relevant_docs, "question": user_question}
    return get_conversational_chain().invoke(payload)

In [None]:
import time
import nltk
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd

# Load QA pairs from the CSV file
# Load QA pairs from the CSV file.
# The file is read without a header row: column 0 holds the question and
# column 1 holds the reference answer.
qa_pairs_csv = "QA_Pairs.csv"
qa_pairs = pd.read_csv(qa_pairs_csv, header=None)

bleu_score = []   # one BLEU score per QA pair
time_taken = []   # generation time per QA pair, in minutes

print(qa_pairs.head())

for _, qa_pair in qa_pairs.iterrows():
    user_query_ = qa_pair[0]
    print("User: ",user_query_)
    start_time = time.time()
    responce = user_input(user_query_)
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60  # Convert to minutes

    # Cite the first retrieved passage
    top_source = responce['input_documents'][0].metadata
    print(" ")
    print("Citation:","Title -",top_source["title"],"URL -", top_source["file_page"])
    print("\n")
    print(f"Time taken to predict the answer: {execution_time_minutes:.2f} minutes")
    # Stored as a number (not a formatted string) so the timings can be plotted.
    time_taken.append(round(execution_time_minutes, 2))

    # `predicted` is what the model generated; `actual` is the reference
    # answer from the CSV, so each printed label matches its value.
    predicted = responce['output_text']
    actual = str(qa_pair[1])

    print("Predicted:", predicted)
    print("Actual:", actual)

    # Calculate BLEU score.
    # sentence_bleu expects token lists: a list of tokenized references and one
    # tokenized hypothesis. Passing raw strings would make it compare
    # individual characters instead of words.
    score = sentence_bleu([actual.split()], predicted.split())
    bleu_score.append(score)
    print("BLEU Score for Reference:", score)

In [None]:
import matplotlib.pyplot as plt

# Bar labels follow the number of evaluated pairs, so the charts still work
# when the CSV does not contain exactly 10 rows.
pairs = [f"Pair {i+1}" for i in range(len(bleu_score))]

# Chart 1: BLEU score per QA pair
plt.figure(figsize=(10, 6))
plt.bar(pairs, bleu_score, color='skyblue')
plt.xlabel('Pairs')
plt.ylabel('BLEU Score')
plt.title(f'BLEU Scores for {len(bleu_score)} pairs of Actual and Predicted Answers')
plt.ylim(0, 1)  # BLEU is bounded to [0, 1]
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

ans = [f"Ans {i+1}" for i in range(len(time_taken))]
# The timings may have been recorded as formatted strings; convert them so
# they are plotted as numeric bar heights.
minutes_per_answer = [float(t) for t in time_taken]

# Chart 2: generation time per answer (plots the timings, not the BLEU scores)
plt.figure(figsize=(10, 6))
plt.bar(ans, minutes_per_answer, color='lightgreen')
plt.xlabel('Pairs')
plt.ylabel('Time taken ( in mins )')
plt.title('Time taken to generate Answers')
# No fixed y-limit here: an answer can take longer than one minute, and a
# 0-1 range would clip those bars.
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()