## How to run LLaMa 2 on a Mac M1 and use local text
### See https://swharden.com/blog/2023-07-30-ai-document-qa/

#### Build a local vector DB [FAISS (Facebook AI Similarity Search)] with embeddings from local text file


In [3]:
"""
This script creates a database of information gathered from local text files.
"""

from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import re

# Convert pdf to txt
def pdf2txt(splitter):
    # define what documents to load
    pdfFiles = PyPDFLoader("./mock-data/Syllabus.pdf")
    
    # interpret information in the documents
    pages = pdfFiles.load_and_split()

    texts = splitter.split_documents(pages)
    # Format text from pdf to increase accuracy
    for index,text in enumerate(texts):
        temp = text.page_content.replace("\n"," ")
        temp = re.sub("[\s]{2,}", " ", temp)

        texts[index].page_content = temp
        print(text)
        print(temp, file=open("./mock-data/SyllabusText.txt", "a"), end="\n\n")
        print()

splitter = RecursiveCharacterTextSplitter(chunk_size=500, #original size was 500
                                          chunk_overlap=50,
                                          separators=["\n\n","\n","(?<=\. )", " ",""],
                                          length_function = len) 

# Check if SyllabusText.txt exists
try:
    f = open("./mock-data/SyllabusText.txt")
    f.close()
except FileNotFoundError:
    pdf2txt(splitter)


# define what documents to load
textFiles = DirectoryLoader("./mock-data/", glob="SyllabusText.txt", loader_cls=TextLoader) #Store data in *.txt file in the ./mock-data/ directory

# interpret information in the documents
documents = textFiles.load()

#print(documents)

texts = splitter.split_documents(documents)
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'mps'})

# create and save the local database
db = FAISS.from_documents(texts, embeddings)
db.save_local("faiss")

#### Setup Template for prompting

In [20]:
"""
This script reads the database of information from local text files
and uses a large language model to answer questions about their content.
"""

from langchain.llms import CTransformers
from langchain.llms import LlamaCpp
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import (
    StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
)
from langchain.memory import ConversationBufferMemory
from langchain.chains import AnalyzeDocumentChain

# prepare the template we will use when prompting the AI
template = """You are a helpful, respectful and honest teaching assistant for a college algorithms course. Always answer as helpfully as possible, while being safe.  
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature. 
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}
Question: {question}
Only return the helpful answer below and nothing else.
Helpful answer:
"""

# Test with Conversation Chain
llamaTemplate = """
You are a helpful, respectful and honest teaching assistant for a college algorithms course. Always answer as helpfully as possible, while being safe.  
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature. 
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information, just say you don't know.

Current Conversation:
{history}
Human: {input}
AI: 
""".strip()


qaTemplate = """
You are a helpful, respectful and honest teaching assistant for a college algorithms course. Always answer as helpfully as possible, while being safe.  
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature. 
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information, just say you don't know.
Please answer clear and consise.
Only use the following pieces of information to answer the user's question.

Current Conversation:
{chat_history}

Human: {question}
AI: 

"""

llamaTemplatev2 = """
[INST] <<SYS>> 
You are a helpful, respectful and honest teaching assistant in the Computer Science Dept in UT Dallas. 
Do not greet the user.
If you are asked for coding or mathematical solutions, say 'As a TA, I'm cannot provide you solutions, but can guide you.' \
and provide reference to documentation.
If you don't know the answer to a question, please don't share false information. Instead say 'I don't know, please contact the TA.'
<</SYS>>
Context: {history} 
Question: {input} 
Only return the helpful answer below and nothing else. Keep your response to less than 5 sentences. 
Helpful answer:[/INST]
"""

qaTemplatev2 = """
[INST] <<SYS>> 
You are a helpful, respectful and honest teaching assistant in the Computer Science Dept in UT Dallas. 
If you are asked for coding or mathematical solutions, say 'As a TA, I'm cannot provide you solutions, but can guide you.' \
and provide reference to documentation. 
If you don't know the answer to a question, please don't share false information. 
At the end of every response say 'Please refer to the syllabus for more information.' \
Syllabus Topics include: Instructor, Grading, Course Description, Course Objectives, Course Topics, Course Materials, Room and Time, \
<</SYS>>
Context: {chat_history} 
Question: {question} 
Only return the helpful answer below and nothing else. Keep your response to less than 5 sentences. 
Helpful answer:[/INST]
"""



chainPrompt = PromptTemplate(input_variables=["history", "input"], template = llamaTemplatev2)
qaPrompt = PromptTemplate(input_variables=["chat_history", "question"], template = qaTemplatev2)

#### Set up AI Chat LLM

In [21]:
model_path = "./llama2/llama.cpp/models/13B/ggml-model-q4_0.bin" # Path to Quantized Llama Model (13B) compiled for Apple Silicon
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 4096  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
n_context = 4096
last_n_tokens_size = 300
max_new_tokens = 4096 #max is 4096 tokens for LLaMa 2


# load the language model
# llm = CTransformers(model=model_path,
#                     model_type='llama',
#                     config={'max_new_tokens': 1024, 'temperature': 0.01})


# Callbacks support token-wise streaming
# Allows Realtime output of each token
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])


# OLD Models
# # Make sure the model path is correct for your system!
# llm = LlamaCpp(
#     model_path=model_path,
# 	temperature=0.01,
# 	max_tokens=1024,
# 	n_context=n_context,
#     n_gpu_layers=n_gpu_layers,
#     n_batch=n_batch,
# 	last_n_tokens_size=last_n_tokens_size,
#     f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
#     #callback_manager=callback_manager,
#     verbose=False, # Verbose is required to pass to the callback manager
# )

# llm = LlamaCpp(
#     model_path=model_path ,
#     n_gpu_layers=n_gpu_layers,
#     task='text-generation',
#     return_full_text=True,
#     n_batch=n_batch,
#     top_p=0.9,
#     top_k=40,
#     max_new_tokens=2048, #max is 4096 tokens for LLaMa 2
#     f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
#     verbose=False, # Verbose is required to pass to the callback manager
#     temperature= 0.0,
#     repetition_penalty=1.8
# )

llm = LlamaCpp(
	model_path=model_path,
	n_gpu_layers=n_gpu_layers,
	task='text-generation',
	return_full_text=True,
	n_batch=n_batch,
	n_ctx=n_context,
	last_n_tokens_size=last_n_tokens_size,
	max_new_tokens=max_new_tokens,
	top_p=0.9,
	top_k=40,
	f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
	verbose=False, # Verbose is required to pass to the callback manager
	temperature= 0.1,
	repetition_penalty=1.8
)

                task was transferred to model_kwargs.
                Please confirm that task is what you intended.
                return_full_text was transferred to model_kwargs.
                Please confirm that return_full_text is what you intended.
                max_new_tokens was transferred to model_kwargs.
                Please confirm that max_new_tokens is what you intended.
                repetition_penalty was transferred to model_kwargs.
                Please confirm that repetition_penalty is what you intended.
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from ./llama2/llama.cpp/models/13B/ggml-model-q4_0.bin (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  5120, 32000,  

#### Load local embeddings to use with LLM

In [22]:
# load the interpreted information from the local database
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'mps'})
db = FAISS.load_local("faiss", embeddings)

# prepare a version of the llm pre-loaded with the local content
retriever = db.as_retriever(search_kwargs={'k': 6})
prompt = PromptTemplate(
    template=template,
    input_variables=['context', 'question'])
qa_llm = RetrievalQA.from_chain_type(llm=llm,
                                     chain_type='stuff',
                                     retriever=retriever,
                                     return_source_documents=True,
                                     chain_type_kwargs={'prompt': prompt})


#### Get Results

In [5]:
# ask the AI chat about information in our local files
query = "in software testing if you were to use > instead of >=, what is considered executing the fault?"
output = qa_llm({'query': query})
print(output['result'])

#FIXED: Context size is exceeded if asked "What is this course about? What topics?"

# Output holds: (query, result, source_documents) where source_documents is a list of Document objects, which holds the text and metadata of the source documents
# for key, value in output.items():
# 	print(f"{key}: {value}")


In software testing, using ">" instead of ">=" would not execute the fault. The reason is that ">" is a strict comparison operator that only returns true if the left operand is strictly greater than the right operand. Since ">=" allows for equality, using ">" instead of ">=" would not allow for the possibility of equality, and therefore would not execute the fault.


#### ~~External Memory~~ DISCONTINUED

In [None]:
# IDEA: Vector Store Memory, hopefully will help with context window issue by using it as external memory
# FAILED: Fixes Context Window issue, but makes model take 10x as long to respond
import faiss
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.memory import VectorStoreRetrieverMemory
from langchain.embeddings import LlamaCppEmbeddings


embedding_size = 5120
index = faiss.IndexFlatL2(embedding_size)
embedding_fn = LlamaCppEmbeddings(model_path=model_path).embed_query
vectorstore = FAISS(embedding_fn, index, InMemoryDocstore({}), {})
ret = vectorstore.as_retriever(search_kwargs=dict(k=1))
memory = VectorStoreRetrieverMemory(retriever=ret)

memory.save_context({"input": "What is Dynamic programming?"}, {"output": "A way of solving problems by breaking them down into subproblems."})

#### Continous Conversation

In [96]:
# Test Just Chatting with the AI

# FIXED: Longer Conversations will lead to error with max context length being exceeded
# FIXED: AI will decide to pull information from syllabus even when unneccessary (was doing this because it didn't have conversational context)
# FIXED: Generative AI are not using previous queries in context
# ISSUE: Doesn't know when to use document querying model or generative AI (can't run DQ model always because it doesn't work with CoversationChain)
from langchain.chains.question_answering import load_qa_chain

memory = ConversationBufferWindowMemory(memory_key="history", k=4, return_only_outputs=True)
chain = ConversationChain(llm=llm, memory=memory, prompt=chainPrompt, verbose=False)

# qa_memory = ConversationBufferMemory(memory_key="chat_history", input_key="question", k=4, return_messages=True)
# qa_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, condense_question_prompt=qaPrompt, memory=qa_memory, verbose=False)

qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce")
qa_document_chain = AnalyzeDocumentChain(combine_docs_chain=qa_chain)


vectordbkwargs = {"search_distance": 0.9}
doc = ""
with open("./mock-data/SyllabusText.txt", "r") as f: doc = f.read()

while True:
	query = input("Query: ")
	if (query == ""): continue
	if (query == "goodbye"): 
		# if qa_convo != None:
		# 	print("\n\n\n", qa_convo)
		break

	#qaOutput = qa_llm({'query': query})
	#genOutput = llm()
	# convo = chain.predict(input=query)
	# qa_convo = qa_chain({"question": query, "vectordbkwargs": vectordbkwargs})
	qa_optim_convo = qa_document_chain.run(input_document=doc, question=query)


	print("Human: ", query)
	#print("Query Answer Model: ", qaOutput['result'])
	#print("General LLM: ", genOutput)
	# print("Conversational LLM: ", convo)
	# print("Conversational QA LLM:", qa_convo['answer'])
	print("Optimized Coversational QA LLM:", qa_optim_convo)


	

Human:  How to contact the professor?
Optimized Coversational QA LLM:  The professor's office phone number is 972-883-2185, and their email address is anjum.chida@utdallas.edu.


#### Zero Shot Classification

In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

text_to_classify = "how much of my grade is the final?"
candidate_labels = ["Algorithms", "Design", "Grading", "Course Description", "Room and Time", "Time Complexity", "Space Complexisity"]

classifier(text_to_classify, candidate_labels)


{'sequence': 'how much of my grade is the final?',
 'labels': ['Grading',
  'Time Complexity',
  'Room and Time',
  'Design',
  'Course Description',
  'Space Complexisity',
  'Algorithms'],
 'scores': [0.7427520751953125,
  0.07356199622154236,
  0.04839838668704033,
  0.04265403002500534,
  0.03611021861433983,
  0.030494609847664833,
  0.02602861449122429]}

#### Testing having all 3 models working at the same time

In [24]:
from transformers import pipeline

QA_ONLY = True
CONVERSATION_ONLY = False
assert not(QA_ONLY and CONVERSATION_ONLY), "Cannot run both QA and Conversation Only"

# Set up memory and langchain for both QA and Conversation
memory = ConversationBufferWindowMemory(memory_key="history", k=4, return_only_outputs=True)
chain = ConversationChain(llm=llm, memory=memory, prompt=chainPrompt, verbose=True)

# question_generator_chain = LLMChain(llm=llm, prompt=qaPrompt, verbose=False)
qa_memory = ConversationBufferMemory(memory_key="chat_history", input_key="question", k=4, return_messages=True)
qa_chain = ConversationalRetrievalChain.from_llm(llm=llm, chain_type="stuff", retriever=retriever, verbose=False)

# Set up pipeline for Zero-Shot Classification
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
candidate_labels = ["Algorithms", "Design", "Proofs", "Asymptomatic analysis", "Big O", "Dynamic Programming", "Greedy Methods", "Grading", "Course Description", "Class Time", "Class Location", "Professor Contact Info", "Participation", "Assignment Detals"]
syllabus_labels = ["Grading", "Course Description", "Class Time", "Class Location", "Professor Contact Info", "Participation", "Assignment Detals"]
classification_object = None

while True:
	# Get Inputs
	query = input("Query: ")
	if (query == ""): continue
	if (query == "goodbye"):
		if(classification_object != None):
			print("\n\n\n", classification_object)
		break

	# Print Query
	print("Human: ", query)

	# Classify Query as either QA or Conversation
	classification_object = classifier(query, candidate_labels)
	classification = classification_object['labels'][0]

	modelType = {"QA": 0.0, "Conversation": 0.0}
	for label in classification_object['labels']:
		if label in syllabus_labels:
			modelType["QA"] += classification_object['scores'][classification_object['labels'].index(label)]
		else:
			modelType["Conversation"] += classification_object['scores'][classification_object['labels'].index(label)]

	# Use the model that has the highest score
	if (modelType["QA"] > modelType["Conversation"] or QA_ONLY) and not(CONVERSATION_ONLY):
		qa_convo = qa_chain({"question": query, "chat_history": ""})
		print("Syllabus LLM:", qa_convo['answer'])
		# print("\n\n", qa_chain.memory)
	else:
		convo = chain.predict(input=query)
		print("Conversational LLM: ", convo)



Human:  when is assignment 4 due?
Syllabus LLM:  Assignment 4 is due on eLearning on Dec 5.



 {'sequence': 'when is assignment 4 due?', 'labels': ['Class Time', 'Assignment Detals', 'Grading', 'Asymptomatic analysis', 'Participation', 'Professor Contact Info', 'Course Description', 'Design', 'Class Location', 'Algorithms', 'Dynamic Programming', 'Big O', 'Proofs', 'Greedy Methods'], 'scores': [0.15747833251953125, 0.10639947652816772, 0.1058751568198204, 0.09549389779567719, 0.08384324610233307, 0.07712666690349579, 0.05866086855530739, 0.05738820880651474, 0.05236165598034859, 0.049499817192554474, 0.04115424305200577, 0.04115214943885803, 0.037145670503377914, 0.03642052412033081]}
