In [34]:
import os, time
import chromadb

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough, RunnablePick
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from langchain import hub

from chromadb.errors import InvalidDimensionException

In [33]:
DATABASE_PATH = '/home/raj/nlp/cmu-rag/rag/chroma/txt/'
# embedding_name = 'llama2'
# persist_directory = DATABASE_PATH + embedding_name
# embedding = OllamaEmbeddings(model=embedding_name)
# vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedding)

EMBEDDING_OPTIONS = ['tinyllama', 'llama2', 'gemma', 'mistral', 'neural-chat', 'openchat']
VECTOR_STORE_DIRECTORIES = [DATABASE_PATH + embedding_name for embedding_name in EMBEDDING_OPTIONS]

QUESTION_CATEGORIES = ['history']
ANNOTATION_DIR = '/home/raj/nlp/cmu-rag/annotation/test/'


In [35]:
def load_vector_store(dir, embedding_name):
    try:
        vector_store = Chroma(persist_directory=dir, embedding_function=OllamaEmbeddings(model=embedding_name))
    except InvalidDimensionException:
        vector_store = Chroma(persist_directory=dir, embedding_function=OllamaEmbeddings(model=embedding_name), force=True)
    return vector_store

def load_vector_store_non_ollama_embedding(dir, embedding_model):
    try:
        vector_store = Chroma(persist_directory=dir, embedding_function=embedding_model)
    except InvalidDimensionException:
        vector_store = Chroma(persist_directory=dir, embedding_function=embedding_model, force=True)
    return vector_store

def create_chain(vector_store, llm_model = 'llama2'):
    rag_prompt_llama = hub.pull("rlm/rag-prompt-llama")
    prompt_message = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use as few words as possible and keep the answer concise. Do not mention the context in your response.
    Question: {question} 
    Context: {context} 
    Answer:"""

    rag_prompt_llama.messages[0].prompt.template = prompt_message

    llm = Ollama(model = llm_model)

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    retriever = vector_store.as_retriever()

    qa_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | rag_prompt_llama
        | llm
        | StrOutputParser()
    )

    return qa_chain


In [36]:
def get_questions(category='', dir = ANNOTATION_DIR):
    questions = []
    for file in os.listdir(dir + category):
        if file.endswith('questions.txt'):
            with open(dir + category + '/' + file, 'r') as f:
                for line in f.readlines():
                    questions.append(line.strip())
    return questions

def generate_answers(qa_chain, questions):
    if not questions:
        raise ValueError("No questions to answer")
    if not qa_chain:
        raise ValueError("No qa_chain to answer questions")
    answers = []
    for question in questions:
        if not question:
            continue
        answer = dict()
        answer_raw = qa_chain.invoke(question)
        answer["raw"] = answer_raw
        num_lines = answer_raw.count('\n')
        answer["num_lines"] = num_lines
        lines = answer_raw.split('\n')
        if num_lines == 0:
            answer["processed"] = lines[0]
        else:
            answer_lines = []
            for line in lines:
                if "i don't know" not in line.lower():
                    answer_lines.append(line)
            answer["processed"] = " ".join(answer_lines)
        answers.append(answer)

    return answers

def write_answers(answers, file_name, append = False):
    with open(file_name, 'a' if append else 'w') as f:
        for answer in answers:
            f.write(answer["processed"] + '\n')


In [38]:
vector_stores, question_categories = ['llama2', 'bge-large-en'], ['history']
# vector_stores, question_categories = VECTOR_STORE_DIRECTORIES, QUESTION_CATEGORIES
embedding_models = [OllamaEmbeddings(model='llama2'), 
                    HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en", model_kwargs={"device": "cuda"}, encode_kwargs={"normalize_embeddings": True})
                    ]


for dir in vector_stores:
    embedding_name = 'llama2'
    # vector_store = load_vector_store(dir, embedding_name)
    vector_store = load_vector_store_non_ollama_embedding(dir, embedding_models[vector_stores.index(dir)])
    chain = create_chain(vector_store, embedding_name)
    for category in question_categories:
        print("Answering questions for category: {} with embedding {}".format(category, embedding_name))
        questions = get_questions(category, dir=ANNOTATION_DIR)
        answers = generate_answers(qa_chain=chain, questions=questions)
        write_to_file = embedding_name + '_answers.txt'
        write_answers(answers, ANNOTATION_DIR + category + '/' + write_to_file)
        print(f"Answers written to {ANNOTATION_DIR + category + '/' + write_to_file}")

Answering questions for category: history with embedding llama2
Answers written to /home/raj/nlp/cmu-rag/annotation/test/history/llama2_answers.txt
Answering questions for category: history with embedding llama2
Answers written to /home/raj/nlp/cmu-rag/annotation/test/history/llama2_answers.txt


In [15]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-large-en"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


import torch

#clear cache on cuda
torch.cuda.empty_cache()

embedding_name = 'bge-large-en'
vector_store = load_vector_store_non_ollama_embedding(dir=DATABASE_PATH+'bge-large-en', embedding_model=hf)
chain = create_chain(vector_store)
questions = get_questions(category, dir=ANNOTATION_DIR)
answers = generate_answers(qa_chain=chain, questions=questions)
write_to_file = embedding_name + '_answers.txt'
write_answers(answers, ANNOTATION_DIR + category + '/' + write_to_file)


NameError: name 'category' is not defined

In [30]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-large-en"
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)


import torch

#clear cache on cuda
torch.cuda.empty_cache()

TEST_QUESTION_PATH='/home/raj/nlp/cmu-rag/rveerara/system_outputs/'
embedding_name = 'bge-large-en'
vector_store = load_vector_store_non_ollama_embedding(dir=DATABASE_PATH+'bge-large-en', embedding_model=hf)
chain = create_chain(vector_store)
questions = get_questions(dir=TEST_QUESTION_PATH)
print(questions)

for i in range(0, len(questions), 10):
    question_set = questions[i:min(i+10, len(questions))]
    answers = generate_answers(qa_chain=chain, questions=question_set)
    write_to_file = TEST_QUESTION_PATH + 'system_output_1.txt'
    print("writing to file for questions: ", i, " to ", i+10, " to file: ", write_to_file)
    write_answers(answers, write_to_file, append=True)

writing to file for questions:  0  to  10  to file:  /home/raj/nlp/cmu-rag/rveerara/system_outputs/system_output_1.txt
writing to file for questions:  10  to  20  to file:  /home/raj/nlp/cmu-rag/rveerara/system_outputs/system_output_1.txt
writing to file for questions:  20  to  30  to file:  /home/raj/nlp/cmu-rag/rveerara/system_outputs/system_output_1.txt
writing to file for questions:  30  to  40  to file:  /home/raj/nlp/cmu-rag/rveerara/system_outputs/system_output_1.txt
writing to file for questions:  40  to  50  to file:  /home/raj/nlp/cmu-rag/rveerara/system_outputs/system_output_1.txt
writing to file for questions:  50  to  60  to file:  /home/raj/nlp/cmu-rag/rveerara/system_outputs/system_output_1.txt
writing to file for questions:  60  to  70  to file:  /home/raj/nlp/cmu-rag/rveerara/system_outputs/system_output_1.txt
writing to file for questions:  70  to  80  to file:  /home/raj/nlp/cmu-rag/rveerara/system_outputs/system_output_1.txt
writing to file for questions:  80  to  9

In [2]:
import argparse

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.document_compressors import LLMChainFilter, EmbeddingsFilter, FlashrankRerank
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers import ContextualCompressionRetriever

from langchain import hub

from chromadb.errors import InvalidDimensionException

VECTOR_DATABASES_DIR_PATH = '/home/raj/nlp/cmu-rag/chroma_vector_database/'
VECTOR_STORE_DEFAULT = 'bge-500-0.2'
EMMBEDDING_DEFAULT = 'bge'
ANNOTATION_DIR = '/home/raj/nlp/cmu-rag/rveerara/data/test/'
QUESTIONS_FILE = ANNOTATION_DIR + 'questions.txt'
ANSWERS_FILE = ANNOTATION_DIR + 'answers.txt'
PROMPT_MESSAGE_LLAMA2 = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. Use as few words as possible and keep the answer concise. Do not mention the context in your response.
    Question: {question} 
    Context: {context} 
    Answer:"""

def load_vector_store(dir, embedding_model = OllamaEmbeddings()):
    try:
        vector_store = Chroma(persist_directory=dir, embedding_function=embedding_model)
    except InvalidDimensionException:
        vector_store = Chroma(persist_directory=dir, embedding_function=embedding_model, force=True)
    return vector_store


def create_chain(vector_store, inference_model = Ollama(model='llama2'), prompt_message = PROMPT_MESSAGE_LLAMA2, embedding_model = OllamaEmbeddings()):
    
    rag_prompt_llama = hub.pull("rlm/rag-prompt-llama")
    rag_prompt_llama.messages[0].prompt.template = prompt_message
    llm = inference_model

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)
    
    retriever = vector_store.as_retriever(search_kwargs={"k": 20})
    # embeddings_filter = EmbeddingsFilter(embeddings=embedding_model, similarity_threshold=0.5)
    compressor = FlashrankRerank()
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )

    qa_chain = (
        {"context": compression_retriever | format_docs, "question": RunnablePassthrough()}
        | rag_prompt_llama
        | llm
        | StrOutputParser()
    )

    return qa_chain


def get_questions(file_name = QUESTIONS_FILE):
    
    if not file_name.endswith('questions.txt'):
        raise ValueError("Invalid file name")
    
    questions = []
    with open(file_name, 'r') as file:
        for line in file.readlines():
            questions.append(line.strip())
    
    return questions


def generate_answers(qa_chain, questions):
    if not questions:
        raise ValueError("No questions to answer")
    if not qa_chain:
        raise ValueError("No qa_chain to answer questions")
    
    answers = []
    for question in questions:
        if not question:
            continue
        answer = dict()
        answer_raw = qa_chain.invoke(question)
        answer["raw"] = answer_raw
        num_lines = answer_raw.count('\n')
        answer["num_lines"] = num_lines
        lines = answer_raw.split('\n')
        if num_lines == 0:
            answer["processed"] = lines[0]
        else:
            answer_lines = []
            for line in lines:
                if "i don't know" not in line.lower():
                    answer_lines.append(line)
            answer["processed"] = " ".join(answer_lines)
        answers.append(answer)

    return answers


def write_answers(answers, file_name, append = False):
    try:
        with open(file_name, 'a' if append else 'w') as f:
            for answer in answers:
                f.write(answer["processed"] + '\n')
    except Exception as e:
        raise Exception("Error writing answers to file: " + str(e))

def do_rag_in_chunks(vector_store_path=VECTOR_DATABASES_DIR_PATH+VECTOR_STORE_DEFAULT,
        embedding_model = None,
        inference_model=Ollama(model='llama2'),
        questions_file_name=QUESTIONS_FILE,
        answers_file_name=ANSWERS_FILE,
        append=False,
        questions_to_process_at_once=50,):
    if not embedding_model:
        raise ValueError("Invalid embedding model")
    vector_store = load_vector_store(vector_store_path, embedding_model)
    qa_chain = create_chain(vector_store, inference_model=inference_model, embedding_model=embedding_model)
    questions = get_questions(file_name=questions_file_name)
    num_questions = len(questions)
    for i in range(0, num_questions, questions_to_process_at_once):
        questions_chunk = questions[i:min(i+questions_to_process_at_once, num_questions)]
        answers = generate_answers(qa_chain, questions_chunk)
        write_answers(answers, answers_file_name, append=True) if i > 0 else write_answers(answers, answers_file_name, append=append)
        print(f"Processed {i+questions_to_process_at_once} questions out of {num_questions}")


def get_hugging_face_embedding_model():
    model_name = "BAAI/bge-large-en"
    model_kwargs = {"device": "cuda"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, 
        model_kwargs=model_kwargs, 
        encode_kwargs=encode_kwargs
    )
    return hf

def parse_arguments():
    
    parser = argparse.ArgumentParser(description='RAG Chain')
    parser.add_argument('--vector', type=str, default=VECTOR_DATABASES_DIR_PATH+VECTOR_STORE_DEFAULT, help='Path to the directory containing the vector store')
    parser.add_argument('--embed', type=str, default=EMMBEDDING_DEFAULT, help='Embedding model to be used for loading embeddings')
    parser.add_argument('--model', type=str, default='llama2', help='Model name to be used for read documents and generate answers')
    parser.add_argument('--questions', type=str, default=QUESTIONS_FILE, help='Path to the file containing questions')
    parser.add_argument('--answers', type=str, default=ANSWERS_FILE, help='Path to the file where answers will be written')
    parser.add_argument('--append', type=bool, default=False, help='Append answers to the file')
    
    args = parser.parse_args()

    return args

# embedder_path = '/home/raj/nlp/cmu-rag/chroma_vector_database/bge-500-0.2'
# embedding_model = get_hugging_face_embedding_model()
# vector_store = load_vector_store_non_ollama_embedding(embedder_path, embedding_model)
# chain = create_chain(vector_store, embedding_model=embedding_model)
# questions = get_questions(QUESTIONS_FILE)

# if __name__ == "__main__":

#     print("Starting RAG Chain")

#     args = parse_arguments()
#     vector_store_path = args.vector
#     embedding_model_option = args.embed
#     model_name = args.model
#     questions_file_name = args.questions
#     answers_file_name = args.answers
#     append = args.append

#     embedding_model = get_hugging_face_embedding_model() if embedding_model_option == 'bge' else OllamaEmbeddings(model=embedding_model_option)
#     inference_model = Ollama(model=model_name)
    
#     print("Starting RAG Chain with the following parameters:")
#     print(f"\tVector Store Path: {vector_store_path}")
#     print(f"\tEmbedding Model: {embedding_model.__class__.__name__}")
#     print((f"\tInference Model: {inference_model.__class__.__name__}") + (f" ({model_name})" if model_name else ""))
#     print(f"\tQuestions File: {questions_file_name}")
#     print(f"\tAnswers File: {answers_file_name}")
#     print(f"\tAppend: {append}")

#     do_rag_in_chunks(vector_store_path=vector_store_path,
#         embedding_model=embedding_model,
#         inference_model=inference_model,
#         questions_file_name=questions_file_name,
#         answers_file_name=answers_file_name,
#         append=append)
    
#     print("Done")


In [6]:
embedder_path = '/home/raj/nlp/cmu-rag/chroma_vector_database/bge-3000-0.2'
embedding_model = get_hugging_face_embedding_model()
vector_store = load_vector_store(embedder_path, embedding_model)
# chain = create_chain(vector_store, embedding_model=embedding_model)
questions = get_questions(QUESTIONS_FILE)



In [7]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

In [10]:
retriever = vector_store.as_retriever(search_kwargs={"k": 15})
# embeddings_filter = EmbeddingsFilter(embeddings=embedding_model, similarity_threshold=0.5)

embeddings = get_hugging_face_embedding_model()
embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.6)

flashranker = FlashrankRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=embeddings_filter, base_retriever=retriever
)

# question = questions[0]
question = "In which conference was the paper titled \"Why do Nearest Neighbor Language Models Work?\" published ?"
print(question)

docs= compression_retriever.get_relevant_documents(question, k=5)
# doc1 = compression_retriever.get_relevant_documents(question, k=50)

# print(len(doc1))
pretty_print_docs(docs)

import nltk
from nltk import sent_tokenize, word_tokenize

cnt = 0

for doc in docs:
    cnt += len(word_tokenize(doc.page_content))

print(cnt)

In which conference was the paper titled "Why do Nearest Neighbor Language Models Work?" published ?
Document 1:

Title: Why do Nearest Neighbor Language Models Work?
Abstract: Language models (LMs) compute the probability of a text by sequentially computing a representation of an already-seen context and using this representation to predict the next word. Currently, most LMs calculate these representations through a neural network consuming the immediate previous context. However recently, retrieval-augmented LMs have shown to improve over standard neural LMs, by accessing information retrieved from a large datastore, in addition to their standard, parametric, next-word prediction. In this paper, we set out to understand why retrieval-augmented language models, and specifically why k-nearest neighbor language models (kNN-LMs) perform better than standard parametric LMs, even when the k-nearest neighbor component retrieves examples from the same training set that the LM was originally 

In [68]:
str1 = """

Instructor  Saadati teaches course number  49750 ,titled ' Integrated Thinking for Innovation' in semester  Fall 2023, under the category/department of  Integrated Innovation Institute, for section  A1. The course consists of  6.0 units and is taught on the following days:  in the building '  ', which is located at   . The course begins at    and ends at   .
Instructor  Carja teaches course number  02702 ,titled ' Computational Biology Seminar' in semester  Spring 2024, under the category/department of  Computational Biology, for section  A. The course consists of  3.0 units and is taught on the following days: Friday in the building ' TBA', which is located at  Pittsburgh, Pennsylvania. The course begins at  1030AM and ends at  1150AM.
----------------------------------------------------------------------------------------------------

"""


str2 = """
Title: Crossing the Threshold: Idiomatic Machine Translation through Retrieval Augmentation and Loss Weighting
Abstract: Idioms are common in everyday language, but often pose a challenge to translators because their meanings do not follow from the meanings of their parts. Despite significant advances, machine translation systems still struggle to translate idiomatic expressions. We provide a simple characterization of idiomatic translation and related issues. This allows us to conduct a synthetic experiment revealing a tipping point at which transformer-based machine translation models correctly default to idiomatic translations. To expand multilingual resources, we compile a dataset of ~4k natural sentences containing idiomatic expressions in French, Finnish, and Japanese. To improve translation of natural idioms, we introduce two straightforward yet effective techniques: the strategic upweighting of training loss on potentially idiomatic sentences, and using retrieval-augmented models. This not only improves the accuracy of a strong pretrained MT model on idiomatic sentences by up to 13% in absolute accuracy, but also holds potential benefits for non-idiomatic sentences.
Authors: Emmy Liu, Aditi Chaudhary, Graham Neubig
Publication Venue: Conference on Empirical Methods in Natural Language Processing, Empir Method Nat Lang Process, Empirical Methods in Natural Language Processing, Conf Empir Method Nat Lang Process, EMNLP
Year of Publication: 2023
Summary: Not Available
"""
print(len(str2))
print(len(word_tokenize(str2)))

1499
234
