In [2]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Set up a BGE embedding model through the HuggingFace wrapper.
# `model_kwargs` is handed to the wrapper for model construction (here it
# selects the "cuda" device); `encode_kwargs` is handed to it for encoding
# (here it requests normalized embeddings).
model_name = "BAAI/bge-large-en"
model_kwargs = dict(device="cuda")
encode_kwargs = dict(normalize_embeddings=True)
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)


In [None]:
# Benchmark cell: load every non-schedule text file, split the corpus into
# overlapping chunks, then build and persist one Chroma store per Ollama
# embedding model while recording how long each build takes.

# --- Configuration ---------------------------------------------------------
BASE_DIR = '/home/raj/nlp/cmu-rag/rag'
BASE_DIR_TXT_FILES = '/home/raj/nlp/cmu-rag/helper/combined_txt_files/'
DATABASE_PATH = '/home/raj/nlp/cmu-rag/rag/chroma/txt/'

# Not referenced in this cell; presumably the machine's RAM in GB -- confirm.
CPU_RAM = 26

# Ollama model tags to benchmark. 'everythinglm' was previously misspelled
# 'everythiglm', which made that run always end in the error branch.
embedding_options = ['tinyllama', 'llama2', 'gemma', 'everythinglm', 'mistral', 'neural-chat', 'openchat']

print("Available embeddings: ", embedding_options)

# --- Load the corpus -------------------------------------------------------
documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the chunk
# order (and therefore each store's contents) reproducible between runs.
for file in sorted(os.listdir(BASE_DIR_TXT_FILES)):
    # Files with "schedule" in their name are left out of the corpus.
    if "schedule" in file:
        continue
    print("Processing text file: ", file)
    # os.path.join works whether or not the directory has a trailing slash.
    loader = TextLoader(os.path.join(BASE_DIR_TXT_FILES, file))
    documents.extend(loader.load())

# --- Chunk the corpus ------------------------------------------------------
print("Splitting documents into chunks")
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
doc_text = splitter.split_documents(documents)

# --- Build one vector store per embedding model ----------------------------
# One slot per model, in the same order as embedding_options; a model whose
# build fails keeps its initial 0.
embedding_times = [0] * len(embedding_options)

print("Creating embeddings...")
for position, embedding in enumerate(embedding_options):
    print("Preparing embedding: ", embedding)
    # Reset so a failed build never leaves the previous model's store bound
    # to this name.
    vector_database = None
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()
    try:
        # Each model writes to its own sub-directory of DATABASE_PATH.
        vector_database = Chroma.from_documents(
            documents=doc_text,
            embedding=OllamaEmbeddings(model=embedding),
            persist_directory=os.path.join(DATABASE_PATH, embedding),
        )
        vector_database.persist()
    except InvalidDimensionException:
        print("Invalid dimension for embedding: ", embedding)
        continue
    except Exception as e:
        # Deliberately broad: one failing model must not abort the benchmark
        # for the remaining models.
        print("Error: ", e)
        continue
    elapsed = time.perf_counter() - start_time
    print(f"Finished embedding: {embedding} in {elapsed} seconds")
    # Index by loop position rather than list.index(), which is a linear
    # search and would hit the wrong slot for a duplicated tag.
    embedding_times[position] = elapsed
print(f"Done all embeddings of all documents in {sum(embedding_times)} seconds!")