In [2]:
# %pip install -r ../requirements.txt
# %pip install --quiet --upgrade  langchain langchain-community langchainhub gpt4all chromadb bs4 torch transformers
# !pip freeze >> ../requirements.txt

import os, time
import chromadb

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader

from chromadb.errors import InvalidDimensionException

In [3]:
BASE_DIR = '/home/raj/nlp/cmu-rag/rag'
BASE_DIR_TXT_FILES = '/home/raj/nlp/cmu-rag/helper/combined_txt_files/'
DATABASE_PATH = '/home/raj/nlp/cmu-rag/rag/chroma/txt/'

CPU_RAM = 26

# os.chdir(BASE_DIR)
# !touch embedding_options.txt
# !ollama list > embedding_options.txt

embedding_options = ['tinyllama', 'llama2', 'gemma', 'everythiglm', 'mistral', 'neural-chat', 'openchat']
# try:
#     with open('embedding_options.txt', 'r') as file:
#         for line in file.readlines()[1:]:
#             line_content = line.split()
#             if float(line_content[2]) < 0.8 * CPU_RAM or line_content[3] == 'MB':
#                 embedding_options.append(line_content[0].split(":")[0])
# except Exception as e:
#     print("Error: ", e)
# finally:
#     file.close()
#     !rm embedding_options.txt

print("Available embeddings: ", embedding_options)

documents = []
for file in os.listdir(BASE_DIR_TXT_FILES):
    if "schedule" not in file:
        print("Processing text file: ", file)
        loader = TextLoader(BASE_DIR_TXT_FILES + file)
        documents.extend(loader.load())
        

print("Splitting documents into chunks")
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
doc_text = splitter.split_documents(documents)

# print(doc_text)

embedding_times = [0] * len(embedding_options)

print("Creating embeddings...")
for embedding in embedding_options:
    print("Preparing embedding: ", embedding)
    start_time = time.time()
    vector_database = None
    try: 
        # chroma_client = chromadb.PersistentClient(path=DATABASE_PATH)
        # collection = chroma_client.create_collection(name=embedding, embedding_function=OllamaEmbeddings(model = embedding))
        # collection.add_documents(all_text)
        vector_database = Chroma.from_documents(documents = doc_text, embedding=OllamaEmbeddings(model = embedding), persist_directory=DATABASE_PATH+embedding)
        vector_database.persist()
    except InvalidDimensionException as e:
        print("Invalid dimension for embedding: ", embedding)
        continue
    except Exception as e:
        print("Error: ", e)
        continue
    finally:
        end_time = time.time()
    print("Finished embedding: " + embedding + " in " + str(end_time - start_time) + " seconds")
    embedding_times[embedding_options.index(embedding)] = end_time - start_time
print("Done all embeddings of all documents in " + str(sum(embedding_times)) + " seconds!")

Available embeddings:  ['tinyllama', 'llama2', 'gemma', 'everythiglm', 'mistral', 'neural-chat', 'openchat']
Processing text file:  history_of_cmu
Processing text file:  Kiltie Band
Processing text file:  spring carnival and renuion weekend.txt
Processing text file:  Tartan Facts
Processing text file:  academic_calendars
Processing text file:  program_handbooks
Processing text file:  lti_faculty
Processing text file:  history_of_scs
Processing text file:  About Scottie
Processing text file:  lti_papers_metadata
Processing text file:  Buggy News
Processing text file:  lti_programs
Splitting documents into chunks
Creating embeddings...
Preparing embedding:  tinyllama
Finished embedding: tinyllama in 151.55805349349976 seconds
Preparing embedding:  llama2
Finished embedding: llama2 in 703.6288022994995 seconds
Preparing embedding:  gemma
Finished embedding: gemma in 816.3149952888489 seconds
Preparing embedding:  everythiglm
Error:  Error raised by inference API HTTP code: 404, {"error":"