In [2]:
# %pip install -r ../requirements.txt
# %pip install --quiet --upgrade  langchain langchain-community langchainhub gpt4all chromadb bs4 torch transformers
# !pip freeze >> ../requirements.txt

import os, time
import chromadb

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from chromadb.errors import InvalidDimensionException

In [3]:
# Filesystem layout for this notebook. Every location is derived from a single
# project root so that the checkout can be relocated by editing one string.
_PROJECT_ROOT = '/home/raj/nlp/cmu-rag'

BASE_DIR = _PROJECT_ROOT + '/rag'
# Directory holding the combined .txt sources (trailing slash is relied upon
# by code that concatenates file names onto it).
BASE_DIR_TXT_FILES = _PROJECT_ROOT + '/helper/combined_txt_files/'
# Parent directory of the per-model Chroma stores (trailing slash is relied
# upon by code that appends the embedding model name to it).
DATABASE_PATH = BASE_DIR + '/chroma/txt/'

# Names of the embedding back-ends considered in this notebook.
EMBEDDING_OPTIONS = [
    'llama2',
    'everythinglm',
    'mistral',
    'neural-chat',
    'openchat',
    'BGE',
]
# print("Available embeddings: ", EMBEDDING_OPTIONS)

In [22]:
def create_and_store_embedding(documents, embedding_model, embedding_model_name, persist_directory=DATABASE_PATH):
    """Embed ``documents`` into a persisted Chroma store and return that store.

    Args:
        documents: Non-empty sequence of documents/chunks to embed.
        embedding_model: Embedding object handed to ``Chroma.from_documents``.
        embedding_model_name: Label used in log messages and appended to
            ``persist_directory`` to form the on-disk location of the store.
        persist_directory: Parent location of the store. The model name is
            concatenated directly onto it, so it must end with a path
            separator (the default does).

    Returns:
        The persisted ``Chroma`` vector store, or ``None`` if creating or
        persisting it failed (the failure is printed, not raised).

    Raises:
        ValueError: If ``documents`` or ``embedding_model`` is missing/empty.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    if not documents or not embedding_model:
        raise ValueError("Documents and embedding model are required for creating embeddings")

    start_time = time.time()
    print("Creating embeddings for: ", embedding_model_name)
    vector_store = None
    try:
        store = Chroma.from_documents(
            documents=documents,
            embedding=embedding_model,
            persist_directory=persist_directory + embedding_model_name,
        )
        store.persist()
        # Only hand the store back once it has actually been written to disk.
        vector_store = store
        print("Embeddings created and stored for: ", embedding_model_name)
    except InvalidDimensionException:
        print("Invalid dimension for embedding: ", embedding_model_name)
    except Exception as e:
        # Deliberately best-effort: report the problem and return None.
        print("Error: ", e)
    finally:
        print("Operation completed in: ", str(time.time() - start_time), " seconds")
    # Previously nothing was returned, so callers always received None.
    return vector_store

def retrieve_embeddings(embedding_model, embedding_model_name, persist_directory=DATABASE_PATH):
    """Open the Chroma store stored under ``persist_directory + embedding_model_name``.

    Args:
        embedding_model: Embedding object used to embed queries against the
            store; it should be the same model the store was built with.
        embedding_model_name: Name appended to ``persist_directory`` to locate
            the store on disk.
        persist_directory: Parent location of the store. The model name is
            concatenated directly onto it, so it must end with a path
            separator (the default does).

    Returns:
        A ``Chroma`` vector store bound to the persisted directory, or
        ``None`` if opening it failed (the failure is printed, not raised).
    """
    vector_store = None
    try:
        # Re-open the existing collection with the Chroma constructor.
        # ``Chroma.from_documents`` is for building a store and requires a
        # ``documents`` argument, so the previous call always failed here.
        vector_store = Chroma(
            persist_directory=persist_directory + embedding_model_name,
            embedding_function=embedding_model,
        )
    except Exception as e:
        print("Error: ", e)
    return vector_store

def read_documents(dir=BASE_DIR_TXT_FILES):
    """Load every regular file found directly inside ``dir`` with ``TextLoader``.

    Args:
        dir: Directory to scan (not recursive). Defaults to
            ``BASE_DIR_TXT_FILES``.

    Returns:
        A flat list with the documents produced by each file's loader, in
        sorted file-name order.
    """
    documents = []
    # Sort so the document order is reproducible; os.listdir order is arbitrary.
    for file in sorted(os.listdir(dir)):
        # Build the path from the ``dir`` argument. The previous code always
        # used BASE_DIR_TXT_FILES, so a custom ``dir`` listed one directory
        # but read from another.
        path = os.path.join(dir, file)
        if not os.path.isfile(path):
            # Sub-directories cannot be read by TextLoader; skip them.
            continue
        documents.extend(TextLoader(path).load())
    return documents

def chunk_documents(documents, chunk_size=1000, chunk_overlap=0.1):
    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Target chunk size passed to the splitter.
        chunk_overlap: Overlap expressed as a *fraction* of ``chunk_size``
            (e.g. ``0.1`` with ``chunk_size=1000`` gives an overlap of 100).

    Returns:
        The list of chunks returned by ``split_documents``.
    """
    # The splitter expects an integer overlap; the product of a fraction and a
    # size is a float, so truncate it to a whole number before passing it on.
    overlap = int(chunk_overlap * chunk_size)
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_documents(documents)

# def create_embeddings(documents, embedding_models, persist_directory=DATABASE_PATH):
#     for model in embedding_models:
#         create_and_store_embedding(documents=documents, embedding_model=model, persist_directory=persist_directory)

NameError: name 'DATABASE_PATH' is not defined

In [10]:
# Configure the BGE embedding model: run on the GPU and normalise the
# embedding vectors it produces.
model_name = "BAAI/bge-large-en"
model_kwargs = dict(device="cuda")
encode_kwargs = dict(normalize_embeddings=True)
hf_bge_embedding = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Load the text corpus, report how many documents were read, split them into
# chunks and build the Chroma store for this embedding model.
docs = read_documents()
print(len(docs))
chunks = chunk_documents(docs)
vector_store = create_and_store_embedding(
    documents=chunks,
    embedding_model=hf_bge_embedding,
    embedding_model_name="bge-large-en",
)

13
Creating embeddings for:  bge-large-en
Embeddings created and stored for:  bge-large-en
Operation completed in:  416.1851649284363  seconds


In [24]:
# Show the fully-qualified class of the embedding object as a string.
str(type(hf_bge_embedding))

"<class 'langchain_community.embeddings.huggingface.HuggingFaceBgeEmbeddings'>"

In [6]:
from langchain_community.document_loaders import JSONLoader, PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_documents_from_csv_file(file_path):
    """Load the CSV file at ``file_path`` with ``CSVLoader``.

    Returns:
        The documents returned by the loader's ``load()`` call.
    """
    return CSVLoader(file_path=file_path).load()


def get_documents_from_json_file(file_path):
    """Load the JSON file at ``file_path`` with ``JSONLoader``.

    The loader is configured with the jq schema ``.[]`` and
    ``text_content=False``. The number of loaded documents is printed.

    Returns:
        The documents returned by the loader's ``load()`` call.
    """
    json_loader = JSONLoader(file_path=file_path, jq_schema='.[]', text_content=False)
    loaded = json_loader.load()
    print(len(loaded))
    return loaded

def get_documents_from_PDF_file(file_path='/home/raj/nlp/cmu-rag/data/documents/_Combined_All_Files/Acads_LTI_handbook-msaii-2022-2023.pdf'):
    """Load a PDF with ``PyPDFLoader`` and print how many documents it yielded.

    Args:
        file_path: PDF to read; defaults to the LTI MSAII handbook path.

    Returns:
        The documents returned by the loader's ``load()`` call.
    """
    pdf_loader = PyPDFLoader(file_path=file_path)
    loaded = pdf_loader.load()
    print(len(loaded))
    return loaded

# Alternative source kept for reference: load the SCS history events CSV instead.
# docs = get_documents_from_csv_file('/home/raj/nlp/cmu-rag/data/documents/_Combined_All_Files/History_scs_hisotry_events.csv')
# Load the default PDF; the loader function also prints the document count.
docs = get_documents_from_PDF_file()

# Optional spot checks of the first loaded document.
# print(docs[0])
# print(docs[0].page_content)


# NOTE(review): this prints the entire list of loaded documents, which can be
# very long; the commented spot checks above give a shorter preview.
print(docs)

def chunk_documents(documents, chunk_size=1000, chunk_overlap=0.5):
    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

    NOTE: this re-defines ``chunk_documents`` from an earlier cell; the only
    difference in the signature is the default ``chunk_overlap`` (0.5 here).
    Whichever cell ran last determines the active definition.

    Args:
        documents: Documents to split.
        chunk_size: Target chunk size passed to the splitter.
        chunk_overlap: Overlap expressed as a *fraction* of ``chunk_size``
            (e.g. ``0.5`` with ``chunk_size=3000`` gives an overlap of 1500).

    Returns:
        The list of chunks returned by ``split_documents``.
    """
    # The splitter expects an integer overlap; the product of a fraction and a
    # size is a float, so truncate it to a whole number before passing it on.
    overlap = int(chunk_overlap * chunk_size)
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return splitter.split_documents(documents)

chunks = chunk_documents(docs, chunk_size=3000, chunk_overlap=0.5)

# print(len(chunks))

# Length statistics over the '{'-delimited segments inside each chunk. The
# first and last piece of every split are dropped. Text that contains no '{'
# yields no segments at all, in which case both statistics are reported as 0.
segment_lengths = [
    len(segment)
    for chunk in chunks
    for segment in chunk.page_content.split('{')[1:-1]
]
cnt = len(segment_lengths)
long_chunk_len = max(segment_lengths, default=0)
# Previously the division was commented out (it divided by zero when no
# segments were found), leaving a running total in a variable named "avg".
avg_chunk_len = sum(segment_lengths) / cnt if cnt else 0

print(avg_chunk_len, long_chunk_len)

53
0 0
