In [1]:
import json
import re
import pandas as pd

from typing import AsyncIterator, Iterator

from datetime import datetime, timedelta



from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_chroma import Chroma


import os
from dotenv import load_dotenv

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
# from langchain_ollama import OllamaEmbeddings

from langchain_community.vectorstores.utils import filter_complex_metadata


import shutil

import argparse
from tqdm import tqdm

import chromadb
import uuid

import pickle

In [2]:
# Read settings from a .env file; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)

EMBEDDING_URL = os.getenv("EMBEDDING_URL")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
DB_PATH = os.getenv("DB_PATH")

# Remove any existing directory tree at DB_PATH. os.getenv returns None when the
# variable is unset, and shutil.rmtree expects a real path, so skip the removal
# in that case instead of handing it None.
if DB_PATH:
    shutil.rmtree(DB_PATH, ignore_errors=True)


# Use the Ollama endpoint only when both the URL and the model name are
# configured; otherwise fall back to OpenAI's text-embedding-3-small.
if EMBEDDING_URL and EMBEDDING_MODEL:
    emb = OllamaEmbeddings(base_url=EMBEDDING_URL, model=EMBEDDING_MODEL, show_progress=True)
    print("model: ", EMBEDDING_MODEL, "base_url: ", EMBEDDING_URL)
else:
    emb = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)
    print("model: text-embedding-3-small")



model:  bge-m3 base_url:  http://localhost:11434


  emb = OllamaEmbeddings(base_url=EMBEDDING_URL, model=EMBEDDING_MODEL, show_progress=True)


In [3]:
# class pklLoader(BaseLoader):
#     def __init__(self, file_path: str, doc_type: str = None) -> None:
#         self.file_path = file_path
#         self.doc_type = doc_type

    
#     def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
#         # Load the data from the file
#         # df = pd.read_pickle(self.file_path)
#         with open(self.file_path, 'rb') as f:
#             pkl_list = pickle.load(f)
        
#         for i, row in pkl_list:
#             yield Document(
#                 page_content=row['content'],
#                 metadata=row['metadata'],
#                 doc_type=self.doc_type
#                 )


In [4]:
# def create_vectorstore(docs_path, db_path = ".chroma_db/"):
#     all_docs = []
#     file_name_list = ['events.pkl', 'latest_updates.pkl', 'fellows.pkl', 'pages.pkl']
#     doc_type_list = ['event', 'update', 'fellow', 'page']
#     for file_name, doc_type in zip(file_name_list, doc_type_list):
#         loader = pklLoader(os.path.join(docs_path, file_name), doc_type)
#         all_docs.extend(loader.load())
#     print("Number of documents: ", len(all_docs))
#     return Chroma.from_documents(all_docs, emb, db_path)

In [5]:
def loadPKL(file_path, doc_type):
    """Load one pickled list of documents and embed their contents.

    The pickle at ``file_path`` must hold an iterable of rows, each indexable
    by ``'content'`` and ``'metadata'``.

    Args:
        file_path: Path of the ``.pkl`` file to read.
        doc_type: Label written under the ``doc_type`` key of every row's
            metadata (replacing a ``doc_type`` key the row already carries).

    Returns:
        A ``(page_contents, metadata, embeddings)`` tuple of three parallel
        lists. ``embeddings`` is the result of ``emb.embed_documents`` on the
        contents, or ``[]`` when the file holds no rows.

    Raises:
        KeyError: If a row lacks ``'content'`` or ``'metadata'``.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only point this at files produced by a trusted pipeline.
    with open(file_path, 'rb') as f:
        pkl_list = pickle.load(f)

    page_contents = [row['content'] for row in pkl_list]
    # dict(None, ...) raises TypeError, so treat a None metadata entry as an
    # empty mapping; the row still gets its doc_type tag.
    metadata = [
        dict(row['metadata'] if row['metadata'] is not None else {}, doc_type=doc_type)
        for row in pkl_list
    ]

    # Nothing to embed: return early rather than calling the embedding backend
    # with an empty batch.
    if not page_contents:
        return page_contents, metadata, []

    # `emb` is the module-level embedding client created in the config cell.
    embeddings = emb.embed_documents(page_contents)
    return page_contents, metadata, embeddings
    
    

In [6]:
def create_chroma_client(docs_path, db_path = ".chroma_db/"):
    """Embed the pickled document sets and add them to a persistent Chroma store.

    Reads ``events.pkl``, ``latest_updates.pkl``, ``fellows.pkl`` and
    ``pages.pkl`` from ``docs_path`` via ``loadPKL``, tags them with the
    doc types ``event``, ``update``, ``fellow`` and ``page`` respectively,
    and adds everything to the collection named ``"langchain"``.

    Every document gets a fresh random UUID as its ID and the collection is
    opened with ``get_or_create_collection``, so calling this again against
    the same ``db_path`` adds every document a second time.

    Args:
        docs_path: Directory containing the four ``.pkl`` files.
        db_path: Directory handed to ``chromadb.PersistentClient``.

    Returns:
        The ``chromadb.PersistentClient`` that owns the populated collection.
    """
    file_name_list = ['events.pkl', 'latest_updates.pkl', 'fellows.pkl', 'pages.pkl']
    doc_type_list = ['event', 'update', 'fellow', 'page']

    all_contents = []
    all_metadatas = []
    all_embeddings = []

    for file_name, doc_type in zip(file_name_list, doc_type_list):
        print("Loading ", doc_type)
        page_contents, metadata, embeddings = loadPKL(os.path.join(docs_path, file_name), doc_type)
        all_contents.extend(page_contents)
        all_metadatas.extend(metadata)
        all_embeddings.extend(embeddings)

    # Swap empty metadata dicts for None. The previous loop only rebound its
    # loop variable, which left the list itself untouched, so build a new list.
    # NOTE(review): presumably needed because Chroma rejects empty metadata
    # dicts while accepting None — confirm against the installed chromadb.
    all_metadatas = [metadata if metadata else None for metadata in all_metadatas]

    all_ids = [str(uuid.uuid4()) for _ in range(len(all_contents))]

    persistent_client = chromadb.PersistentClient(path=db_path)
    # NOTE(review): "langchain" presumably matches the default collection name
    # of the LangChain Chroma wrapper so it can open this store — confirm.
    collection = persistent_client.get_or_create_collection("langchain")
    collection.add(ids=all_ids, documents=all_contents, embeddings=all_embeddings, metadatas=all_metadatas)


    return persistent_client



In [7]:
# Build the store in the directory configured via DB_PATH (the one the config
# cell clears), so a re-run does not append to a stale store elsewhere. Fall
# back to the function's default location when DB_PATH is unset.
chroma_client = create_chroma_client(docs_path="data/", db_path=DB_PATH or ".chroma_db/")

Loading  event


OllamaEmbeddings:   0%|          | 0/45 [00:00<?, ?it/s]

OllamaEmbeddings: 100%|██████████| 45/45 [00:37<00:00,  1.21it/s]


Loading  update


OllamaEmbeddings: 100%|██████████| 8/8 [00:05<00:00,  1.57it/s]


Loading  fellow


OllamaEmbeddings: 100%|██████████| 47/47 [00:28<00:00,  1.65it/s]


Loading  page


OllamaEmbeddings: 100%|██████████| 11/11 [00:07<00:00,  1.55it/s]
