In [1]:
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
import pandas as pd
import json
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.embeddings import Embeddings
import pickle
import numpy as np
import ast
from time import time
from scipy.spatial import distance
from tqdm import tqdm
import joblib

import os
import sys

# Root of the project checkout; used below to locate the local embedding model.
# Override with the PERSONAL_AI_BASE_DIR environment variable instead of editing
# this cell; the default keeps the original hard-coded location.
BASE_DIR = os.environ.get("PERSONAL_AI_BASE_DIR", "/home/dzigen/Desktop/PersonalAI/Personal-AI")

# Neo4j connection settings.
# Values are read from the environment so credentials do not have to live in the
# notebook; the defaults are the original "local" settings, so behaviour is
# unchanged when no variable is set.
#
# remote (export these as NEO4J_URL / NEO4J_USER / NEO4J_PWD to use them)
#NEO4J_URL ="bolt://31.207.47.254:7687"
#NEO4J_USER = "neo4j"
#NEO4J_PWD = "password"

# local
NEO4J_URL = os.environ.get("NEO4J_URL", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PWD = os.environ.get("NEO4J_PWD", "neo4j")

# Put the parent of the current working directory first on the import path so
# that the `src` package imported below can be resolved.
sys.path.insert(0, "../")

from src.neo4j_functions import Neo4jConnection

In [2]:
class MyEmbeddingFunction(EmbeddingFunction):
    """Adapter that lets Chroma embed documents with a LangChain-style embedder.

    The wrapped object only has to provide an ``embed_documents(texts)`` method;
    in this notebook it is a ``HuggingFaceEmbeddings`` instance.
    """

    def __init__(self, embedder) -> None:
        """Keep a reference to the embedder that produces the vectors.

        Args:
            embedder: object exposing ``embed_documents``.
        """
        self.embedder = embedder

    # The bare name `Embeddings` is rebound by the later
    # `from langchain_core.embeddings import Embeddings` import, so the Chroma
    # return type is referenced through the `chromadb` module to stay accurate.
    def __call__(self, input: Documents) -> chromadb.Embeddings:
        """Embed a batch of documents.

        Args:
            input: the texts handed over by Chroma.

        Returns:
            Whatever ``embedder.embed_documents(input)`` returns, unchanged.
        """
        return self.embedder.embed_documents(input)

In [3]:
# !!! BELOW TO CHANGE !!! 
# Name of the Chroma collection; also used as a folder name under ../data.
DATA_NAME = 'vectorized_nodes'
# File name of the joblib dump that stores the pairwise node distances.
NODES_DISTANCES_FILE = 'nodes_distances_matrix'
# Version tag; becomes a sub-folder of the save directory.
DB_VERSION = 'v7'
# Neo4j database the nodes are read from (passed as `db=` to the query).
GRAPH_DB_NAME = 'neo4j'


# Local path of the embedding model, resolved relative to BASE_DIR.
EMBEDDING_MODEL_PATH = f'{BASE_DIR}/models/intfloat/multilingual-e5-small'
# Forwarded to HuggingFaceEmbeddings as `model_kwargs`.
MODEL_KWARGS = {'device': 'cuda'}
# Text prefixes per input role; only the "passage" one is used in this notebook.
ENCODE_PROMPTS = {"query": "query: ", "passage": "passage: "}
# Forwarded to HuggingFaceEmbeddings as `encode_kwargs`.
ENCODE_KWARGS = {'normalize_embeddings': True, 'prompt': ENCODE_PROMPTS['passage']}
# Collection metadata for Chroma. The "hnsw:space" value is also the key used to
# pick the function from the `distances` table when the node matrix is computed.
CHROMA_KWARGS = {"hnsw:space": "ip"}
# !!! ABOVE TO CHANGE !!!

# Derived output locations: everything for one run lives in ../data/<name>/<version>.
SAVE_DIR = f"../data/{DATA_NAME}/{DB_VERSION}"
DENSE_DB_SAVE_PATH = f'{SAVE_DIR}/densedb'
DB_LOG_PATH = f'{SAVE_DIR}/operation_info.json' 
NODE_DISTS_SAVE_PATH = f'{SAVE_DIR}/{NODES_DISTANCES_FILE}'

#### Preparing

In [4]:
# Load the embedding model from the local path with the device and encoding
# options chosen in the configuration cell.
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_PATH,
    model_kwargs=MODEL_KWARGS,
    encode_kwargs=ENCODE_KWARGS 
)
# Wrap it so it can be handed to Chroma as the collection's embedding function.
ef = MyEmbeddingFunction(embeddings)

No sentence-transformers model found with name /home/dzigen/Desktop/PersonalAI/Personal-AI/models/intfloat/multilingual-e5-small. Creating a new one with MEAN pooling.


In [5]:
# Open the on-disk Chroma store and get (or create) the collection.
# `ef` is attached to the collection, and the later `collection.add` call passes
# only texts, no precomputed vectors.
client = chromadb.PersistentClient(path=DENSE_DB_SAVE_PATH)
collection = client.get_or_create_collection(name=DATA_NAME,  metadata=CHROMA_KWARGS, 
                                             embedding_function=ef)

In [6]:
# Connect to the Neo4j instance configured in the first cell.
conn = Neo4jConnection(uri=NEO4J_URL, user=NEO4J_USER, pwd=NEO4J_PWD)

In [7]:
# Fetch every node of the graph (the MATCH pattern has no label or filter).
output = conn.execute_query("MATCH (n) RETURN n", db=GRAPH_DB_NAME)

In [8]:
# One (element id, name) pair per record returned by the node query.
# presumably every node carries a `name` property — verify against the graph schema.
graph_entities = [(record['n'].element_id, record['n']['name']) for record in output]

# The node names are the texts to embed; the element id travels along as metadata.
documents = [node_name for node_id, node_name in graph_entities]
metadata = [{'node_id': node_id} for node_id, node_name in graph_entities]

In [9]:
# The node ids and names have been copied into plain Python lists, so the
# connection can be closed.
conn.close()

#### Vectorizing nodes

In [10]:
# Start the wall-clock timer for the embedding + insertion step.
vectorize_t_start = time()

# Each node is stored under its own element id; only texts are passed, no
# precomputed vectors.
collection.add(
    ids=[entry['node_id'] for entry in metadata],
    documents=documents,
    metadatas=metadata,
)

# Elapsed seconds, rounded to 5 decimals; written to the operation log later.
VECTORIZE_ELAPSED_TIME = round(time() - vectorize_t_start, 5)

In [11]:
# Sanity check: number of items now stored in the collection.
collection.count()

1604

#### Calculating distances between nodes

In [12]:
# Distance functions keyed by the value of CHROMA_KWARGS["hnsw:space"], so the
# node-to-node matrix is computed in the same space the collection is configured
# with. Each entry takes two 1-D vectors and returns a scalar.
#
# 'cosine' is included so that switching the collection to the cosine space does
# not fail with a KeyError when the matrix is built.
#
# NOTE(review): Chroma documents its "l2" space as the *squared* L2 norm, while
# `distance.euclidean` is the square-rooted norm — confirm which one the
# consumers of the matrix expect before relying on the 'l2' entry.
distances = {
    'l2': distance.euclidean,
    'ip': lambda v1, v2: 1- np.dot(v1, v2),
    'cosine': distance.cosine,
}

In [13]:
# Container for the pairwise distances:
#   ID_TO_INDEX_MAP - node id -> row/column position in MATRIX
#   INDEX_TO_ID_MAP - the inverse lookup
#   MATRIX          - square float array, initially all zeros
# Positions follow the order of `metadata`.
DISTANCES_INFO = {
    'ID_TO_INDEX_MAP': {entry['node_id']: position for position, entry in enumerate(metadata)},
    'INDEX_TO_ID_MAP': dict(enumerate(entry['node_id'] for entry in metadata)),
    'MATRIX': np.zeros((len(metadata), len(metadata)))
}

In [14]:
# Read the stored items back together with their embedding vectors; the distance
# loop below uses data['ids'] and data['embeddings'] position by position.
data = collection.get(include=['embeddings'])

In [15]:
# Fill the pairwise distance matrix for every unordered pair of stored nodes.
# The diagonal is never written and keeps its initial 0.0.
#
# Lookups that do not change between iterations are resolved once here instead of
# inside the inner loop, which runs n*(n-1)/2 times.
dist_fn = distances[CHROMA_KWARGS["hnsw:space"]]
id_to_index = DISTANCES_INFO['ID_TO_INDEX_MAP']
dist_matrix = DISTANCES_INFO['MATRIX']  # same array object, so writes land in DISTANCES_INFO
stored_ids = data['ids']
stored_embs = data['embeddings']

for i in tqdm(range(len(stored_ids)-1)):
    # Matrix positions come from the id map, not from the order Chroma returns items in.
    row = id_to_index[stored_ids[i]]
    n1_emb = stored_embs[i]
    for j in range(i+1, len(stored_ids)):
        col = id_to_index[stored_ids[j]]

        n12_dist = dist_fn(n1_emb, stored_embs[j])
        # Write the value to both (row, col) and (col, row) so the matrix stays symmetric.
        dist_matrix[row, col] = n12_dist
        dist_matrix[col, row] = n12_dist

  0%|          | 2/1603 [00:00<01:27, 18.29it/s]

100%|██████████| 1603/1603 [00:43<00:00, 37.21it/s] 


In [16]:
# Persist the id/index maps and the distance matrix together in one joblib file.
joblib.dump(DISTANCES_INFO, NODE_DISTS_SAVE_PATH)

['../data/vectorized_nodes/v7/nodes_distances_matrix']

#### Saving Log

In [17]:
# Record the settings of this run next to the generated data so the stored
# vectors can be traced back to the model and options that produced them.
with open(DB_LOG_PATH, 'w') as fd:
    run_summary = {
        "data_name": DATA_NAME,
        "graphdb_name": GRAPH_DB_NAME,
        "db_version": DB_VERSION,
        "model_name": EMBEDDING_MODEL_PATH,
        "encode_kwargs": ENCODE_KWARGS,
        "chroma_kwargs": CHROMA_KWARGS,
        "encode_prompts": ENCODE_PROMPTS,
        "vectorize_elapsed_sec_time": VECTORIZE_ELAPSED_TIME,
    }
    serialized_summary = json.dumps(run_summary, indent=1)
    fd.write(serialized_summary)