In [1]:
import sys 
sys.path.insert(0, "../")

from src.neo4j_functions import Neo4jConnection
from src import ReaderMetrics

import json
import requests
from langchain_huggingface import HuggingFaceEmbeddings
import os
from typing import List, Dict, Tuple
from time import sleep, time
from tqdm import tqdm
import numpy as np
import joblib
import gc
from collections import Counter
import torch
import chromadb
from scipy.spatial import distance

NEO4J_URL ="bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PWD = "neo4j"
NEO4J_DBNAME = "neo4j"

####

SAVE_GRAPH_SHORT_PATHS_DIR = '../data/graph_short_paths'
STAGE1_DIR = 'stage1'
STAGE2_DIR = 'stage2'

METADATA_FILENAME = 'metadata.json'
ADJM_FILENAME = 'adjency_matrix'
DISTM_FILENAME = 'distances_matrix'

In [2]:
def check_create_dir(new_dir_path: str):
    if os.path.exists(new_dir_path):
        print("Директория существует")
        raise ValueError
    else:
        os.mkdir(new_dir_path)

def save_json(data: Dict[str, object], save_path: str):
    dump = json.dumps(data, ensure_ascii=False, indent=1)
    with open(save_path, 'w', encoding='utf-8') as fd:
        fd.write(dump)

#### Stage 1. Build adjency matrix

In [3]:
S1_META = {
    'NEO4J_URL': NEO4J_URL,
    'SAVE_ADJM_VERSION': 'v1'
}

save_log_dir = f"{SAVE_GRAPH_SHORT_PATHS_DIR}/{STAGE1_DIR}/{S1_META['SAVE_ADJM_VERSION']}"
save_metadata_file = f"{save_log_dir}/{METADATA_FILENAME}" 
save_adjm_file = f"{save_log_dir}/{ADJM_FILENAME}"

check_create_dir(save_log_dir)

In [4]:
GRAPH_MODEL = Neo4jConnection(uri=NEO4J_URL, user=NEO4J_USER, pwd=NEO4J_PWD)

In [5]:
nodes = GRAPH_MODEL.execute_query("MATCH (a) RETURN a",db=NEO4J_DBNAME)
nodes_id = list(map(lambda v: v['a'].element_id, nodes))
print(len(nodes))

1604


In [6]:
ADJENCY_M_INFO = {
    'INF': 1e5,
    'ID_TO_INDEX_MAP': {id: i for i, id in enumerate(nodes_id)},
    'INDEX_TO_ID_MAP': {i: id for i, id in enumerate(nodes_id)},
    'MATRIX': None
}
ADJENCY_M_INFO['MATRIX'] = np.full((len(nodes_id), len(nodes_id)), ADJENCY_M_INFO['INF'])
for i in range(len(ADJENCY_M_INFO['MATRIX'])):
    ADJENCY_M_INFO['MATRIX'][i][i] = 0

In [7]:
for base_idx, base_node_id in tqdm(ADJENCY_M_INFO['INDEX_TO_ID_MAP'].items()):
    adjenced_nodes = GRAPH_MODEL.execute_query(f'MATCH (a)-[rel]-(b) WHERE elementId(a) = "{base_node_id}" RETURN elementId(b)')
    for adj_node in adjenced_nodes:
        ADJENCY_M_INFO['MATRIX'][base_idx][ADJENCY_M_INFO['ID_TO_INDEX_MAP'][adj_node['elementId(b)']]] = 1
        ADJENCY_M_INFO['MATRIX'][ADJENCY_M_INFO['ID_TO_INDEX_MAP'][adj_node['elementId(b)']]][base_idx] = 1

100%|██████████| 1604/1604 [00:04<00:00, 336.95it/s]


In [8]:
joblib.dump(ADJENCY_M_INFO, save_adjm_file)
save_json(S1_META, save_metadata_file)

#### Stage 2. Find nodes short distances/paths

In [3]:
S2_META = {
    'LOAD_ADJM_VERSION': 'v1',
    'SAVE_DISTS_VERSION': 'v1'
}

save_log_dir = f"{SAVE_GRAPH_SHORT_PATHS_DIR}/{STAGE2_DIR}/{S2_META['SAVE_DISTS_VERSION']}"
save_metadata_file = f"{save_log_dir}/{METADATA_FILENAME}" 
save_dists_file = f"{save_log_dir}/{DISTM_FILENAME}"
load_adjm_file = f"{SAVE_GRAPH_SHORT_PATHS_DIR}/{STAGE1_DIR}/{S2_META['LOAD_ADJM_VERSION']}/{ADJM_FILENAME}"

check_create_dir(save_log_dir)

In [4]:
DISTS_INFO = joblib.load(load_adjm_file)
DISTS_INFO['PATHS_MATRIX'] = np.full((len(DISTS_INFO['MATRIX']), len(DISTS_INFO['MATRIX'])), None)

In [5]:
# Алгорити Флойда: находим длины кратчайших путей между вершинами
# https://neerc.ifmo.ru/wiki/index.php?title=%D0%90%D0%BB%D0%B3%D0%BE%D1%80%D0%B8%D1%82%D0%BC_%D0%A4%D0%BB%D0%BE%D0%B9%D0%B4%D0%B0
for k in tqdm(range(len(DISTS_INFO['MATRIX']))):
    for i in range(len(DISTS_INFO['MATRIX'])):
        for j in range(len(DISTS_INFO['MATRIX'])):
            if (DISTS_INFO['MATRIX'][i][k] < DISTS_INFO['INF']) and (DISTS_INFO['MATRIX'][k][j] < DISTS_INFO['INF']):
                if (DISTS_INFO['MATRIX'][i][k] + DISTS_INFO['MATRIX'][k][j]) < DISTS_INFO['MATRIX'][i][j]:
                    DISTS_INFO['MATRIX'][i][j] = DISTS_INFO['MATRIX'][i][k] + DISTS_INFO['MATRIX'][k][j]
                    DISTS_INFO['PATHS_MATRIX'][i][j] = k if DISTS_INFO['PATHS_MATRIX'][k][j] is None else DISTS_INFO['PATHS_MATRIX'][k][j] 

100%|██████████| 1604/1604 [1:08:06<00:00,  2.55s/it]


In [6]:
joblib.dump(DISTS_INFO, save_dists_file)
save_json(S2_META, save_metadata_file)

##### quick check

In [11]:
Counter(DISTS_INFO['MATRIX'].reshape(-1,)) # граф является связным

Counter({4.0: 981248,
         3.0: 855476,
         5.0: 394420,
         2.0: 255644,
         6.0: 73424,
         1.0: 10696,
         0.0: 1604,
         7.0: 278,
         8.0: 26})

In [17]:
start_idx, end_idx = 69, 67

Path = [] 
tmp_idx = end_idx
while tmp_idx is not None: 
    Path.append(tmp_idx) 
    tmp_idx = DISTS_INFO['PATHS_MATRIX'][start_idx][tmp_idx] 
Path = Path[::-1]
print(Path)

[68, 1392, 25, 67]
