In [None]:
from datetime import datetime

from knowledge_graph.modules.document.document_layer import DocumentLayer
from knowledge_graph.modules.document.document_handler import DocumentHandler
from knowledge_graph.modules.entity.er_extractor import ERExtractor
from knowledge_graph.modules.entity.entity_layer import EntityLayer
from knowledge_graph.modules.node import EntityNode, ContentNode
from configuration.configurations import ERExtractorConfiguration
from configuration.llm_inference_configuration import APILLMConfiguration, LocalLLMConfiguration
from configuration.embedding_inference_configuration import APIEmbeddingModelConfiguration, LocalEmbeddingModelConfiguration
from llm.language_models.azure_gpt import AzureGPT
from llm.language_models.hf_local_model import HuggingfaceLocalInference
from embedding.embedding_models.hf_embedding import HFLocalEmbeddingModel 
from exception.entity_exception import EntityDuplicationInOneContentNodeError

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Model setup: each model is configured from a TOML file and then handed to its
# inference wrapper.
# NOTE(review): none of triplet_extraction_llm, conference_resolution_llm or
# judgement_llm is referenced by the later cells visible in this notebook (they
# use `azure_gpt`) — confirm whether this cell is still needed.

# Local embedding models
# NOTE(review): `emb_config` and `emb_model` are assigned again by the next setup
# cell, so the objects created here are replaced once that cell runs.
emb_config = LocalEmbeddingModelConfiguration()
emb_config.load(path='configuration/toml/intfloat_ml_e5.toml')
emb_model = HFLocalEmbeddingModel(embedding_model_config=emb_config)

# Local LLM models
triplet_extraction_llm_config = LocalLLMConfiguration()
triplet_extraction_llm_config.load(path='configuration/toml/triplex.toml')
triplet_extraction_llm = HuggingfaceLocalInference(llm_config=triplet_extraction_llm_config)

# NOTE(review): "conference_resolution" presumably means coreference resolution —
# confirm before renaming, since other cells may use these names.
conference_resolution_llm_config = LocalLLMConfiguration()
conference_resolution_llm_config.load(path='configuration/toml/gemma2_ft.toml')
conference_resolution_llm = HuggingfaceLocalInference(llm_config=conference_resolution_llm_config)

# API LLM models
judgement_llm_config = APILLMConfiguration()
judgement_llm_config.load(path='configuration/toml/gpt_4o.toml')
judgement_llm = AzureGPT(llm_config=judgement_llm_config)

In [2]:
# Load the embedding, ER-extractor, Azure GPT and graph configurations from TOML,
# then build the graph handles and wire the document, entity and community layers.
# NOTE(review): EmbeddingModelConfiguration, Neo4jConfiguration, HFEmbeddingModel,
# KnowledgeGraph, KnowledgeGraphDataScience, NodeMatcher, RelationshipMatcher and
# CommunityLayer are not imported by the import cell visible at the top of this
# notebook; on a fresh kernel this cell would raise NameError unless they are
# imported elsewhere — confirm and add the imports.
# NOTE(review): `emb_config` / `emb_model` rebind the names set by the previous cell.
emb_config = EmbeddingModelConfiguration()
er_config = ERExtractorConfiguration()
azure_gpt_config = APILLMConfiguration()
kg_config = Neo4jConfiguration()
emb_config.load("configuration/toml/hf_embedding.toml")
er_config.load("configuration/toml/extractor.toml")
azure_gpt_config.load("configuration/toml/azure_openai.toml")
kg_config.load("configuration/toml/neo4j.toml")

# Model objects shared by the layers below.
emb_model = HFEmbeddingModel(embedding_model_config=emb_config)
azure_gpt = AzureGPT(azure_gpt_config)
entity_extractor = ERExtractor(azure_gpt, er_config)

# Graph handles: both are built from the same kg_config; the matchers wrap the graph.
knowledge_graph = KnowledgeGraph(kg_config)
graph_ds = KnowledgeGraphDataScience(kg_config)
node_matcher = NodeMatcher(knowledge_graph)
rel_matcher = RelationshipMatcher(knowledge_graph)

# Every layer shares the same knowledge_graph and embedding model.
document_layer = DocumentLayer(knowledge_graph)
document_layer.load_embedding_model(emb_model)
entity_layer = EntityLayer(knowledge_graph)
entity_layer.load_embedding_model(emb_model)
entity_layer.load_graph_ds(graph_ds)
# The community layer additionally receives both matchers and the Azure GPT client.
community_layer = CommunityLayer(knowledge_graph)
community_layer.load_node_matcher(node_matcher)
community_layer.load_rel_matcher(rel_matcher)
community_layer.load_llm(azure_gpt)
community_layer.load_embedding_model(emb_model)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Directory passed as `document_directory` for the "test copy" document node.
DOCUMENT_DIRPATH = r"data/processed/md/test copy"

# Each row is (document_id, title, attached_to, document_directory). Nodes are
# built in row order; all share version "1.0" and each gets its own
# str(datetime.now()) timestamp at construction time.
(
    document_node_1,
    document_node_2,
    document_node_3,
    document_node_4,
    document_node_5,
    document_node_6,
) = (
    DocumentNode(
        document_id=node_id,
        title=node_title,
        version="1.0",
        last_update=str(datetime.now()),
        attached_to=parent_ids,
        document_directory=directory,
    )
    for node_id, node_title, parent_ids, directory in (
        ("1", "draft_1", [], ""),
        ("2", "draft_2", ["1"], ""),
        ("3", "draft_3", ["1"], ""),
        ("4", "draft_4", ["1"], ""),
        ("5", "draft_5", ["3"], ""),
        ("6", "test copy", [], DOCUMENT_DIRPATH),
    )
)

# Graph ingestion steps, currently disabled (left commented out as in the original cell):
# document_layer.add_document_node_to_graph(document_node_1)
# document_layer.add_document_node_to_graph(document_node_2)
# document_layer.add_document_node_to_graph(document_node_3)
# document_layer.add_document_node_to_graph(document_node_4)
# document_layer.add_document_node_to_graph(document_node_5)
# document_layer.add_document_node_to_graph(document_node_6)

# document_layer.create_document_tree(document_node=document_node_6)

In [None]:
# Attach the ER extractor to the entity layer, then run entity/relationship
# processing on every content node of document_node_6.
entity_layer.load_er_extractor(entity_extractor)

content_nodes = document_layer.get_content_nodes_by_document(document_node_6)

# One attempt per content node (num_trials=1).
# NOTE(review): EntityDuplicationInOneContentNodeError is imported at the top of the
# notebook but never handled in the visible cells — confirm whether
# er_process_content_node can raise it and whether the loop should catch it.
for content_node in content_nodes:
    entity_layer.er_process_content_node(content_node=content_node, num_trials=1)

In [None]:
# Ask the entity layer for similar entity nodes within document_node_6 and display them.
# The next cell iterates this result and passes each element as a list of entity node ids.
similar_entities = entity_layer.find_similar_entity_nodes_in_given_document_node(document_node=document_node_6)
similar_entities

In [None]:
# Merge each group of similar entity nodes by id.
# NOTE(review): the effect of forced=True is not visible here — presumably it skips a
# confirmation or similarity check; verify against EntityLayer before re-running.
for group in similar_entities:
    entity_layer.merge_entity_nodes_from_id_list(entity_node_ids=group, forced=True)

In [None]:
# Run entity clustering on the entity layer.
# NOTE(review): presumably relies on the graph_ds handle loaded into entity_layer during setup — confirm.
entity_layer.entity_clustering()

In [3]:
# Create the community nodes, then call community_ranks() on the same layer.
# NOTE(review): this cell carries the execution count In [3], the same as the earlier
# document-node cell, so the notebook was run out of order — re-run top to bottom
# on a fresh kernel to confirm the sequence still works.
community_layer.create_community_nodes()
community_layer.community_ranks()

In [None]:
# Fetch the community records from the community layer and wrap each one in a
# CommunityNode (the records must be mappings, since they are unpacked with **).
_community_nodes = community_layer.get_community_nodes()
community_nodes = [CommunityNode(**raw_community_node) for raw_community_node in _community_nodes]

# `community_nodes` already holds CommunityNode instances, so iterate it directly.
# Re-wrapping each element with CommunityNode(**...) would try to unpack an
# instance as keyword arguments instead of the raw record.
for community_node in community_nodes:
    community_layer.community_node_info_aggregation(community_node=community_node)

## Retrieval

In [7]:
# Retrieval settings: number of community nodes to keep, and the question to answer.
top_k_retrieval = 5
user_query = "What happened to deep learning at 1990s?"

# Embed the question with the embedding model that was loaded into the graph layers.
user_query_embedding = emb_model.encode(user_query)

In [9]:
# Rank the "__Community__" nodes matched with level=level against the user query
# and keep the best `top_k_retrieval` of them.
level = 1
community_nodes_by_level = node_matcher.match("__Community__", level=level).all()

# Score of a node = query embedding @ the node's stored 'vector_emb'.
similar_summary_comparison = [
    user_query_embedding @ node['vector_emb']
    for node in community_nodes_by_level
]

# Pair every score with its node, sort by score only (highest first), then truncate.
top_k_community_nodes = sorted(
    zip(similar_summary_comparison, community_nodes_by_level),
    key=lambda scored: scored[0],
    reverse=True,
)[:top_k_retrieval]

# community_id -> summary_content, in ranking order.
top_k_community_nodes_id_content = {
    node['community_id']: node['summary_content']
    for _score, node in top_k_community_nodes
}

top_k_community_nodes_id_content

{'1-4': 'Deep learning, a subset of machine learning that utilizes neural networks with multiple layers, has seen significant advancements thanks to the use of graphical processing units (GPUs). Initially developed to accelerate graphics processing for computer games, GPUs are optimized for high throughput matrix-vector products. This optimization has proven to be a game changer, making deep learning tasks feasible and more efficient.',
 '1-9': 'Traditional methods in data science, such as linear and kernel methods, are grounded in convex optimizations. Linear methods and kernel methods are specific types of traditional methods that leverage these mathematical techniques to solve various problems. However, with the advent of deep models, which are neural networks with many layers, there has been a significant shift in performance capabilities. When provided with large amounts of data, deep models can substantially outperform traditional methods, showcasing their advanced capabilities i

In [None]:
# NOTE(review): `text` from cgitb is not referenced by any cell visible in this
# notebook, and the cgitb module was removed from the standard library in
# Python 3.13 — this looks like an accidental editor auto-import; confirm and drop it.
from cgitb import text
import json
from nlp.basic import string_similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity between two embedding vectors
def embedding_similarity(embedding1: list, embedding2: list) -> float:
    """Return the cosine similarity of two embeddings as a scalar.

    Each embedding is copied into a NumPy array and reshaped to a single row,
    because ``cosine_similarity`` works on 2D inputs.
    """
    row_a, row_b = (
        np.array(vector).reshape(1, -1) for vector in (embedding1, embedding2)
    )

    # The pairwise result for two single-row inputs is a 1x1 matrix; unwrap its only entry.
    pairwise = cosine_similarity(row_a, row_b)
    return pairwise[0][0]

# Ask the GPT model to list the entities mentioned in the user query.
# The conversation is a system instruction followed by the user request.
entity_extraction_message = [
    dict(
        role="system",
        content="You are a specialist in linguistics, assisting with entity and relationship extraction for the construction of a knowledge graph.",
    ),
    dict(
        role="user",
        content=f"Extract all the entities from the given text, output in JSON format ({{'entities': [List of extracted entities]}}): {user_query}",
    ),
]

raw_response = azure_gpt.chat(entity_extraction_message)

# Keep only the first response text, trimmed of surrounding whitespace.
text_response = azure_gpt.get_response_texts(raw_response)[0].strip()

In [None]:
# Parse the model reply; the prompt asks for a JSON object with an 'entities' list.
json_output = json.loads(text_response)
extracted_entities = json_output['entities']

# Retrieve all entity nodes from Neo4j graph
raw_entity_nodes = node_matcher.match("__Entity__").all()

# Wrap every raw record in an EntityNode once, up front. The wrappers do not
# depend on the extracted entity, so rebuilding them inside the matching loop
# repeated the same work for every extracted entity.
entity_nodes = [EntityNode(**raw_entity_node) for raw_entity_node in raw_entity_nodes]

# Weights for name similarity and embedding similarity (you can adjust these based on importance)
name_weight = 0.5
embedding_weight = 0.5

# A best match is reported only when its combined score exceeds this value.
similarity_threshold = 0.85

# For each extracted entity, find the single most similar entity node in the graph
similar_entity_nodes = []

for extracted_entity in extracted_entities:
    best_match = None
    best_combined_similarity = 0.0

    # Embed the extracted entity text with the same model used elsewhere in the notebook
    extracted_entity_embedding = emb_model.encode(extracted_entity)

    for entity_node in entity_nodes:
        entity_name = entity_node.name
        entity_name_embedding = entity_node.name_emb

        # String similarity between the extracted entity and the node's name
        name_similarity = string_similarity(extracted_entity, entity_name)

        # Cosine similarity between the extracted entity's embedding and the node's name embedding
        embedding_similarity_score = embedding_similarity(extracted_entity_embedding, entity_name_embedding)

        # Weighted sum of the two similarities
        combined_similarity = (name_similarity * name_weight) + (embedding_similarity_score * embedding_weight)

        # Strictly greater: on a tie the earlier node is kept
        if combined_similarity > best_combined_similarity:
            best_combined_similarity = combined_similarity
            best_match = entity_node

    # Compare against None explicitly so the check does not depend on how
    # EntityNode defines truthiness.
    if best_match is not None and best_combined_similarity > similarity_threshold:
        similar_entity_nodes.append({
            'extracted_entity': extracted_entity,
            'similar_entity_node': best_match,
            'combined_similarity_score': best_combined_similarity
        })

# Output the results
for match in similar_entity_nodes:
    print(f"Extracted Entity: {match['extracted_entity']}")
    print(f"Most Similar Entity in Graph: {match['similar_entity_node'].name}")
    print(f"Combined Similarity Score: {match['combined_similarity_score']:.2f}")