In [4]:
import pandas as pd
df=pd.read_pickle("rebel_entities.pkl")
print(list(df)[:5])

['Q3285447', 'Q42317667', 'Q7068799', 'Q5443871', 'Q7918771']


In [10]:
df=pd.read_pickle("rebel_relations.pkl")
print(list(df)[:5]
      )

['P103', 'P1571', 'P485', 'P571', 'P6438']


In [8]:
df=pd.read_pickle("mapping.pkl")
print(df[:5])

TypeError: unhashable type: 'slice'

In [None]:
import pickle
import random
import numpy as np
from collections import Counter
from math import pow
import networkx as nx  # Import NetworkX

# --- Load NetworkX graph from 'final_graph.pkl' ---
try:
    with open('final_graph.pkl', 'rb') as file:
        # G is now a NetworkX Graph object
        G = pickle.load(file)
    
    if not isinstance(G, nx.Graph):
        print("Warning: The loaded object may not be a NetworkX graph.")

    print(f"Successfully loaded NetworkX graph from 'final_graph.pkl'.")
    print(f"The graph contains {G.number_of_nodes():,} nodes and {G.number_of_edges():,} edges.")

except FileNotFoundError:
    print("Error: 'final_graph.pkl' file not found.")
    print("Please ensure the file is in the same directory as your Python script.")
    exit()
except Exception as e:
    print(f"An error occurred while loading or processing the file: {e}")
    exit()

# --- Extract triples ---

RELATION_ATTRIBUTE_KEY = 'relation'

print(f"Extracting triples from the NetworkX graph (relation attribute key: '{RELATION_ATTRIBUTE_KEY}')...")
KG_triplets = []  # List to store all triples
KG_dict = {}      # Dictionary for fast lookup

# Iterate over each edge in the graph to construct triples
# G.edges(data=True) returns (source node, target node, attribute dictionary)
for u, v, data in G.edges(data=True):
    # Retrieve the relation from the attribute dictionary, if it exists
    relation = data.get(RELATION_ATTRIBUTE_KEY)
    if relation:
        # Create a triple (subject, predicate, object)
        triplet = (u, relation, v)
        KG_triplets.append(triplet)
        
        # Populate KG_dict for fast lookup
        s, p, o = triplet
        if s not in KG_dict:
            KG_dict[s] = []
        if o not in KG_dict:
            KG_dict[o] = []
        KG_dict[s].append(triplet)
        KG_dict[o].append(triplet)

if not KG_triplets:
    print("Error: Failed to extract any triples from the graph.")
    print(f"Please check whether the edges in your NetworkX graph contain the attribute '{RELATION_ATTRIBUTE_KEY}'.")
    exit()

print(f"Extraction complete. Obtained {len(KG_triplets):,} valid triples.")

# --- Step 1: Function to select a “starting” triplet (no modification required) ---
def get_start_triplet(kg_triplets_list, dampening_factor=0.01):
    relations = [p for _, p, _ in kg_triplets_list]
    if not relations:
        raise ValueError("The knowledge graph is empty or contains no relations.")
    
    relation_counts = Counter(relations)
    relations_unique = list(relation_counts.keys())
    weights = [pow(relation_counts[p], dampening_factor) for p in relations_unique]
    chosen_relation = random.choices(relations_unique, weights=weights, k=1)[0]
    possible_triplets = [triplet for triplet in kg_triplets_list if triplet[1] == chosen_relation]
    
    return random.choice(possible_triplets)

# --- Step 2: Function to build a “coherent triplet set” (no modification required) ---
def sample_coherent_triplet_set(kg_lookup_dict, start_triplet, bias_factor=7.0, mean_size=3.0):
    num_triplets = np.random.poisson(lam=mean_size)
    if num_triplets == 0:
        return []

    triplet_set = [start_triplet]
    entities_in_set = {start_triplet[0], start_triplet[2]}

    for r in range(2, num_triplets + 1):
        if not entities_in_set:
            break
        anchor_entity = random.choice(list(entities_in_set))
        
        candidate_triplets = [
            triplet for triplet in kg_lookup_dict.get(anchor_entity, [])
            if triplet not in triplet_set
        ]

        if not candidate_triplets:
            continue

        weights = []
        N = len(entities_in_set)
        
        for triplet in candidate_triplets:
            other_entity = triplet[0] if triplet[2] == anchor_entity else triplet[2]
            
            if other_entity in entities_in_set:
                base = N + 1 - r
                weight = pow(base, bias_factor) if base > 0 else 0
            else:
                weight = 1.0
            weights.append(weight)

        if sum(weights) == 0:
            continue
        
        chosen_triplet = random.choices(candidate_triplets, weights=weights, k=1)[0]
        
        triplet_set.append(chosen_triplet)
        entities_in_set.add(chosen_triplet[0])
        entities_in_set.add(chosen_triplet[2])
        
    return triplet_set

# --- Main program: Run sampling using data converted from the NetworkX graph ---
if __name__ == "__main__":
    BIAS_FACTOR = 7.0
    DAMPENING_FACTOR = 0.01
    MEAN_TRIPLET_SET_SIZE = 3.0
    
    print("-" * 30)

    # 1. Select a starting triplet from the extracted triplet list
    print("Sampling starting triplet...")
    start_triplet = get_start_triplet(KG_triplets, dampening_factor=DAMPENING_FACTOR)
    print(f"Randomly selected starting triplet: {start_triplet}")
    print("-" * 30)

    # 2. Build a coherent triplet set based on the starting point and lookup dictionary
    print("Generating coherent triplet set...")
    coherent_set = sample_coherent_triplet_set(
        kg_lookup_dict=KG_dict,
        start_triplet=start_triplet,
        bias_factor=BIAS_FACTOR,
        mean_size=MEAN_TRIPLET_SET_SIZE
    )

    print(f"\nGenerated coherent triplet set of size {len(coherent_set)}:")
    if not coherent_set:
        print("  (The sampled set is empty this time.)")
    else:
        for triplet in coherent_set:
            print(f"  {triplet}")
