In [1]:
# Author:  Jinkai Wang
import networkx as nx
import random




def directed_random_walk_sampling(graph, seed_nodes, walk_length=10, sample_size=20000, p_restart=0.7):
    """Sample a subgraph of a directed graph using random walks with restart.

    One walk of at most ``walk_length`` steps is started from every seed, in
    order. At each step the walker either jumps to a uniformly chosen seed
    (with probability ``p_restart``) or follows a uniformly chosen outgoing
    edge of the current node. A walk ends early when the current node has no
    successors, and the whole sampling stops as soon as ``sample_size`` nodes
    have been collected.

    Args:
        graph: Directed graph exposing ``successors(node)`` and
            ``subgraph(nodes)`` (e.g. ``networkx.DiGraph``).
        seed_nodes: Iterable of start nodes (list, tuple, set, generator...).
            Every seed is part of the sample from the start, so the result
            may hold more than ``sample_size`` nodes if there are that many
            seeds.
        walk_length: Maximum number of steps per walk.
        sample_size: Node count at which sampling stops.
        p_restart: Probability, at each step, of jumping to a random seed
            instead of following an outgoing edge.

    Returns:
        ``graph.subgraph(sampled_nodes)`` for the set of visited nodes.
    """
    # Materialise the seeds once: random.choice() needs an indexable
    # sequence, and a one-shot iterator would otherwise be exhausted by the
    # set() call below and leave nothing to walk from.
    seeds = list(seed_nodes)
    sampled_nodes = set(seeds)

    for start in seeds:
        current_node = start
        for _ in range(walk_length):
            if random.random() < p_restart:
                # Restart: jump to any seed, not necessarily this walk's own.
                current_node = random.choice(seeds)
            else:
                # Only outgoing edges are followed.
                neighbors = list(graph.successors(current_node))
                if not neighbors:
                    # Dead end: abandon this walk and move on to the next seed.
                    break
                current_node = random.choice(neighbors)

            sampled_nodes.add(current_node)
            if len(sampled_nodes) >= sample_size:
                break
        # Checked again here so that reaching the target also stops the
        # iteration over the remaining seeds.
        if len(sampled_nodes) >= sample_size:
            break

    return graph.subgraph(sampled_nodes)

# Load the training file as a directed graph. The first two columns of each
# line become the edge endpoints; the third is stored on the edge under the
# "relation" key.
G = nx.read_edgelist("../data/YAGO43kET/KG_train.txt", create_using=nx.DiGraph(), nodetype=str, data=[("relation", str)])

# Number of top out-degree nodes ("hubs") that are always used as seeds.
num_hubs = 300
high_outdegree_nodes = sorted(G.out_degree, key=lambda x: x[1], reverse=True)[:num_hubs]
hub_seeds = [node for node, _ in high_outdegree_nodes]
print(hub_seeds)

# Only nodes with at least one outgoing edge are useful walk starting points.
# Reading the degree view avoids building a successor list for every node.
valid_seeds = [node for node, out_deg in G.out_degree() if out_deg > 0]
# Number of additional random starting points; capped at the population size
# because random.sample() raises ValueError when asked for more than exist.
seed_nodes1 = random.sample(valid_seeds, min(3000, len(valid_seeds)))
# The final seed list is the hubs followed by the random seeds. Every seed
# ends up in the sample, so both counts can be tuned to control its size.
seed_nodes = hub_seeds + seed_nodes1

# Sampling parameters can be adjusted here.
G_sampled = directed_random_walk_sampling(G, seed_nodes, walk_length=10, sample_size=20000, p_restart=0.7)
# Author's notes: a larger walk_length gives a more global sample; p_restart
# lets the walker leave when it hits a finished path or an entity with many
# outgoing edges.

print(f"Graph sampled has {G_sampled.number_of_nodes()} nodes and {G_sampled.number_of_edges()} edges")
print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")



def save_triplets_to_txt(edges, file_path="triplets.txt"):
    """Write triples to a UTF-8 text file, one tab-separated triple per line.

    Args:
        edges: Iterable of 3-item tuples. Each is unpacked as
            ``(head, relation, tail)`` and written in that order.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode="w", encoding="utf-8") as out:
        out.writelines(
            f"{head}\t{relation}\t{tail}\n" for head, relation, tail in edges
        )

# Flatten every sampled edge into a (source, target, "relation" attribute)
# tuple and write them with the default output path.
# NOTE(review): save_triplets_to_txt unpacks each tuple as
# (head, relation, tail), so the names do not line up with this ordering.
# The written columns keep the same order as the input file's columns;
# confirm that this is the intended output layout.
edges = [
    (source, target, attrs["relation"])
    for source, target, attrs in G_sampled.edges(data=True)
]

save_triplets_to_txt(edges)

['Henry_Fonda', 'Germany', 'Belgium', 'Spain', 'Jean-Luc_Godard', 'Austria', 'Murray_Gell-Mann', 'Israel', 'Paul_Erdős', 'United_Kingdom', 'Cuba', 'Mauritania', 'Friedrich_Hirzebruch', 'Vannevar_Bush', 'Niels_Bohr', 'Otto_Hahn', 'Denmark', 'Lev_Landau', 'Pakistan', 'Steven_Spielberg', 'Robert_Solow', 'Switzerland', 'Hans_Bethe', 'Wassily_Leontief', 'Russia', 'Egypt', 'Croatia', 'Douglas_Engelbart', 'Ivory_Coast', 'Finland', 'Henri_Poincaré', 'France', 'Jacques_Chaban-Delmas', 'Kenneth_G._Wilson', 'Richard_Feynman', 'Czech_Republic', 'Nawaz_Sharif', 'Paul_Dirac', 'John_F._Kennedy', 'Portugal', 'Eugene_Wigner', 'Francis_Ford_Coppola', 'Burundi', 'Wolfgang_Pauli', 'Indonesia', 'Carl_Sagan', 'United_Arab_Emirates', 'Oman', 'El_Salvador', 'André_Weil', 'Rwanda', 'Gerald_Ford', 'Carl_Friedrich_Gauss', 'Sweden', 'Namibia', 'Aníbal_Cavaco_Silva', 'Luxembourg', 'Afghanistan', 'Zimbabwe', 'Seymour_Benzer', 'Justus_von_Liebig', 'Gustav_Kirchhoff', 'Paul_Samuelson', 'Konrad_Adenauer', 'Lazarus_Fuc