In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from node2vec import Node2Vec

# Load the KG

In [2]:
# Location of the MetaQA files, relative to this notebook.
path = '../Datasets/MetaQA_dataset/'

# kb.txt is read as pipe-separated triples: entity1|relation|entity2 (no header row).
# Every column holds identifiers, so all of them are read as plain text:
#   - dtype=str keeps numeric-looking values as strings instead of letting
#     pandas infer a numeric dtype for them;
#   - keep_default_na=False stops values such as "None", "NA" or "null" from
#     being converted to NaN, which would merge distinct entities into one
#     missing value once the graph is built.
df = pd.read_csv(
    path + 'kb.txt',
    sep='|',
    header=None,
    names=['entity1', 'relation', 'entity2'],
    dtype=str,
    keep_default_na=False,
)
df.head()

Unnamed: 0,entity1,relation,entity2
0,Kismet,directed_by,William Dieterle
1,Kismet,written_by,Edward Knoblock
2,Kismet,starred_actors,Marlene Dietrich
3,Kismet,starred_actors,Edward Arnold
4,Kismet,starred_actors,Ronald Colman


In [3]:
# Keep only the first occurrence of each (entity1, relation, entity2) row.
df_unique = df[~df.duplicated(keep='first')]

# Number of distinct triples that remain.
df_unique.shape[0]

133582

Note: The NetworkX graph is built from the edge list only, so entities that are listed in kb_entity_dict.txt but have no links are not added as nodes. I am assuming that such isolated nodes would not yield useful embeddings anyway, because the embeddings are derived from graph topology.

In [4]:
# Build an undirected multigraph from the de-duplicated triples. A multigraph
# allows several parallel edges between the same pair of entities; each edge
# stores its relation name in the 'relation' attribute.
# For a directed variant, pass nx.MultiDiGraph() as create_using instead.
G = nx.from_pandas_edgelist(
    df_unique,
    'entity1',
    'entity2',
    edge_attr='relation',
    create_using=nx.MultiGraph(),
)

In [5]:
# Gather the basic size statistics of the graph first, then report them.
num_nodes = len(G)
num_edges = G.number_of_edges()

# Collect the 'relation' label of every edge that carries one.
distinct_relations = {
    attrs['relation']
    for _, _, attrs in G.edges(data=True)
    if 'relation' in attrs
}

print(f"Number of entities: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Number of distinct relations: {len(distinct_relations)}")
print("Distinct relations:", distinct_relations)

Number of entities: 43234
Number of edges: 133582
Number of distinct relations: 9
Distinct relations: {'has_tags', 'has_imdb_rating', 'written_by', 'has_genre', 'directed_by', 'has_imdb_votes', 'in_language', 'starred_actors', 'release_year'}


# Node2Vec Embeddings

In [6]:
# Set up node2vec on G: 64-dimensional embeddings, 100 walks per node with a
# walk length of 10, using 4 workers.
# NOTE(review): no random seed is passed, so a re-run will presumably yield
# different walks and embeddings — confirm whether reproducibility matters.
node2vec = Node2Vec(
    G,
    dimensions=64,
    num_walks=100,
    walk_length=10,
    workers=4,
)

# Train the embedding model on the sampled walks.
# NOTE(review): batch_words=4 is smaller than a single walk (length 10);
# confirm this value is intentional and not just carried over from an example.
model = node2vec.fit(
    batch_words=4,
    min_count=1,
    window=10,
)

Computing transition probabilities:   0%|          | 0/43234 [00:00<?, ?it/s]

In [7]:
# Output files for this run ('ud_' prefix, presumably "undirected").
# For a directed-graph run, use 'd_node2vec_embeddings.txt' and
# 'd_node2vec_model.model' instead.
embeddings_file = 'ud_node2vec_embeddings.txt'
model_file = 'ud_node2vec_model.model'

# Export only the node vectors in word2vec format.
model.wv.save_word2vec_format(embeddings_file)

# Persist the full trained model so it can be reloaded later.
model.save(model_file)

# Load the saved model

In [6]:
from gensim.models import Word2Vec

In [7]:
# Reload the trained model from disk; switch the file name to
# 'd_node2vec_model.model' to load the directed-graph variant instead.
saved_model_path = 'ud_node2vec_model.model'
model = Word2Vec.load(saved_model_path)

In [8]:
# Retrieve the ten nodes whose embeddings are closest to the query node.
# Keys are case-sensitive: the lower-case 'ginger rogers' is a different node
# from 'Ginger Rogers', which itself shows up among the results.
# NOTE(review): presumably the lower-case form is a tag entity — confirm that
# this is the node intended for the query.
query_node = 'ginger rogers'
model.wv.most_similar(positive=[query_node], topn=10)

[('Kitty Foyle', 0.8982051014900208),
 ('Christopher Morley', 0.8923709392547607),
 ('The Barkleys of Broadway', 0.8849673867225647),
 ('Top Hat', 0.8806878328323364),
 ('Erik Rhodes', 0.8490828275680542),
 ('fred astaire', 0.8193020820617676),
 ('eric blore', 0.8133012056350708),
 ('Ginger Rogers', 0.8102369904518127),
 ('Fred Astaire', 0.8049663305282593),
 ('James Craig', 0.789457380771637)]