### Nikolaos Giannopoulos AM 5199
### Team: Trump Tariffed My Datasets

In [1]:
import pickle
from karateclub import Walklets
import numpy as np
import networkx as nx
import time
from itertools import combinations

# Load the pickled author data (presumably written by an earlier preprocessing
# step -- verify the provenance of this file).
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only open pickle files that come from a trusted source.
with open('Data/authors_preprocessing.pkl', mode='rb') as f:
    raw_author_dict = pickle.load(f)

# Keep only the dictionary values, in the dictionary's own order. According to
# the original note, each value is a string representing a list of authors --
# TODO confirm against the preprocessing code.
raw_author_lists = [*raw_author_dict.values()]

In [4]:
# Build the citation network from the comma-separated edge list. Node labels
# are parsed as integers and every edge is kept directed (source -> target).
G_citation = nx.read_edgelist(
    'edgelist.txt',
    nodetype=int,
    delimiter=',',
    create_using=nx.DiGraph(),  # citations are directional
)

# Quick sanity check: overall size plus the first few edges.
print(f"Citation Graph: {G_citation.number_of_nodes()} papers, {G_citation.number_of_edges()} citations")
print("Sample edges:", list(G_citation.edges())[:5])

Citation Graph: 138499 papers, 1091955 citations
Sample edges: [(0, 1), (0, 2), (1, 3), (1, 5), (1, 6)]


In [8]:
# Start a wall-clock timer. The elapsed time printed at the end of this cell
# therefore covers reading the edge list, fitting the model and saving the result.
start_time = time.time()

def return_embeddings(G, model, parameters):
    """Fit an embedding model on a graph and return the learned embedding.

    Args:
        G: Graph object handed straight to the estimator's ``fit`` method.
        model: Estimator class (not an instance); it is instantiated here
            with ``parameters`` as keyword arguments.
        parameters: Mapping of constructor keyword arguments for ``model``.

    Returns:
        Whatever the fitted estimator's ``get_embedding()`` returns.
    """
    estimator = model(**parameters)
    estimator.fit(G)
    embedding = estimator.get_embedding()
    return embedding
# Hyperparameters passed unchanged to the Walklets constructor.
# NOTE(review): Walklets is believed to produce one `dimensions`-wide embedding
# per window size and concatenate them, so the saved array may be wider than
# 64 columns -- confirm against the Karate Club documentation.
parameters = {
    'walk_number': 10,
    'walk_length': 80,
    'dimensions': 64,
    'window_size': 5,
    'workers': 10,
}
print('Learning embedding with parameters')
print(parameters)

# Read the edge list into a fresh directed graph for the embedding step (the
# same call as the earlier cell that builds G_citation).
# NOTE(review): Karate Club describes its estimators as expecting undirected
# graphs -- confirm that walking only along citation direction is intended.
G = nx.read_edgelist(
    'edgelist.txt',
    nodetype=int,
    delimiter=',',
    create_using=nx.DiGraph(),
)

# Learn the node embeddings. This is the expensive step: the recorded run
# below took roughly 554 seconds in total for this cell.
embeddings = return_embeddings(G, Walklets, parameters)

# Encode the main hyperparameters in the file name so that different runs do
# not overwrite each other ('workers' is deliberately not part of the name).
output_path = 'embedding_Walklets_edgelist_wn{:d}_wl{:d}_d{:d}_ws{:d}.npy'.format(
    *(parameters[key] for key in ('walk_number', 'walk_length', 'dimensions', 'window_size'))
)
np.save(output_path, embeddings)

print(f"Embedding learned and saved to {output_path}")
print("--- %s seconds ---" % (time.time() - start_time))

Learning embedding with parameters
{'walk_number': 10, 'walk_length': 80, 'dimensions': 64, 'window_size': 5, 'workers': 10}
Embedding learned and saved to embedding_Walklets_edgelist_wn10_wl80_d64_ws5.npy
--- 554.4703607559204 seconds ---
