In [3]:
from helpers import get_all_metadata, handle_empty
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import GATConv
from torch_geometric.datasets import CitationFull
from torch_geometric.data import Data, DataLoader
from torch.utils.data import Dataset, DataLoader as TorchDataLoader
import networkx as nx
import pickle
from tqdm import tqdm, trange
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
from sklearn.metrics import mean_absolute_percentage_error

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class GATRegression(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=4):
        super(GATRegression, self).__init__()
        self.gat1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
        self.gat2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False)
    def forward(self, x, edge_index):
        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = self.gat2(x, edge_index)
        return x

In [6]:
graph = pickle.load(open("../data/graph_medium.pickle", "rb"))

In [4]:
tfidf_vectorizer = TfidfVectorizer(max_features=100)
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
scaler = MinMaxScaler()

In [18]:
node_labels = dict()
for node in tqdm(graph.nodes):
    metadata = get_all_metadata(node)
    metadata["authors"] = ','.join(metadata["authors"]) if metadata["authors"] and len(metadata["authors"]) > 1 else metadata["authors"]
    node_labels[node] = metadata

100%|██████████| 17778/17778 [6:07:48<00:00,  1.24s/it]  


In [19]:
for elem in node_labels:
    if node_labels[elem]["authors"]:
        if len(node_labels[elem]["authors"]) == 1:
            node_labels[elem]["authors"] = node_labels[elem]["authors"][0]

In [20]:
titles = [handle_empty(node_labels[obj]["title"]) for obj in node_labels]
authors = [handle_empty(node_labels[obj]["authors"]) for obj in node_labels]
venues = [handle_empty(node_labels[obj]["venue"]) for obj in node_labels]
years = [node_labels[obj]["year"][0] if node_labels[obj]["year"] and node_labels[obj]["year"][0] else 0 for obj in node_labels]
abstracts = [handle_empty(node_labels[obj]["abstract"]) for obj in node_labels]
keywords = [handle_empty(node_labels[obj]["keywords"]) for obj in node_labels]


10430

In [21]:
np.savetxt("years.csv", years, delimiter=',')

In [22]:
title_embeddings = sentence_model.encode(titles)
author_embeddings = sentence_model.encode(authors)
venue_embeddings = sentence_model.encode(venues)
abstract_embeddings = sentence_model.encode(abstracts)
keyword_embeddings = tfidf_vectorizer.fit_transform(keywords).toarray()
years_normalized = scaler.fit_transform(np.array(years).reshape(-1, 1))

encoded_data = np.hstack([
    title_embeddings,
    author_embeddings,
    venue_embeddings,
    years_normalized,
    abstract_embeddings,
    keyword_embeddings
])
print(encoded_data.shape)

(17778, 1538)


In [23]:
encoded_data_no_year = np.hstack([
    title_embeddings,
    author_embeddings,
    venue_embeddings,
    abstract_embeddings,
    keyword_embeddings
])

In [24]:
np.savetxt("labels_no_year.csv", encoded_data_no_year, delimiter=',')

In [25]:
np.savetxt("labels.csv", encoded_data, delimiter=',')

In [3]:
encoded_data = np.genfromtxt("labels2.csv", delimiter=',')
encoded_data_no_year = np.genfromtxt("labels_no_year2.csv", delimiter=',')
years = np.genfromtxt("years2.csv", delimiter=',')

In [4]:
years=np.genfromtxt("years.csv", delimiter=',')

In [4]:
print(np.mean(np.array(years)))
print((np.array(years) == 2016).sum())

1105.0269434132074
29


In [27]:
encoded_data.shape

(17778, 1538)

In [8]:
node_list = list(graph.nodes())
node_mapping = {node: idx for idx, node in enumerate(node_list)}
graph = nx.relabel_nodes(graph, node_mapping)

In [9]:
years_dict = dict(enumerate(years))
nx.set_node_attributes(graph, years_dict, name="year")

In [10]:
from torch_geometric.utils import from_networkx

pyg_data = from_networkx(graph.to_undirected())


In [11]:
def create_dataset(g: nx.Graph, graph, node_labels, target_function):
    x = torch.tensor(node_labels, dtype=torch.float)
    edge_index = graph.edge_index
    y = torch.tensor(list(target_function(g).values()), dtype=torch.float)
    return Data(x=x, edge_index=edge_index, y=y)

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [12]:
device = torch.device("cpu")

In [13]:
dataset_pr = create_dataset(graph, pyg_data, encoded_data, nx.pagerank).to(device)
dataset_harmonic = create_dataset(graph, pyg_data, encoded_data, nx.harmonic_centrality).to(device)

In [13]:
test_nodes = [n for n, attr in graph.nodes(data=True) if attr.get('year') == 2016]
train_nodes = list(set(list(range(len(graph.nodes)))).difference(set(test_nodes)))

In [15]:
train_mask = (torch.zeros(len(graph.nodes), dtype=torch.bool).to(device))
train_mask[train_nodes] = True
test_mask = (torch.zeros(len(graph.nodes), dtype=torch.bool).to(device))
test_mask[test_nodes] = True

In [18]:
model = GATRegression(in_channels=1538, hidden_channels=2048, out_channels=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()

In [19]:
def train(epochs, data, model, optimizer, loss_fn, train_mask, test_mask):
    for epoch in trange(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index).squeeze()
        loss = loss_fn(out[train_mask], data.y[train_mask])
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0 or epoch == epochs - 1:
            print(f"Loss: {loss.item():.4f}")
    model.eval()
    with torch.no_grad():
        predictions = model(data.x, data.edge_index).squeeze()
        test_loss = loss_fn(predictions[test_mask], data.y[test_mask])
        mape = mean_absolute_percentage_error(np.array(data.y[test_mask]), np.array(predictions[test_mask]))
        print(f'Test Loss: MSE = {test_loss.item()}, MAPE = {mape}')

In [20]:
train(10, dataset_pr, model, optimizer, loss_fn, train_mask, test_mask)

  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:20<03:03, 20.41s/it]

Loss: 0.0009


100%|██████████| 10/10 [01:59<00:00, 11.93s/it]

Loss: 24.6703





Test Loss: MSE = 89.1614761352539, MAPE = 167157.125


  mape = mean_absolute_percentage_error(np.array(data.y[test_mask]), np.array(predictions[test_mask]))


In [23]:
torch.std(dataset_pr.y)

tensor(9.9862e-06)

In [21]:
train(10, dataset_harmonic, model, optimizer, loss_fn, train_mask, test_mask)

 10%|█         | 1/10 [00:10<01:32, 10.26s/it]

Loss: 211.6375


100%|██████████| 10/10 [02:14<00:00, 13.46s/it]

Loss: 75.7704





Test Loss: MSE = 107.71257781982422, MAPE = 5.880670547485352


  mape = mean_absolute_percentage_error(np.array(data.y[test_mask]), np.array(predictions[test_mask]))


In [22]:
torch.std(dataset_harmonic.y)

tensor(6.6315)

Second model using time series predictions

In [24]:
class GATEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, heads):
        super().__init__()
        self.conv1 = GATConv(input_dim, hidden_dim, heads=heads)
        self.conv2 = GATConv(hidden_dim*heads, hidden_dim, heads=1)
        
    def forward(self, x, edge_index):
        x = F.elu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

In [25]:
class TemporalModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])

In [26]:
class GraphTemporalPipeline(nn.Module):
    def __init__(self, gat_encoder, temporal_model):
        super().__init__()
        self.gat_encoder = gat_encoder
        self.temporal_model = temporal_model
        
    def forward(self, graphs):
        # Encode each graph
        embeddings = []
        for graph in graphs:
            emb = self.gat_encoder(graph.x, graph.edge_index)
            emb = emb.mean(dim=0)
            embeddings.append(emb)
        
        # Stack embeddings over time [num_years, num_nodes, embedding_dim]
        temporal_input = torch.stack(embeddings, dim=0).unsqueeze(0)
        
        # Predict centrality measures
        return self.temporal_model(temporal_input)

In [27]:
num_nodes = encoded_data_no_year.shape[0]
num_features = encoded_data_no_year.shape[1]
hidden_dim = 2048
num_heads = 1
years_range = list(range(2001, 2015))

In [None]:
gat = GATEncoder(num_features, hidden_dim, num_heads)
temporal = TemporalModel(hidden_dim, hidden_dim, num_nodes)
pipeline = GraphTemporalPipeline(gat, temporal)

In [29]:
def load_graph(year, graph, encodings, target_func):
    filtered_nodes = [n for n, attr in graph.nodes(data=True) if 0 < attr.get('year') <= year]
    filtered_edges = [(u, v) for u,v in graph.edges if u in filtered_nodes and v in filtered_nodes]
    subgraph = nx.Graph()
    subgraph.add_nodes_from(graph.nodes)
    subgraph.add_edges_from(filtered_edges)
    # subgraph = graph.subgraph(filtered_nodes)
    pyg_data = from_networkx(subgraph)
    x = torch.tensor(np.array(encodings), dtype=torch.float)
    edge_indices = pyg_data.edge_index
    y = torch.tensor(list(target_func(subgraph).values()), dtype=torch.float)
    return Data(x=x, edge_index=edge_indices, y=y)

In [30]:
graphs = [load_graph(year, graph, encoded_data_no_year, nx.pagerank) for year in years_range]
sequences = []
for i in range(len(graphs) - 1):
    sequence = graphs[:i+1]
    target = graphs[i+1]
    sequences.append((sequence, target))

In [31]:
graphs_harmonic = [load_graph(year, graph, encoded_data_no_year, nx.harmonic_centrality) for year in years_range]
sequences_harmonic = []
for i in range(len(graphs_harmonic) - 1):
    sequence = graphs_harmonic[:i+1]
    target = graphs_harmonic[i+1]
    sequences_harmonic.append((sequence, target))

In [32]:
optimizer = torch.optim.Adam(pipeline.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [33]:
# dataset_test = YearlyGraphDataset(years_range.append(2025), graph, encoded_data, nx.pagerank)
# historical_graphs = dataset_test.graphs[:-1]
# test_subgraph = dataset_test.graphs[-1]
# historical_embeds = [gat(g.x, g.edge_index) for g in historical_graphs]
# test_embed = gat(test_subgraph.x, test_subgraph.edge_index)

In [34]:
def train_lstm(epochs, seqs, model, optimizer, criterion):
    for _ in trange(epochs):
        for sequence, target in seqs:
            optimizer.zero_grad()
            preds = model(sequence)
            loss = criterion(preds, target.y.unsqueeze(0))
            loss.backward()
            optimizer.step()
        print(f"Loss: {loss.item():.4f}")

In [None]:
# def test(graph, target_func, model, dataset, loss_fn):
#     model.eval()
#     with torch.no_grad():
#         filtered_nodes = [n for n, attr in graph.nodes(data=True) if 0 < attr.get('year') <= 2025]
#         subgraph = graph.subgraph(filtered_nodes)
#         target_values = np.array(target_func(subgraph).values())[filtered_nodes]
#         predictions = model(dataset.x, dataset.edge_index).squeeze()[filtered_nodes]
#         res = loss_fn(predictions, target_values)
#     return res

In [None]:
train_lstm(10, sequences, pipeline, optimizer, criterion)

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
graphs.append(load_graph(2016, graph, encoded_data_no_year, nx.pagerank))
sequences.append([graphs[:-1], graphs[-1]])
pipeline.eval()
with torch.no_grad():
    preds = pipeline(sequences[-1][0])
    loss = criterion(preds, sequences[-1][1].y)
    print(f"Loss on test = {loss.item()}")

Loss on test = 6.088639281642827e-08


  return F.mse_loss(input, target, reduction=self.reduction)


In [None]:
torch.std(sequences[-1][1].y)

tensor(0.0005)

In [32]:
len(years)

10430

In [None]:
train_lstm(10, sequences_harmonic, pipeline, optimizer, criterion)

  2%|▎         | 1/40 [07:33<4:54:54, 453.70s/it]

Loss: 34592.9062


  5%|▌         | 2/40 [13:37<4:13:45, 400.66s/it]

Loss: 29877.0098


  8%|▊         | 3/40 [18:41<3:39:50, 356.50s/it]

Loss: 25632.2969


 10%|█         | 4/40 [23:36<3:19:24, 332.34s/it]

Loss: 21923.3477


 12%|█▎        | 5/40 [28:33<3:06:22, 319.50s/it]

Loss: 18709.4805


 15%|█▌        | 6/40 [33:33<2:57:17, 312.87s/it]

Loss: 15932.3525


 18%|█▊        | 7/40 [37:53<2:42:35, 295.61s/it]

Loss: 13540.8594


 20%|██        | 8/40 [42:11<2:31:21, 283.79s/it]

Loss: 11484.5693


 22%|██▎       | 9/40 [46:29<2:22:29, 275.78s/it]

Loss: 9720.4004


 25%|██▌       | 10/40 [50:48<2:15:11, 270.37s/it]

Loss: 8210.2832


 28%|██▊       | 11/40 [55:06<2:08:54, 266.69s/it]

Loss: 6920.6084


 30%|███       | 12/40 [59:24<2:03:16, 264.17s/it]

Loss: 5821.7866


 32%|███▎      | 13/40 [1:03:42<1:57:57, 262.15s/it]

Loss: 4887.8145


 35%|███▌      | 14/40 [1:07:59<1:52:58, 260.73s/it]

Loss: 4095.8777


 38%|███▊      | 15/40 [1:12:16<1:48:10, 259.62s/it]

Loss: 3426.0100


 40%|████      | 16/40 [1:16:34<1:43:33, 258.88s/it]

Loss: 2860.7917


 42%|████▎     | 17/40 [1:20:51<1:39:02, 258.36s/it]

Loss: 2385.0676


 45%|████▌     | 18/40 [1:25:08<1:34:34, 257.95s/it]

Loss: 1985.6947


 48%|████▊     | 19/40 [1:29:25<1:30:10, 257.66s/it]

Loss: 1651.3024


 50%|█████     | 20/40 [1:33:42<1:25:51, 257.56s/it]

Loss: 1372.0796


 52%|█████▎    | 21/40 [1:37:59<1:21:30, 257.40s/it]

Loss: 1139.5837


 55%|█████▌    | 22/40 [1:42:16<1:17:10, 257.26s/it]

Loss: 946.5646


 57%|█████▊    | 23/40 [1:46:33<1:12:51, 257.15s/it]

Loss: 786.8124


 60%|██████    | 24/40 [1:50:52<1:08:45, 257.86s/it]

Loss: 655.0202


 62%|██████▎   | 25/40 [1:55:17<1:04:57, 259.83s/it]

Loss: 546.6632


 65%|██████▌   | 26/40 [1:59:35<1:00:30, 259.30s/it]

Loss: 457.8926


 68%|██████▊   | 27/40 [2:03:52<56:03, 258.69s/it]  

Loss: 385.4426


 70%|███████   | 28/40 [2:08:09<51:38, 258.23s/it]

Loss: 326.5485


 72%|███████▎  | 29/40 [2:12:31<47:32, 259.31s/it]

Loss: 278.8766


 75%|███████▌  | 30/40 [2:16:59<43:38, 261.85s/it]

Loss: 240.4620


 78%|███████▊  | 31/40 [2:21:21<39:15, 261.77s/it]

Loss: 209.6553


 80%|████████  | 32/40 [2:25:39<34:47, 260.90s/it]

Loss: 185.0765


 82%|████████▎ | 33/40 [2:29:58<30:22, 260.30s/it]

Loss: 165.5743


 85%|████████▌ | 34/40 [2:34:17<25:59, 259.93s/it]

Loss: 150.1922


 88%|████████▊ | 35/40 [2:38:37<21:39, 259.82s/it]

Loss: 138.1382


 90%|█████████ | 36/40 [2:42:56<17:18, 259.62s/it]

Loss: 128.7590


 92%|█████████▎| 37/40 [2:47:15<12:58, 259.38s/it]

Loss: 121.5185


 95%|█████████▌| 38/40 [2:51:34<08:38, 259.22s/it]

Loss: 115.9784


 98%|█████████▊| 39/40 [2:55:52<04:19, 259.02s/it]

Loss: 111.7819


100%|██████████| 40/40 [3:00:11<00:00, 270.29s/it]

Loss: 108.6403





In [None]:
graphs_harmonic.append(load_graph(2016, graph, encoded_data_no_year, nx.harmonic_centrality))
sequences_harmonic.append([graphs_harmonic[:-1], graphs_harmonic[-1]])
pipeline.eval()
with torch.no_grad():
    preds = pipeline(sequences_harmonic[-1][0])
    loss = criterion(preds, sequences_harmonic[-1][1].y.unsqueeze(0))
    print(f"Loss on test = {loss.item()}")

Loss on test = 137.4388427734375


In [31]:
torch.std(sequences_harmonic[-1][1].y)

tensor(163.2631)

Как можем заметить, вторая модель предсказывает неплохо хотя бы относительно среднего отклонения.

Но для чистоты эскперимента непосредственно с моделями, хотелось бы проверить их работу на открытых датасетах и сравнить результат со статьей про DGINI

Потом я понял, что они решали немного другую задачи, и мне пришлось бы слишком сильно менять структуру модели, чтобы это учесть

In [11]:
from torch_geometric.datasets import AMiner

In [13]:
dataset = AMiner(root="../data/AMiner_dataset")

Downloading https://www.dropbox.com/s/1bnz8r7mofx0osf/net_aminer.zip?dl=1
Extracting ../data/AMiner_dataset/net_aminer.zip
Downloading https://www.dropbox.com/s/nkocx16rpl4ydde/label.zip?dl=1
Extracting ../data/AMiner_dataset/raw/label.zip
Processing...
Done!


In [17]:
dataset.data

HeteroData(
  author={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531,
  },
  venue={
    y=[134],
    y_index=[134],
    num_nodes=3883,
  },
  paper={ num_nodes=3194405 },
  (paper, written_by, author)={ edge_index=[2, 9323605] },
  (author, writes, paper)={ edge_index=[2, 9323605] },
  (paper, published_in, venue)={ edge_index=[2, 3194405] },
  (venue, publishes, paper)={ edge_index=[2, 3194405] }
)

In [18]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("abdullahdekebobeketa/aminer-citation-ntk-dataset-v11")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/abdullahdekebobeketa/aminer-citation-ntk-dataset-v11?dataset_version_number=1...


100%|██████████| 3.75G/3.75G [02:45<00:00, 24.3MB/s]

Extracting files...





Path to dataset files: /home/vunz/.cache/kagglehub/datasets/abdullahdekebobeketa/aminer-citation-ntk-dataset-v11/versions/1


In [25]:
import json

data = []
counter = 0
with open("../data/dblp_papers_v11.txt", "r") as f:
    for line in f:
        data.append(json.loads(line))
        counter += 1
        if counter == 100000:
            break

In [32]:
data[1].keys()

dict_keys(['id', 'title', 'authors', 'venue', 'year', 'n_citation', 'page_start', 'page_end', 'doc_type', 'publisher', 'volume', 'issue', 'doi', 'references', 'indexed_abstract', 'fos'])

In [4]:
data = CitationFull(root="../data/datasets", name="dblp")

In [13]:
data._data

Data(x=[17716, 1639], edge_index=[2, 105734], y=[17716])

In [14]:
dataset = np.load("../data/datasets/dblp/raw/dblp.npz", allow_pickle=True)

In [15]:
dataset.files

['adj_data',
 'attr_indices',
 'adj_indices',
 'adj_shape',
 'adj_indptr',
 'attr_data',
 'idx_to_attr',
 'attr_indptr',
 'labels',
 'attr_shape',
 'idx_to_node',
 'attr_text']

In [16]:
arrs = []
for file in dataset.files:
    arrs.append(dataset[file])

In [23]:
arrs[5]

array([0.25524727, 0.57424884, 0.41145328, ..., 0.34304865, 0.2042381 ,
       0.48243084], shape=(92192,))