In [1]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import matplotlib.pyplot as plt
import random
from typing import Dict

from torch import nn, optim, Tensor, torch

In [2]:
from ndl_similarity.data_loader import UserUserDataset
from ndl_similarity.ndl_similarity import *

### ToDo
- Calculate the edge-attribute weights for the matrix
    - positive edges get weight 1, negative edges get weight 0, stored as a `pd.DataFrame`
- How do we compute the score metrics (recall and precision)?

In [3]:
# Raw postings and votes exports (semicolon-separated). Judging by the file
# names, each file covers one half of May 2019.
postings_1 = pd.read_csv("../data/Postings_01052019_15052019.csv", sep=";")
postings_2 = pd.read_csv("../data/Postings_16052019_31052019.csv", sep=";")
votes_1 = pd.read_csv("../data/Votes_01052019_15052019.csv", sep=";")
votes_2 = pd.read_csv("../data/Votes_16052019_31052019.csv", sep=";")

# Stack the two partial exports; the row labels of the inputs are kept as-is.
postings = pd.concat((postings_1, postings_2))
votes = pd.concat((votes_1, votes_2))

# Bundle both frames under the keys handed to the data loader.
datasets_dict = dict(postings=postings, votes=votes)

In [4]:
# Number of distinct article ids in the postings (a missing value would count once).
postings["ID_Article"].nunique(dropna=False)

4351

In [5]:
# Number of distinct ressort names in the postings (a missing value would count once).
postings["ArticleRessortName"].nunique(dropna=False)

689

The following cells instantiate the data loader to inspect its data structure and to obtain a data frame for the mapping.

In [6]:
# Build the dataset wrapper from the raw frames.
# NOTE(review): the size summary printed below appears to come from verbose=True.
uu_dataset = UserUserDataset(datasets_dict=datasets_dict, verbose=True)

Size of user-article interaction dataframe:  (895196, 6)
Number of unique users:  33703
Number of unique articles:  4351
Number of edges:  895196


In [7]:
# Pull the interaction frame and the two id dictionaries out of the loader.
# Presumably the dictionaries map original id -> encoded integer id.
user_article, users_mapping, articles_mapping = uu_dataset.outsource_information()

# Flip both dictionaries so the dictionary values can be looked up to get the keys back.
reverse_user_mapping = {encoded: original for original, encoded in users_mapping.items()}
reverse_article_mapping = {encoded: original for original, encoded in articles_mapping.items()}

def revMap(enc_list, map_dict):
    """Translate a sequence of encoded ids back through a lookup dictionary.

    Parameters
    ----------
    enc_list : 1-D tensor/array (any object exposing ``tolist()``) or iterable
        The encoded ids. When given as a plain iterable, elements may be
        ordinary Python values or scalar objects exposing ``.item()``.
    map_dict : dict
        Lookup from encoded id to the value to return for it.

    Returns
    -------
    list
        ``map_dict[id]`` for every id in ``enc_list``, in input order.

    Raises
    ------
    KeyError
        If an id from ``enc_list`` is not a key of ``map_dict``.
    """
    if hasattr(enc_list, "tolist"):
        # One bulk conversion to Python scalars instead of calling .item()
        # on every element, which is slow for large tensors/arrays.
        keys = enc_list.tolist()
    else:
        # Plain iterables: unwrap scalar wrappers, pass ordinary values through.
        keys = [item.item() if hasattr(item, "item") else item for item in enc_list]
    return [map_dict[key] for key in keys]

In [8]:
# Display the user-article interaction frame returned by the data loader.
user_article

Unnamed: 0,ID_CommunityIdentity,ID_Article,user_id,article_id,user_id_enc,article_enc
0,30,2000102732845,u-30,a-2000102732845,7481,1107
1,30,2000103774090,u-30,a-2000103774090,7481,3547
2,30,2000104101028,u-30,a-2000104101028,7481,4239
3,38,2000102598376,u-38,a-2000102598376,8060,800
4,38,2000102673879,u-38,a-2000102673879,8060,977
...,...,...,...,...,...,...
744097,694428,2000104092603,u-694428,a-2000104092603,32296,4215
744099,694465,2000104039572,u-694465,a-2000104039572,32302,4119
744100,694465,2000104075987,u-694465,a-2000104075987,32302,4173
744101,694465,2000104146249,u-694465,a-2000104146249,32302,4316


In [9]:
# Split the edges into train / validation / test edge-index tensors with a fixed seed.
# NOTE(review): judging by the printed shapes, 20% of the edges are held out
# and then halved between validation and test - confirm against the loader.
train_edge_index, val_edge_index, test_edge_index = uu_dataset.get_train_test_val_split(
    val_split_ratio=0.2,
    test_split_ratio=0.5,
    random_state=1,
    sparse=False,
)

Train edge index shape:  torch.Size([2, 716156])
Val edge index shape:  torch.Size([2, 89520])
Test edge index shape:  torch.Size([2, 89520])


In [10]:
# Stitch the three splits back together along the edge dimension (dim=1).
edges_all = torch.cat((train_edge_index, val_edge_index, test_edge_index), dim=1)
edges_all.size()

torch.Size([2, 895196])

In [11]:
# Display the combined edge index built from the train, validation and test splits.
edges_all

tensor([[27952, 13714,  3642,  ...,  4706, 22428,  3550],
        [ 4117,  1419,  4045,  ...,   994,  3355,  1599]])

In [12]:
# Ask the loader for negative samples for the full edge set.
# NOTE(review): source, target and negative_target are not used by the cells
# visible below (the graph is built with neg_target=None) - confirm they are needed.
source, target, negative_target = uu_dataset.get_negative_samples(edges_all)

In [20]:
# Build the graph on the decoded ids: row 0 of the edge index goes through the
# user lookup, row 1 through the article lookup. Only positive targets are
# passed (neg_target=None).
G = torch_to_Graph(
    source=revMap(edges_all[0], reverse_user_mapping),
    pos_target=revMap(edges_all[1], reverse_article_mapping),
    neg_target=None,
)

In [21]:
# Edge count of the constructed graph, for comparison with the size of the edge index.
G.number_of_edges()

895196

In [22]:
# Undirected version of the graph. to_undirected() already returns an
# independent copy, so no explicit G.copy() is needed beforehand (it only
# duplicated the whole graph in memory a second time).
G_undirected = G.to_undirected()

In [23]:
# Per-node degree and degree centrality. Both are taken from the same
# undirected graph so the two feature columns always describe the same edge
# set (previously centrality was computed on G and degree on G_undirected).
degree_dict = dict(G_undirected.degree())
centrality_dict = nx.degree_centrality(G_undirected)

In [24]:
# One single-column frame per metric, indexed by the dictionary keys (node ids).
centrality_df = pd.DataFrame.from_dict(centrality_dict, orient="index")
degree_df = pd.DataFrame.from_dict(degree_dict, orient="index")

# Join the two frames on their index and give the columns readable names.
graph_features_df = centrality_df.merge(degree_df, left_index=True, right_index=True)
graph_features_df.columns = ["centrality", "degree"]
graph_features_df.head()

Unnamed: 0,centrality,degree
u-680585,0.00226,86
a-2000104039148,0.015058,573
u-531534,0.000447,17
a-2000102849817,0.00862,328
u-180735,0.001077,41


In [27]:
# Persist the node-level graph features as CSV. The frame's index (the node
# ids) is written as the first column. The commented-out line is an
# alternative output location under ../data that is currently disabled.
#graph_features_df.to_csv("../data/graph_features.csv")
graph_features_df.to_csv("./ndl_similarity/graph_features.csv")