In [38]:
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import matplotlib.pyplot as plt
import random
from typing import Dict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime

from torch import nn, optim, Tensor, torch

In [2]:
from ndl_similarity.data_loader import UserUserDataset
from ndl_similarity.ndl_similarity import *

# Data understanding and preparation
The aim of this notebook is to build the dataset for the non-deep-learning method on top of the data loader, using the data loader's train/validation/test split and its negative-sampling method.
At the end of the notebook, we write three separate CSV files for training, validation and testing.

The modeling is done in a separate notebook named *ndl_modeling.ipynb*.

In [3]:
# Each table ships as two ';'-separated CSV files (01-15 and 16-31 May 2019, per the file names).
postings_1 = pd.read_csv("../data/Postings_01052019_15052019.csv", sep=";")
postings_2 = pd.read_csv("../data/Postings_16052019_31052019.csv", sep=";")
votes_1 = pd.read_csv("../data/Votes_01052019_15052019.csv", sep=";")
votes_2 = pd.read_csv("../data/Votes_16052019_31052019.csv", sep=";")

# Stack both halves of each table; the per-file row labels are kept as they are.
postings = pd.concat((postings_1, postings_2))
votes = pd.concat((votes_1, votes_2))

# Input format expected by the data loader: table name -> DataFrame.
datasets_dict = dict(postings=postings, votes=votes)

We instantiate the data loader to understand its data structure and to obtain a data frame for the ID mappings.

In [4]:
# Wrap the raw tables in the project data loader; verbose=True presumably
# triggers the size summary printed below.
uu_dataset = UserUserDataset(datasets_dict=datasets_dict, verbose=True)

Size of user-article interaction dataframe:  (895196, 6)
Number of unique users:  33703
Number of unique articles:  4351
Number of edges:  895196


In [5]:
user_article, users_mapping, articles_mapping = uu_dataset.outsource_information()

# Invert both mappings so that mapped values can be translated back to their keys.
reverse_user_mapping = dict(zip(users_mapping.values(), users_mapping.keys()))
reverse_article_mapping = dict(zip(articles_mapping.values(), articles_mapping.keys()))

def revMap(enc_list, map_dict):
    """Translate encoded values back to their original labels.

    Args:
        enc_list: 1-D collection of encoded values. Either an object exposing
            ``tolist()`` (e.g. a torch tensor or numpy array) or a plain
            iterable whose items are Python scalars or expose ``.item()``.
        map_dict (dict): Lookup from encoded value to original label.

    Returns:
        list: The ``map_dict`` entry for every encoded value, in input order.

    Raises:
        KeyError: If an encoded value is not a key of ``map_dict``.
    """
    if hasattr(enc_list, "tolist"):
        # One bulk conversion instead of one `.item()` call per element;
        # this matters for edge lists with hundreds of thousands of entries.
        codes = enc_list.tolist()
    else:
        # Plain iterables: unwrap 0-d tensors/arrays, pass Python scalars through.
        codes = [code.item() if hasattr(code, "item") else code for code in enc_list]
    return [map_dict[code] for code in codes]

In [6]:
# Inspect the interaction table returned by the data loader.
user_article

Unnamed: 0,ID_CommunityIdentity,ID_Article,user_id,article_id,user_id_enc,article_enc
0,30,2000102732845,u-30,a-2000102732845,7481,1107
1,30,2000103774090,u-30,a-2000103774090,7481,3547
2,30,2000104101028,u-30,a-2000104101028,7481,4239
3,38,2000102598376,u-38,a-2000102598376,8060,800
4,38,2000102673879,u-38,a-2000102673879,8060,977
...,...,...,...,...,...,...
744097,694428,2000104092603,u-694428,a-2000104092603,32296,4215
744099,694465,2000104039572,u-694465,a-2000104039572,32302,4119
744100,694465,2000104075987,u-694465,a-2000104075987,32302,4173
744101,694465,2000104146249,u-694465,a-2000104146249,32302,4316


In [7]:
# In the recorded run these ratios give an 80/10/10 train/val/test split
# (see the shapes printed below).
train_edge_index, val_edge_index, test_edge_index = uu_dataset.get_train_test_val_split(
    val_split_ratio=0.2,
    test_split_ratio=0.5,
    random_state=1,
    sparse=False,
)

Train edge index shape:  torch.Size([2, 716156])
Val edge index shape:  torch.Size([2, 89520])
Test edge index shape:  torch.Size([2, 89520])


In [8]:
# Peek at the training edges: row 0 is later decoded with the user mapping,
# row 1 with the article mapping.
train_edge_index

tensor([[27952, 13714,  3642,  ...,  2329,  3430,  7043],
        [ 4117,  1419,  4045,  ...,   703,  2644,  1103]])

In [9]:
# Concatenate the three splits along dim 1 to recover the complete edge list.
# NOTE(review): everything derived from `edges_all` also sees validation/test
# edges — confirm this is intended for the features built from it.
all_splits = (train_edge_index, val_edge_index, test_edge_index)
edges_all = torch.cat(all_splits, dim=1)
edges_all.shape

torch.Size([2, 895196])

In [10]:
# Translate both rows of the edge list back through the reverse mappings and
# build the graph from the positive edges only (no negative targets).
G = torch_to_Graph(
    source=revMap(edges_all[0], reverse_user_mapping),
    pos_target=revMap(edges_all[1], reverse_article_mapping),
    neg_target=None,
)

In [11]:
# networkx's to_undirected() already returns an independent copy of the graph,
# so the extra G.copy() only duplicated the graph a second time.
G_undirected = G.to_undirected()

In [12]:
# NOTE(review): centrality is computed on `G` while degree uses `G_undirected`
# — confirm that mixing the two graphs is intended.
centrality_dict = nx.degree_centrality(G)
degree_dict = {node: deg for node, deg in G_undirected.degree()}

In [13]:
# One single-column frame per metric, both indexed by node ID.
centrality_df = pd.DataFrame.from_dict(centrality_dict, orient="index")
degree_df = pd.DataFrame.from_dict(degree_dict, orient="index")

# Join the two metrics on the node index and give the columns readable names.
graph_features_df = centrality_df.merge(degree_df, left_index=True, right_index=True)
graph_features_df.columns = ["centrality", "degree"]
graph_features_df.head()

Unnamed: 0,centrality,degree
u-680585,0.00226,86
a-2000104039148,0.015058,573
u-531534,0.000447,17
a-2000102849817,0.00862,328
u-180735,0.001077,41


In [14]:
#graph_features_df.to_csv("../data/graph_features.csv")
#graph_features_df.to_csv("./ndl_similarity/graph_features.csv")

In [15]:
# Preview the raw votes table.
votes.head()

Unnamed: 0,ID_CommunityIdentity,ID_Posting,VoteNegative,VotePositive,VoteCreatedAt,UserCommunityName,UserGender,UserCreatedAt
0,675862,1041076570,1,0,2019-05-06 16:47:46.883,Heckscheibenwischer,m,2018-06-26 06:04:30.513
1,689023,1041076570,1,0,2019-05-01 22:19:06.240,Heinz Fettleber,,2019-03-08 21:23:11.463
2,24810,1041076745,0,1,2019-05-01 23:54:54.600,Bruce Campbell,m,2011-01-12 16:50:40.597
3,673781,1041076745,0,1,2019-05-01 20:59:29.910,Erdäpfelsack,,2018-05-29 07:13:49.350
4,24810,1041076831,0,1,2019-05-01 23:51:42.730,Bruce Campbell,m,2011-01-12 16:50:40.597


In [16]:
# Preview the raw postings table.
postings.head()

Unnamed: 0,ID_Posting,ID_Posting_Parent,ID_CommunityIdentity,PostingHeadline,PostingComment,PostingCreatedAt,ID_Article,ArticlePublishingDate,ArticleTitle,ArticleChannel,ArticleRessortName,UserCommunityName,UserGender,UserCreatedAt
0,1041073586,1041073000.0,671476,Das hat gestern bereits der Voggenhuber angefü...,schieder hatte dem inhaltlich nichts entgegenz...,2019-05-01 18:21:15.127,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470
1,1041073839,1041073000.0,566938,,...und meinen Bezirk bekommst du als Erbe mit.,2019-05-01 18:28:22.040,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,AlphaRomeo,m,2015-08-28 17:07:41.110
2,1041073872,1041069000.0,669286,,"Nein, bei der ÖVP/FPÖ genauso passiert. Ich wo...",2019-05-01 18:29:05.533,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Hpolditsch,,2018-03-06 20:03:42.737
3,1041080734,1041080000.0,671476,Sie haben doch nichts gefordert??,sie haben nur die regierung kritisiert. das di...,2019-05-01 22:37:56.010,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470
4,1041080828,,671476,Heute wäre der perfekte Tag für die SPÖ gewese...,"ihr noch nicht erfülltes versprechen, den silb...",2019-05-01 22:42:06.310,2000102330973,2019-05-01 10:28:57.49,1. Mai in Wien: SPÖ fordert von Strache Rücktritt,Inland,Parteien,Ravenspower,,2018-04-14 13:42:28.470


# ToDo
- create a separate user and article df incl. ID, Degree, Centrality, and node specific features
- merge the two based on the train-test split
    - create new df called Edges
- (remove the IDs)
- do the binary classification

In [17]:
def add_ID_prefix(type: str, df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Add a prefixed string ID column to ``df`` in place.

    Args:
        type (str): ``"user"`` adds ``user_enc`` with prefix ``"u-"``;
            ``"article"`` adds ``article_enc`` with prefix ``"a-"``.
        df (pd.DataFrame): Frame to extend. It is modified in place.
        target_col (str): Column holding the raw IDs to prefix.

    Returns:
        pd.DataFrame: The same ``df`` object, so the call can also be chained.

    Raises:
        ValueError: If ``type`` is neither ``"user"`` nor ``"article"``.
    """
    # node type -> (name of the new column, prefix put in front of the raw ID)
    prefix_specs = {
        "user": ("user_enc", "u-"),
        "article": ("article_enc", "a-"),
    }
    if type not in prefix_specs:
        # Previously an unknown type was silently ignored and no column was added.
        raise ValueError(f"type must be 'user' or 'article', got {type!r}")

    new_col, prefix = prefix_specs[type]
    df[new_col] = prefix + df[target_col].astype(str)
    return df

In [18]:
# Users appear in both postings and votes; keep one row per user ID.
user_columns = ["ID_CommunityIdentity", "UserCreatedAt"]
user_posting_df = postings[user_columns].copy()
user_votes_df = votes[user_columns].copy()
user_df = (
    pd.concat([user_posting_df, user_votes_df])
    .drop_duplicates(subset=["ID_CommunityIdentity"])
    .copy()
)

# Article attributes are taken from the postings table; keep one row per article ID.
article_columns = ["ID_Article", "ArticlePublishingDate", "ArticleChannel", "ArticleRessortName"]
article_df = postings[article_columns].drop_duplicates(subset=["ID_Article"]).copy()

In [19]:
add_ID_prefix(type="user", df=user_df, target_col="ID_CommunityIdentity")
# From here on the prefixed `user_enc` column is the only user key.
user_df = user_df.drop(columns="ID_CommunityIdentity")
n_users = len(user_df)
print(f"User_df shape: {user_df.shape}")

# Left join keeps every user; the node IDs live in the index of graph_features_df.
user_df = user_df.merge(graph_features_df, how="left", left_on="user_enc", right_index=True)
print(f"User_df shape: {user_df.shape}")
# Make the row-count check explicit instead of only comparing the two prints by eye.
assert len(user_df) == n_users, "left merge on graph features must not add or drop users"
user_df.head()


User_df shape: (33703, 2)
User_df shape: (33703, 4)


Unnamed: 0,UserCreatedAt,user_enc,centrality,degree
0,2018-04-14 13:42:28.470,u-671476,0.004178,159
1,2015-08-28 17:07:41.110,u-566938,0.003206,122
2,2018-03-06 20:03:42.737,u-669286,0.002286,87
6,2013-09-04 12:37:09.603,u-523527,0.000105,4
7,2007-04-02 20:34:49.397,u-74674,0.010485,399


In [20]:
# Preview the user table after the graph features were merged in.
user_df.head()

Unnamed: 0,UserCreatedAt,user_enc,centrality,degree
0,2018-04-14 13:42:28.470,u-671476,0.004178,159
1,2015-08-28 17:07:41.110,u-566938,0.003206,122
2,2018-03-06 20:03:42.737,u-669286,0.002286,87
6,2013-09-04 12:37:09.603,u-523527,0.000105,4
7,2007-04-02 20:34:49.397,u-74674,0.010485,399


In [21]:
add_ID_prefix(type="article", df=article_df, target_col="ID_Article")
# From here on the prefixed `article_enc` column is the only article key.
article_df = article_df.drop(columns="ID_Article")
n_articles = len(article_df)
print(f"article shape: {article_df.shape}")

# Left join keeps every article; the node IDs live in the index of graph_features_df.
article_df = article_df.merge(graph_features_df, how="left", left_on="article_enc", right_index=True)
print(f"article shape: {article_df.shape}")
# Make the row-count check explicit instead of only comparing the two prints by eye.
assert len(article_df) == n_articles, "left merge on graph features must not add or drop articles"
article_df.head()


article shape: (4351, 4)
article shape: (4351, 6)


Unnamed: 0,ArticlePublishingDate,ArticleChannel,ArticleRessortName,article_enc,centrality,degree
0,2019-05-01 10:28:57.49,Inland,Parteien,a-2000102330973,0.048853,1859
13,2019-05-03 11:45:14.00,Inland,Sachpolitik,a-2000102386715,0.073634,2802
19,2019-05-01 08:00:00.00,Meinung,Kommentare der anderen,a-2000102257779,0.013429,511
20,2019-05-02 09:30:02.84,International,Venezuela,a-2000102370556,0.005492,209
29,2019-05-01 08:00:00.00,Inland,SPÖ,a-2000102263929,0.013981,532


In [22]:
def create_edge_df(uu_dataset, edge_index, user_df, article_df,
                   user_mapping=None, article_mapping=None):
    """Build a labelled edge table (existing + sampled negative edges) with node features.

    Args:
        uu_dataset (Dataloader): The data loader. It must provide the
            get_negative_samples method.
        edge_index (torch.Tensor): One split returned by the data loader's
            get_train_test_val_split method.
        user_df (pd.DataFrame): User features, keyed by the ``user_enc`` column.
        article_df (pd.DataFrame): Article features, keyed by the
            ``article_enc`` column.
        user_mapping (dict, optional): Lookup used to decode the source values.
            Defaults to the notebook-level ``reverse_user_mapping``.
        article_mapping (dict, optional): Lookup used to decode the target
            values. Defaults to the notebook-level ``reverse_article_mapping``.

    Returns:
        edge_df (pd.DataFrame): One row per positive and per negative edge with
            the columns ``user``, ``article`` and ``Existing_edge`` followed by
            the user and article features. Feature names present in both
            feature frames (centrality, degree) get the suffixes ``_user`` and
            ``_article``; ``UserCreatedAt`` and ``ArticlePublishingDate`` are
            parsed to datetimes.

    Raises:
        ValueError: If a date string does not match
            ``'%Y-%m-%d %H:%M:%S.%f'`` (raised by ``pd.to_datetime``).
    """
    # Fall back to the notebook-level reverse mappings when none are passed in.
    if user_mapping is None:
        user_mapping = reverse_user_mapping
    if article_mapping is None:
        article_mapping = reverse_article_mapping

    source, pos_target, neg_target = uu_dataset.get_negative_samples(edge_index)

    # Positive and negative rows share the same sources, so decode them only once.
    users = np.array(revMap(source, user_mapping))
    df_pos = pd.DataFrame({
        "user": users,
        "article": np.array(revMap(pos_target, article_mapping)),
        "Existing_edge": True,
    })
    df_neg = pd.DataFrame({
        "user": users,
        "article": np.array(revMap(neg_target, article_mapping)),
        "Existing_edge": False,
    })
    edge_df = pd.concat([df_pos, df_neg])

    # Attach the user features; the join key of the right side is redundant afterwards.
    edge_df = (
        edge_df
        .merge(user_df, left_on="user", right_on="user_enc", how="left")
        .drop(columns="user_enc")
    )
    # Attach the article features. Columns that exist on both sides are
    # disambiguated directly instead of renaming pandas' default _x/_y suffixes.
    edge_df = (
        edge_df
        .merge(article_df, left_on="article", right_on="article_enc", how="left",
               suffixes=("_user", "_article"))
        .drop(columns="article_enc")
    )

    date_format = '%Y-%m-%d %H:%M:%S.%f'
    edge_df["UserCreatedAt"] = pd.to_datetime(edge_df["UserCreatedAt"], format=date_format)
    edge_df["ArticlePublishingDate"] = pd.to_datetime(edge_df["ArticlePublishingDate"], format=date_format)

    return edge_df

In [23]:
# Build the feature table for every split, in train -> val -> test order.
train_edge_df, val_edge_df, test_edge_df = (
    create_edge_df(uu_dataset, split_edge_index, user_df, article_df)
    for split_edge_index in (train_edge_index, val_edge_index, test_edge_index)
)

In [24]:
# Persist each split without the row index.
for csv_path, split_df in (
    ("../data/train_edge_df.csv", train_edge_df),
    ("../data/val_edge_df.csv", val_edge_df),
    ("../data/test_edge_df.csv", test_edge_df),
):
    split_df.to_csv(csv_path, index=False)