In [None]:
import pickle
import pandas as pd
from collections import Counter
import numpy as np
import os
import warnings
warnings.filterwarnings("ignore")

In [None]:
from stellargraph import StellarGraph
from stellargraph.mapper import (
    CorruptedGenerator,
    FullBatchNodeGenerator,
    GraphSAGENodeGenerator,
    HinSAGENodeGenerator,
    HinSAGELinkGenerator,
    ClusterNodeGenerator,
    RelationalFullBatchNodeGenerator
)
from stellargraph import StellarGraph
from stellargraph.layer import DeepGraphInfomax, GAT, GCN, RGCN, HinSAGE
from stellargraph.utils import plot_history
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras import Model
from sklearn.preprocessing import normalize

In [None]:
# Load the two pickled inputs used by the rest of the notebook.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
# The context managers close the file handles as soon as loading finishes
# instead of leaving them open for the lifetime of the kernel.
with open('raw/DBLP/dblp_preprocessed_dataset.pkl', 'rb') as dataset_file:
    # Later cells index each record as [0] paper id, [1] and [2] objects
    # exposing .todense() (used as skill and author vectors).
    dataset = pickle.load(dataset_file)
with open('raw/DBLP/documentID_venue.pkl', 'rb') as venue_file:
    # Later cells index each record as [0] paper id, [1] venue string.
    docID_venue = pickle.load(venue_file)

In [None]:
def _empty_table(*column_names):
    """Return a DataFrame that has the given columns and no rows."""
    return pd.DataFrame(columns=list(column_names))


# Empty relational tables; the cells below fill them from the pickled inputs.
# Node tables (one row per entity):
author_label = _empty_table('author_id', 'label', 'author_name', 'features')
# Link tables (one row per paper-to-entity relation):
paper_author = _empty_table('paper_id', 'author_id')
paper_conf = _empty_table('paper_id', 'conf_id')
paper_term = _empty_table('paper_id', 'term_id')
# Remaining node tables:
papers = _empty_table('paper_id', 'paper_title', 'features')
terms = _empty_table('term_id', 'term', 'features')
confs = _empty_table('conf_id', 'conf', 'features')

In [None]:
# Cleaning venue names
# Canonical venue keys; a raw venue string is replaced by a key when the key
# occurs as a substring of its lower-cased form.
publication_list = [
    'sigmod', 'vldb', 'icde', 'icdt', 'edbt', 'pods', 'kdd',
    'www', 'sdm', 'pkdd', 'icdm', 'cikm', 'aaai', 'icml',
    'ecml', 'colt', 'uai', 'soda', 'focs', 'stoc', 'stacs',
]

for record in docID_venue:
    raw_venue = record[1].lower()
    matching_keys = [pub for pub in publication_list if pub in raw_venue]
    if matching_keys:
        # When several keys match (e.g. 'kdd' and 'pkdd'), the one listed
        # last in publication_list wins.
        record[1] = matching_keys[-1]

# paper id -> cleaned venue; a repeated paper id keeps its last record's venue.
docID_venue_dict = {record[0]: record[1] for record in docID_venue}

In [None]:
# Walk the dataset once and derive:
#   * authors_counter / terms_counter : occurrence counts of author / term column indices
#   * skillID_feature                 : term id -> list of dense author vectors of the papers using that term
#   * venue_feature                   : venue   -> list of dense author vectors of the papers at that venue
#   * paper_author / paper_term / papers tables
authors_counter = Counter()
terms_counter = Counter()
skillID_feature = {}
venue_feature = {}

# Rows are collected in plain lists and turned into DataFrames once at the end.
# Growing a DataFrame with .append() inside the loop copies the whole frame on
# every iteration (quadratic) and DataFrame.append no longer exists in pandas 2.x.
# Building the tables from scratch also makes this cell safe to re-run.
paper_author_records = []
paper_term_records = []
paper_records = []

for record in dataset:
    paper_id = record[0]

    # Column indices of the non-zero entries of the densified skill vector.
    skillVector = record[1].todense()
    skillIdx = skillVector.nonzero()[1]
    terms_counter.update(skillIdx)

    # Same for the author vector (densified once and reused below).
    authorVector = record[2].todense()
    authorIdx = authorVector.nonzero()[1]
    authors_counter.update(authorIdx)

    for authorId in authorIdx:
        paper_author_records.append({'paper_id': paper_id, 'author_id': authorId})

    for skillId in skillIdx:
        paper_term_records.append({'paper_id': paper_id, 'term_id': skillId})
        skillID_feature.setdefault(skillId, []).append(authorVector)

    # A paper's feature is its dense author vector.
    paper_records.append({'paper_id': paper_id, 'paper_title': 'na', 'features': authorVector})

    # Raises KeyError if the paper has no entry in docID_venue_dict.
    target_venue = docID_venue_dict[paper_id]
    venue_feature.setdefault(target_venue, []).append(authorVector)

paper_author = pd.DataFrame(paper_author_records, columns=['paper_id', 'author_id'])
paper_term = pd.DataFrame(paper_term_records, columns=['paper_id', 'term_id'])
papers = pd.DataFrame(paper_records, columns=['paper_id', 'paper_title', 'features'])

In [None]:
# One author_label row per distinct author index seen in the dataset.
# The table is built in a single constructor call instead of calling
# DataFrame.append once per author (quadratic, and removed in pandas 2.x).
unique_authors_idx = list(authors_counter.keys())
author_label = pd.DataFrame(
    [
        # No label or name is available here: -1 and 'na' are placeholders.
        {'author_id': unique_authors_id, 'label': -1, 'author_name': 'na'}
        for unique_authors_id in unique_authors_idx
    ],
    # 'features' is not set in the records, so that column is filled with NaN.
    columns=['author_id', 'label', 'author_name', 'features'],
)

In [None]:
# One terms row per distinct term index; a term's feature is the element-wise
# sum of the author vectors of every paper that uses the term.
unique_terms_idx = list(terms_counter.keys())
terms = pd.DataFrame(
    [
        {
            'term_id': unique_terms_id,
            'term': 'na',
            # np.sum over axis 0 also covers the single-paper case, so every row
            # stores an array (previously singletons stored a one-element list).
            # The flattened feature vector built downstream is unchanged.
            'features': np.sum(skillID_feature[unique_terms_id], axis=0),
        }
        for unique_terms_id in unique_terms_idx
    ],
    # Built in one call rather than DataFrame.append per row (removed in pandas 2.x).
    columns=['term_id', 'term', 'features'],
)

In [None]:
# Count how many docID_venue records carry each (cleaned) venue name.
# Counter consumes the venue column directly; the unused per-record paper id
# lookup of the previous loop is dropped.
conf_counter = Counter(record[1] for record in docID_venue)
# Distinct venues, in order of first appearance in docID_venue.
venues = list(conf_counter.keys())

In [None]:
# Assign each venue a consecutive integer id and build the confs table.
# A venue's feature is the element-wise sum of the author vectors of its papers.
conf_confID = {}
conf_records = []
for i, venue in enumerate(venues):
    # The single-paper branch used to read the undefined name `venueID_feature`
    # (NameError); np.sum over axis 0 handles one or many vectors uniformly.
    # NOTE(review): venue_feature is only filled for papers present in `dataset`,
    # while `venues` comes from docID_venue; a venue with no paper in `dataset`
    # raises KeyError here - confirm both inputs cover the same papers.
    venue_features = np.sum(venue_feature[venue], axis=0)
    conf_records.append({'conf_id': i, 'conf': venue, 'features': venue_features})
    conf_confID[venue] = i

# Built once from the collected rows; DataFrame.append per row is quadratic and
# was removed in pandas 2.x.
confs = pd.DataFrame(conf_records, columns=['conf_id', 'conf', 'features'])

In [None]:
# Paper -> conference link table: one row per docID_venue record, with the venue
# name translated to its integer conf id.
# Built in a single constructor call instead of DataFrame.append per record
# (quadratic, and removed in pandas 2.x).
paper_conf = pd.DataFrame(
    [
        {'paper_id': record[0], 'conf_id': conf_confID[record[1]]}
        for record in docID_venue
    ],
    columns=['paper_id', 'conf_id'],
)

In [None]:
# Rebuild both occurrence counters from scratch with a second pass over the dataset.
authors_counter = Counter()
terms_counter = Counter()
for record in dataset:
    paper_id = record[0]
    # nonzero() on the densified vector returns (row indices, column indices);
    # only the column indices are counted.
    _, term_columns = record[1].todense().nonzero()
    _, author_columns = record[2].todense().nonzero()
    terms_counter.update(term_columns)
    authors_counter.update(author_columns)

In [None]:
# ---- 1. Keep only rows that are reachable from a known author ----------------
authors = author_label['author_id'].to_list()
paper_author = paper_author[paper_author['author_id'].isin(authors)].reset_index(drop=True)
valid_papers = paper_author['paper_id'].unique()
print('Number of papers :', len(valid_papers))

papers = papers[papers['paper_id'].isin(valid_papers)].reset_index(drop=True)
paper_conf = paper_conf[paper_conf['paper_id'].isin(valid_papers)].reset_index(drop=True)
# NOTE(review): this prints the number of paper-conference rows under the same
# label as the count above.
print('Number of papers :', len(paper_conf))

paper_term = paper_term[paper_term['paper_id'].isin(valid_papers)].reset_index(drop=True)
valid_terms = paper_term['term_id'].unique()
terms = terms[terms['term_id'].isin(valid_terms)].reset_index(drop=True)

# ---- 2. Sort every node table by its id so that row position is deterministic -
author_label = author_label.sort_values('author_id').reset_index(drop=True)
papers = papers.sort_values('paper_id').reset_index(drop=True)
terms = terms.sort_values('term_id').reset_index(drop=True)
confs = confs.sort_values('conf_id').reset_index(drop=True)

print('Number of conferences ', len(confs))
print('Number of authors ', len(author_label))
print('Number of terms ', len(terms))
print('Number of papers ', len(papers))

authors_list = list(author_label['author_id'])
papers_list = list(papers['paper_id'])
term_list = list(terms['term_id'])
conf_list = list(confs['conf_id'])
dim = len(authors_list) + len(papers_list) + len(term_list) + len(conf_list)
print(' Total entities :: ', dim)


# ---- 3. Map every entity to a globally unique serial id ----------------------
def _serial_id_mapping(entity_ids, offset):
    """Map each id in `entity_ids` to `offset + its position` in the list."""
    return {entity_id: offset + position for position, entity_id in enumerate(entity_ids)}


# Serial ids are laid out contiguously in the order authors, papers, terms, confs.
# Row position equals the reset index of each sorted table, so this matches the
# previous iterrows()-based construction without materialising a Series per row.
author_id_mapping = _serial_id_mapping(authors_list, 0)
paper_id_mapping = _serial_id_mapping(papers_list, len(authors_list))
term_id_mapping = _serial_id_mapping(term_list, len(authors_list) + len(papers_list))
conf_id_mapping = _serial_id_mapping(
    conf_list, len(authors_list) + len(papers_list) + len(term_list)
)

# ---- 4. Long-format lookup table: (domain, entity_id, serial_id) -------------
type_dict = {
    'author': author_id_mapping,
    'paper': paper_id_mapping,
    'term': term_id_mapping,
    'conf': conf_id_mapping,
}
domain_frames = []
for _type, _dict in type_dict.items():
    _df = pd.DataFrame(data={'entity_id': list(_dict.keys()), 'serial_id': list(_dict.values())})
    _df['domain'] = _type
    domain_frames.append(_df)

# pd.concat replaces repeated DataFrame.append calls (removed in pandas 2.x);
# the column selection keeps the original column order.
entity_id_map = pd.concat(domain_frames, ignore_index=True)[['domain', 'entity_id', 'serial_id']]

In [None]:
# ======================================================
# Save data
# ======================================================
data_save_path = 'processed_data/'
# A single makedirs(exist_ok=True) replaces the two exists()/mkdir() pairs that
# both targeted the same directory; it is also safe when the directory already exists.
os.makedirs(data_save_path, exist_ok=True)
# Persist the (domain, entity_id, serial_id) lookup table; the DataFrame index
# is written as the first, unnamed CSV column.
entity_id_map.to_csv(os.path.join(data_save_path, 'entity_id_mapping.csv'))

In [None]:
# Create graph data
def _feature_frame(table, id_mapping):
    """Stack the flattened 'features' cell of every row of `table` into a DataFrame.

    The result is indexed by the values of `id_mapping`, taken in the mapping's
    insertion order.
    """
    flattened = [np.asarray(cell).flatten() for cell in table['features']]
    return pd.DataFrame(flattened, index=list(id_mapping.values()))


# Authors carry no feature vector; they get one placeholder column 'feature1'
# built from the literal [0], indexed by their serial ids.
nodes_author_df = pd.DataFrame({'feature1': [0]}, index=list(author_id_mapping.values()))
nodes_paper_df = _feature_frame(papers, paper_id_mapping)
nodes_term_df = _feature_frame(terms, term_id_mapping)
nodes_conf_df = _feature_frame(confs, conf_id_mapping)

In [None]:
# Write one CSV per node type into data_save_path.
# index=False means the serial-id index of each frame is NOT written to the file.
for csv_name, node_frame in (
    ('nodes_author.csv', nodes_author_df),
    ('nodes_paper.csv', nodes_paper_df),
    ('nodes_term.csv', nodes_term_df),
    ('nodes_conf.csv', nodes_conf_df),
):
    node_frame.to_csv(os.path.join(data_save_path, csv_name), index=False)

In [None]:
def _build_edges(links, source_col, target_col, source_map, target_map):
    """Translate the rows of a link table into (source, target) serial-id pairs.

    `links[source_col]` values are looked up in `source_map` and
    `links[target_col]` values in `target_map`; an id missing from its map
    raises KeyError.
    """
    return [
        (source_map[source_id], target_map[target_id])
        for source_id, target_id in zip(links[source_col], links[target_col])
    ]


def _save_edges(edge_list, file_name):
    """Write `edge_list` to `file_name` inside data_save_path with columns source,target."""
    # Building the frame from the list of tuples (not np.array) keeps the two
    # named columns valid when the edge list is empty.
    edge_df = pd.DataFrame(edge_list, columns=['source', 'target'])
    edge_df.to_csv(os.path.join(data_save_path, file_name), index=False)


# Paper -> author edges
PA_edge_list = _build_edges(paper_author, 'paper_id', 'author_id', paper_id_mapping, author_id_mapping)
_save_edges(PA_edge_list, 'PA_edges.csv')

# Paper -> term edges
PT_edge_list = _build_edges(paper_term, 'paper_id', 'term_id', paper_id_mapping, term_id_mapping)
_save_edges(PT_edge_list, 'PT_edges.csv')

# Paper -> conference edges
PC_edge_list = _build_edges(paper_conf, 'paper_id', 'conf_id', paper_id_mapping, conf_id_mapping)
_save_edges(PC_edge_list, 'PC_edges.csv')

In [None]:
################## Saving Expert Node Features #################
# np.eye builds the float identity matrix directly; it replaces a pure-Python
# double loop over all 2076 x 2076 cells and produces the same values.
# NOTE(review): 2076 is hard-coded - presumably the number of author nodes;
# confirm against len(author_id_mapping).
# NOTE(review): a later cell writes the same file name with a 2470 x 2470 matrix
# and overwrites this output.
te = np.eye(2076)

df_te = pd.DataFrame(te)
df_te.to_csv("Author_IdentityMatrix_Features.csv")

In [None]:
################## Saving Expert Node Features #################
# np.eye builds the float identity matrix directly; it replaces a pure-Python
# double loop over all 2470 x 2470 cells and produces the same values.
# NOTE(review): 2470 is hard-coded - presumably the number of author nodes;
# confirm against len(author_id_mapping).
# NOTE(review): an earlier cell writes the same file name with a 2076 x 2076
# matrix; this cell overwrites it.
te = np.eye(2470)

df_te = pd.DataFrame(te)
df_te.to_csv("Author_IdentityMatrix_Features.csv")

In [None]:
src_dir = './processed_data/'

# Node table for the graph; the first CSV column becomes the DataFrame index.
# NOTE(review): neither 'nodes_DBLP.csv' nor the edge file below is written by
# the cells shown in this notebook - presumably they are prepared separately; confirm.
nodes_DBLP_df = pd.read_csv(os.path.join(src_dir, 'nodes_DBLP.csv'), index_col=0)

# Edge files to load; all of them are stacked into a single edge table.
fpath_list = ['nodes_DBLP_edges_withRelations.csv']
# pd.concat replaces the DataFrame.append accumulation (removed in pandas 2.x).
df_edges = pd.concat(
    [pd.read_csv(os.path.join(src_dir, fpath), index_col=None) for fpath in fpath_list],
    ignore_index=True,
)

In [None]:
# Build the graph with a single node type "DBLP"; the edge type of each row of
# df_edges is read from its "orientation" column.
dblp_nodes_by_type = {"DBLP": nodes_DBLP_df}
graph_obj = StellarGraph(
    nodes=dblp_nodes_by_type,
    edges=df_edges,
    edge_type_column="orientation",
)

In [None]:
# Print the summary returned by graph_obj.info() as a sanity check of the graph built above.
print(graph_obj.info())