In [32]:
import scipy.io as sio
import numpy as np
from scipy.sparse import csr_matrix

In [33]:
# Load the raw ACM MATLAB file; later cells index the result by variable name
# (e.g. ACM['PvsC'], ACM['PvsA']).
acm_mat_path = '../raw/ACM/ACM.mat'
ACM = sio.loadmat(acm_mat_path)

In [34]:
# paper_conf[i] is the conference (column) id of paper i, so that positions in
# this array can be used directly as paper row ids by the following cells.
#
# The order in which `.nonzero()` enumerates entries depends on the sparse
# storage format (column-major for CSC input), so the column ids are scattered
# by their row index instead of relying on their position in the output.
# Papers without any conference entry keep the sentinel -1 and never match a
# conference filter.
# NOTE(review): assumes at most one conference per paper; with several entries
# in one row, whichever is enumerated last wins - confirm against the data.
pc_rows, pc_cols = ACM['PvsC'].nonzero()
paper_conf = np.full(ACM['PvsC'].shape[0], -1, dtype=pc_cols.dtype)
paper_conf[pc_rows] = pc_cols

In [35]:
# For each research area: a boolean mask over `paper_conf` plus the positions
# where the mask is set.

# Database: conference ids 1 and 13 (SIGMOD, VLDB)
paper_db = np.isin(paper_conf, [1, 13])
paper_db_idx = np.flatnonzero(paper_db)

# Data mining: conference id 0 (KDD)
paper_dm = np.isin(paper_conf, [0])
paper_dm_idx = np.flatnonzero(paper_dm)

# Wireless communication: conference ids 9 and 10 (SIGCOMM, MobiCOMM)
paper_wc = np.isin(paper_conf, [9, 10])
paper_wc_idx = np.flatnonzero(paper_wc)

In [36]:
# All selected papers (database + data mining + wireless communication),
# merged into one ascending index array.
paper_idx = np.sort([*paper_db_idx, *paper_dm_idx, *paper_wc_idx])

In [37]:
# Class label of every selected paper, aligned with `paper_idx`:
#   0: database, 1: wireless communication, 2: data mining
# Vectorised membership tests replace the per-element `idx in ndarray` checks,
# each of which scanned the whole index array (quadratic overall).
# The conditions are evaluated in priority order, so a paper found in the
# database set is labelled 0 even if it also appeared in the wireless set;
# everything that matches neither falls through to 2.
paper_target = np.select(
    [np.isin(paper_idx, paper_db_idx), np.isin(paper_idx, paper_wc_idx)],
    [0, 1],
    default=2,
)

## Edges and Features

In [38]:
# Column (author) id of every paper-author entry among the selected papers.
authorslen = ACM['PvsA'][paper_idx].nonzero()[1]

# Map each raw author id to a global node id. Ids are handed out in order of
# first appearance and start right after the paper block (0 .. len(paper_idx)-1).
author_offset = len(paper_idx)
author_dic = {}
all_authors = []
for raw_author in authorslen:
    # setdefault only stores the candidate id the first time an author is seen.
    node_id = author_dic.setdefault(raw_author, len(author_dic) + author_offset)
    all_authors.append(node_id)
all_authors = np.array(all_authors)

In [39]:
# Column (subject) id of every paper-subject entry among the selected papers.
subjectslen = ACM['PvsL'][paper_idx].nonzero()[1]

# Map each raw subject id to a global node id. Ids are handed out in order of
# first appearance and start right after the paper and author blocks.
subject_offset = len(paper_idx) + len(author_dic)
subject_dic = {}
all_subjects = []
for raw_subject in subjectslen:
    # setdefault only stores the candidate id the first time a subject is seen.
    node_id = subject_dic.setdefault(raw_subject, len(subject_dic) + subject_offset)
    all_subjects.append(node_id)
all_subjects = np.array(all_subjects)

In [40]:
# Column (term) id of every paper-term entry among the selected papers.
termslen = ACM['PvsT'][paper_idx].nonzero()[1]

# Map each raw term id to a global node id. Ids are handed out in order of
# first appearance and start right after the paper, author and subject blocks.
term_offset = len(paper_idx) + len(author_dic) + len(subject_dic)
term_dic = {}
all_items = []
for raw_term in termslen:
    # setdefault only stores the candidate id the first time a term is seen.
    node_id = term_dic.setdefault(raw_term, len(term_dic) + term_offset)
    all_items.append(node_id)
all_items = np.array(all_items)

In [41]:
# Total number of nodes in the joint id space: papers, authors, subjects, terms.
num_node = sum(len(group) for group in (paper_idx, author_dic, subject_dic, term_dic))


def _paper_relation_adjacency(relation_key, target_node_ids):
    """Build a (num_node x num_node) adjacency matrix for one paper relation.

    relation_key    -- key of the paper-vs-X matrix in ACM (e.g. 'PvsA').
    target_node_ids -- global node id of the X endpoint of every entry, in the
                       same order as the entries returned by .nonzero().

    Rows are the local paper indices of the row-sliced relation matrix, columns
    are the global target node ids, and every stored value is 1.
    """
    paper_rows = ACM[relation_key][paper_idx].nonzero()[0]
    return csr_matrix(
        (np.ones_like(paper_rows), (paper_rows, target_node_ids)),
        shape=(num_node, num_node),
    )


# Paper -> author / subject / term edges.
A_pa = _paper_relation_adjacency('PvsA', all_authors)
A_ps = _paper_relation_adjacency('PvsL', all_subjects)
A_pt = _paper_relation_adjacency('PvsT', all_items)

# Reverse directions: author -> paper and subject -> paper.
A_ap = A_pa.transpose()
A_sp = A_ps.transpose()

In [42]:
# Binary bag-of-terms features: an entry is 1 when the node is linked to the
# term at least once, no matter how many times.
#   paper_feat   - terms attached to the paper itself
#   author_feat  - terms reachable through any paper of the author
#   subject_feat - terms reachable through any paper of the subject
#
# `np.int` was removed in NumPy 1.24; the builtin `int` is the type it aliased.
# The term columns are addressed with explicit offsets rather than a negative
# start (`-len(term_dic):`), which would select *all* columns if term_dic were
# empty.
feat_n_paper = len(paper_idx)
feat_author_end = feat_n_paper + len(author_dic)
feat_subject_end = feat_author_end + len(subject_dic)
feat_term_end = feat_subject_end + len(term_dic)

paper_feat = np.array(
    A_pt[:feat_n_paper, feat_subject_end:feat_term_end].toarray() > 0,
    dtype=int,
)
author_feat = np.array(
    A_pa.transpose().dot(A_pt)[feat_n_paper:feat_author_end, feat_subject_end:feat_term_end].toarray() > 0,
    dtype=int,
)
subject_feat = np.array(
    A_ps.transpose().dot(A_pt)[feat_author_end:feat_subject_end, feat_subject_end:feat_term_end].toarray() > 0,
    dtype=int,
)


In [43]:
# Train / validation / test split of the papers.
# For every class, 300 papers are drawn without replacement: the first 150 go
# to training and the remaining 150 to validation. Every paper that was not
# drawn ends up in the test set.
np.random.seed(20210521)

# The generator is consumed in label order, so the random draws happen in the
# sequence database (0), wireless communication (1), data mining (2).
train_valid_paper_DB, train_valid_paper_WC, train_valid_paper_DM = (
    list(np.random.choice(np.where(paper_target == label)[0], 300, replace=False))
    for label in (0, 1, 2)
)
sampled_per_class = (train_valid_paper_DB, train_valid_paper_WC, train_valid_paper_DM)

train_paper_idx = np.sort(
    np.array([i for sampled in sampled_per_class for i in sampled[:150]])
)
valid_paper_idx = np.sort(
    np.array([i for sampled in sampled_per_class for i in sampled[150:]])
)

# Walking the positions in ascending order yields an already sorted test set.
held_out = set(train_paper_idx) | set(valid_paper_idx)
test_paper_idx = np.array(
    [i for i in np.arange(paper_target.shape[0]) if i not in held_out]
)

In [44]:
from sklearn.preprocessing import OneHotEncoder

In [45]:
# One-hot encode the integer class labels; the encoder is fed the labels as a
# single-column 2-D array (one row per paper).
onehot_encoder = OneHotEncoder()
label_column = paper_target.reshape(-1, 1)
onehot_paper_label = onehot_encoder.fit_transform(label_column)

In [46]:
# Cut the rectangular relation blocks out of the square (num_node x num_node)
# adjacency matrices, using the layout [papers | authors | subjects | ...].
paper_span = slice(None, len(paper_idx))
author_span = slice(len(paper_idx), len(paper_idx) + len(author_dic))
subject_span = slice(author_span.stop, author_span.stop + len(subject_dic))

M_ap = A_ap[author_span, paper_span]    # author x paper
M_pa = A_pa[paper_span, author_span]    # paper  x author
M_ps = A_ps[paper_span, subject_span]   # paper  x subject

In [31]:
# Binary meta-path reachability over the full node id space:
#   M_pap[i, j] == 1 when nodes i and j are joined by a paper-author-paper path
#   M_apa[i, j] == 1 when nodes i and j are joined by an author-paper-author path
# `np.int` was removed in NumPy 1.24; the builtin `int` is the type it aliased.
# NOTE(review): both products are densified at (num_node x num_node) - this
# can be memory hungry for large graphs.
M_pap = np.array(A_pa.dot(A_ap).toarray() > 0, dtype=int)
M_apa = np.array(A_ap.dot(A_pa).toarray() > 0, dtype=int)

In [29]:
# Binary meta-path reachability over the full node id space:
#   M_sps[i, j] == 1 when nodes i and j are joined by a subject-paper-subject path
#   M_psp[i, j] == 1 when nodes i and j are joined by a paper-subject-paper path
# `np.int` was removed in NumPy 1.24; the builtin `int` is the type it aliased.
# NOTE(review): both products are densified at (num_node x num_node) - this
# can be memory hungry for large graphs.
M_sps = np.array(A_sp.dot(A_ps).toarray() > 0, dtype=int)
M_psp = np.array(A_ps.dot(A_sp).toarray() > 0, dtype=int)

In [None]:
# Cast the split indices and the one-hot labels to integers before export.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the type it aliased.
train_paper_idx = train_paper_idx.astype(int)
valid_paper_idx = valid_paper_idx.astype(int)
test_paper_idx = test_paper_idx.astype(int)
onehot_paper_label = onehot_paper_label.astype(int)

In [18]:
# Assemble everything that is exported. Keys are inserted in the same order as
# before so the layout of the saved file does not change.
#
# Node id layout: [papers | authors | subjects | terms]. All blocks are cut out
# with explicit positive offsets; the previous SPS slice used negative offsets
# (`...:-len(term_dic)`), which collapses to an empty slice when term_dic is
# empty and was inconsistent with the other slices.
export_n_paper = len(paper_idx)
export_author_end = export_n_paper + len(author_dic)
export_subject_end = export_author_end + len(subject_dic)

acm = {
    # Node features (binary term indicators).
    'paper_feature': csr_matrix(paper_feat),
    'author_feature': csr_matrix(author_feat),
    'subject_feature': csr_matrix(subject_feat),
    # Direct relations.
    'PA': M_pa,
    'PS': M_ps,
    # Meta-path adjacency, restricted to the block of the end-point node type.
    'PAP': csr_matrix(M_pap[:export_n_paper, :export_n_paper]),
    'APA': csr_matrix(M_apa[export_n_paper:export_author_end, export_n_paper:export_author_end]),
    'PSP': csr_matrix(M_psp[:export_n_paper, :export_n_paper]),
    'SPS': csr_matrix(M_sps[export_author_end:export_subject_end, export_author_end:export_subject_end]),
    # Labels and data split.
    'paper_label': onehot_paper_label,
    'train_paper_idx': train_paper_idx,
    'val_paper_idx': valid_paper_idx,
    'test_paper_idx': test_paper_idx,
}

In [22]:
# Write the assembled dataset to a MATLAB file in the current working directory.
sio.savemat(file_name='new_acm.mat', mdict=acm)