In [1]:
import numpy as np
from scipy.sparse import csr_matrix
import json

In [7]:
def definition(inputFile):
    """Load a tab-separated ``<integer id><TAB><text>`` file.

    Parameters
    ----------
    inputFile : str
        Path of the file to read; one record per line.

    Returns
    -------
    tuple
        ``(hashMap, indices)`` where ``hashMap`` maps each integer id to its
        text and ``indices`` is the list of ids in ascending order.

    Raises
    ------
    ValueError
        If a non-blank line has no tab or its id is not an integer.
    """
    hashMap = {}
    with open(inputFile) as f:
        content = f.read().splitlines()

    # Iterate over the lines directly: works on Python 2 and 3 alike
    # (``xrange`` only exists on Python 2).
    for line in content:
        if not line.strip():
            continue  # tolerate blank lines, e.g. an extra newline at EOF
        # Split on the first tab only so the text itself may contain tabs.
        (index, text) = line.split('\t', 1)
        hashMap[int(index)] = text

    return (hashMap, sorted(hashMap))

In [8]:
# Load the four entity tables. Each call yields the id -> text dict and the
# ascending list of ids for that entity type.
authors, aIndices = definition('original/author.txt')
papers, pIndices = definition('original/paper.txt')
terms, tIndices = definition('original/term.txt')
venues, vIndices = definition('original/venue.txt')

In [11]:
def _indexLookup(ids, kind):
    """Build a constant-time stand-in for ``ids.index``.

    Returns a function that maps an id to the position of its first
    occurrence in ``ids``. Like ``list.index`` it raises ``ValueError`` for an
    unknown id; ``kind`` is only used to label that error message.
    """
    positions = {}
    for position, value in enumerate(ids):
        positions.setdefault(value, position)  # keep the first occurrence

    def lookup(value):
        if value not in positions:
            raise ValueError('%s ID %r is not in the index list' % (kind, value))
        return positions[value]

    return lookup


def formMatrices(inputFile, maxTermId=13575, venueIdRange=(42145, 42164)):
    """Read the relation file and build the author-author matrices.

    Each non-blank line of ``inputFile`` holds three tab-separated fields:
    a paper id, an item id and a third field that is ignored. The item id
    decides the relation type: ids ``<= maxTermId`` are terms, ids inside the
    inclusive ``venueIdRange`` are venues, everything else is an author.

    Row/column positions come from the module-level ``aIndices``,
    ``pIndices``, ``tIndices`` and ``vIndices`` lists, which must be defined
    before this function is called.

    Parameters
    ----------
    inputFile : str
        Path of the relation file.
    maxTermId : int, optional
        Largest item id that is treated as a term.
    venueIdRange : tuple of (int, int), optional
        Inclusive ``(low, high)`` range of item ids treated as venues.

    Returns
    -------
    tuple
        ``(APA, APTPA, APVPA, a2p, p2a, p2v, v2p, p2t, t2p)``: the sparse
        products ``AP*AP^T``, ``AP*PT*PT^T*AP^T`` and ``AP*PV*PV^T*AP^T``
        followed by six dicts mapping an id to the list of related ids
        (author->papers, paper->authors, paper->venues, venue->papers,
        paper->terms, term->papers).

    Raises
    ------
    ValueError
        If a line does not have exactly three fields, an id is not an
        integer, or an id is missing from the matching index list.
    """
    # Dict lookups replace the original list.index() calls, which rescanned
    # the whole index list for every single relation.
    paperPos = _indexLookup(pIndices, 'paper')
    authorPos = _indexLookup(aIndices, 'author')
    termPos = _indexLookup(tIndices, 'term')
    venuePos = _indexLookup(vIndices, 'venue')
    (venueLo, venueHi) = venueIdRange

    # COO coordinates for M_AP, M_PV and M_PT.
    ap_row, ap_col = [], []
    pv_row, pv_col = [], []
    pt_row, pt_col = [], []

    a2p, p2a = {}, {}
    p2v, v2p = {}, {}
    p2t, t2p = {}, {}

    with open(inputFile) as f:
        content = f.read().splitlines()

    for line in content:
        if not line.strip():
            continue  # tolerate blank lines
        (paper, item, _) = line.split('\t')
        paper = int(paper)
        item = int(item)

        if item <= maxTermId:  # term => update M_PT
            pt_row.append(paperPos(paper))
            pt_col.append(termPos(item))
            p2t.setdefault(paper, []).append(item)
            t2p.setdefault(item, []).append(paper)
        elif venueLo <= item <= venueHi:  # venue => update M_PV
            pv_row.append(paperPos(paper))
            pv_col.append(venuePos(item))
            p2v.setdefault(paper, []).append(item)
            v2p.setdefault(item, []).append(paper)
        else:  # author => update M_AP
            ap_row.append(authorPos(item))
            ap_col.append(paperPos(paper))
            p2a.setdefault(paper, []).append(item)
            a2p.setdefault(item, []).append(paper)

    # Give every matrix its full shape. When the shape is inferred from the
    # largest index seen, AP, PT and PV can disagree on the number of papers
    # (e.g. the last paper has an author but no term) and the products fail.
    nAuthors = len(aIndices)
    nPapers = len(pIndices)
    AP = csr_matrix((np.ones(len(ap_row)), (ap_row, ap_col)),
                    shape=(nAuthors, nPapers), dtype=np.float64)
    PT = csr_matrix((np.ones(len(pt_row)), (pt_row, pt_col)),
                    shape=(nPapers, len(tIndices)), dtype=np.float64)
    PV = csr_matrix((np.ones(len(pv_row)), (pv_row, pv_col)),
                    shape=(nPapers, len(vIndices)), dtype=np.float64)

    # AP*PT*PT^T*AP^T == (AP*PT) * (AP*PT)^T, so each half-product is
    # computed once and multiplied by its own transpose.
    APT = AP.dot(PT)
    APV = AP.dot(PV)

    return (AP.dot(AP.T),
            APT.dot(APT.T),
            APV.dot(APV.T),
            a2p, p2a, p2v, v2p, p2t, t2p)

In [12]:
# One pass over the relation file yields the three author-author matrices
# followed by the six id -> related-ids dicts.
(APA, APTPA, APVPA,
 A2P, P2A, P2V, V2P, P2T, T2P) = formMatrices('original/relation.txt')

In [9]:
def write2file(name, matrix):
    """Write the non-zero entries of ``matrix`` to ``name + '.txt'``.

    The file gets one line per non-zero entry, formatted as
    ``row<TAB>col<TAB>value``. Entries are written in the matrix's COO order,
    which is the order ``matrix.nonzero()`` reports for scipy sparse matrices.

    Parameters
    ----------
    name : str
        Output path without the ``.txt`` suffix.
    matrix : scipy sparse matrix (or dense 2-D array)
        Matrix to dump. Sparse input is assumed to be in canonical form
        (no duplicate entries), which is what scipy's constructors and
        products produce.
    """
    # Convert once and walk the coordinate arrays instead of doing one
    # matrix[r, c] lookup per entry, which is very slow on CSR matrices.
    if hasattr(matrix, 'tocoo'):
        coo = matrix.tocoo()
    else:
        coo = csr_matrix(matrix).tocoo()  # dense 2-D input

    keep = coo.data != 0  # skip explicitly stored zeros, as nonzero() does

    with open(name + '.txt', 'w') as f:
        for (r, c, v) in zip(coo.row[keep], coo.col[keep], coo.data[keep]):
            f.write('%s\t%s\t%s\n' % (r, c, v))

In [46]:
# Writes APTPA_.txt. APTPAhalf is built by the cell numbered In [22], which
# appears further down in this file and must have been run first.
write2file(name='APTPA_', matrix=APTPAhalf)

In [47]:
# Writes APVPA_.txt. APVPAhalf is built by the cell numbered In [29], which
# appears further down in this file and must have been run first.
write2file(name='APVPA_', matrix=APVPAhalf)

In [45]:
# Writes APA_.txt. APAhalf is built by the cell numbered In [20], which
# appears further down in this file and must have been run first.
write2file(name='APA_', matrix=APAhalf)

In [22]:
# Keep only the upper triangle (row <= col, diagonal included) of APTPA.
# formMatrices builds APTPA as AP*PT*PT^T*AP^T, i.e. a matrix times its own
# transpose, so it is symmetric and the dropped half is redundant.
APTPAcoo = APTPA.tocoo()

# Vectorised filter instead of one APTPA[r, c] lookup per entry; the
# `data != 0` term skips explicitly stored zeros like APTPA.nonzero() does.
APTPAkeep = (APTPAcoo.row <= APTPAcoo.col) & (APTPAcoo.data != 0)
APTPArows = APTPAcoo.row[APTPAkeep]
APTPAcols = APTPAcoo.col[APTPAkeep]
APTPAvals = APTPAcoo.data[APTPAkeep]

# Pass the shape explicitly so the half keeps APTPA's dimensions even when
# trailing rows/columns are empty (and so an empty selection still builds).
APTPAhalf = csr_matrix((APTPAvals, (APTPArows, APTPAcols)), shape=APTPA.shape)

In [20]:
# Keep only the lower triangle (row >= col, diagonal included) of APA.
# formMatrices builds APA as AP*AP^T, so it is symmetric and the dropped
# half is redundant.
# NOTE(review): the APTPA and APVPA cells keep the *upper* triangle
# (row <= col) instead -- confirm the orientation difference is intended.
APAcoo = APA.tocoo()

# Vectorised filter instead of one APA[r, c] lookup per entry; the
# `data != 0` term skips explicitly stored zeros like APA.nonzero() does.
APAkeep = (APAcoo.row >= APAcoo.col) & (APAcoo.data != 0)
APArows = APAcoo.row[APAkeep]
APAcols = APAcoo.col[APAkeep]
APAvals = APAcoo.data[APAkeep]

# Pass the shape explicitly so the half keeps APA's dimensions even when
# trailing rows/columns are empty (and so an empty selection still builds).
APAhalf = csr_matrix((APAvals, (APArows, APAcols)), shape=APA.shape)

In [29]:
# Keep only the upper triangle (row <= col, diagonal included) of APVPA.
# formMatrices builds APVPA as AP*PV*PV^T*AP^T, i.e. a matrix times its own
# transpose, so it is symmetric and the dropped half is redundant.
APVPAcoo = APVPA.tocoo()

# Vectorised filter instead of one APVPA[r, c] lookup per entry; the
# `data != 0` term skips explicitly stored zeros like APVPA.nonzero() does.
APVPAkeep = (APVPAcoo.row <= APVPAcoo.col) & (APVPAcoo.data != 0)
APVPArows = APVPAcoo.row[APVPAkeep]
APVPAcols = APVPAcoo.col[APVPAkeep]
APVPAvals = APVPAcoo.data[APVPAkeep]

# Pass the shape explicitly so the half keeps APVPA's dimensions even when
# trailing rows/columns are empty (and so an empty selection still builds).
APVPAhalf = csr_matrix((APVPAvals, (APVPArows, APVPAcols)), shape=APVPA.shape)

In [30]:
# Display the repr of the full matrix (shape, dtype, stored-element count)
# for comparison with the half matrix shown in the next cell.
APVPA

<5000x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 11243494 stored elements in Compressed Sparse Row format>

In [31]:
# Display the repr of the triangular half; its stored-element count should be
# roughly half that of the full APVPA matrix.
APVPAhalf

<5000x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 5624247 stored elements in Compressed Sparse Row format>

In [26]:
# Persist every id list and id -> text table as <name>.json.
# NOTE: JSON object keys are always strings, so the integer keys of the
# dicts (authors, papers, terms, venues) come back as str when reloaded.
for (stem, payload) in [
        ('aIndices', aIndices),
        ('authors', authors),
        ('pIndices', pIndices),
        ('papers', papers),
        ('tIndices', tIndices),
        ('terms', terms),
        ('vIndices', vIndices),
        ('venues', venues),
]:
    # `with` closes (and therefore flushes) each file deterministically;
    # the bare open(...) handles used before were never closed explicitly.
    with open(stem + '.json', 'w') as out:
        json.dump(payload, out)

In [27]:
# Persist the six id -> related-ids dicts as <name>.json.
# NOTE: JSON object keys are always strings, so the integer ids used as dict
# keys come back as str when these files are reloaded (the ids inside the
# value lists stay integers).
for (stem, payload) in [
        ('A2P', A2P),
        ('P2A', P2A),
        ('P2V', P2V),
        ('V2P', V2P),
        ('P2T', P2T),
        ('T2P', T2P),
]:
    # `with` closes (and therefore flushes) each file deterministically;
    # the bare open(...) handles used before were never closed explicitly.
    with open(stem + '.json', 'w') as out:
        json.dump(payload, out)