In [None]:
# Import Standard Libraries

import csv
import pandas as pd
import numpy as np

from scipy.sparse import coo_matrix


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the raw CSV read
# later in this notebook is addressed through that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw CSV and expose four of its columns as index-aligned NumPy arrays.

RAW_CSV_PATH = '/content/drive/MyDrive/SAHA PROJECT - connectedpapers zip files/SimilarityCodes/1.RawData/sparse_mat.csv'

df = pd.read_csv(RAW_CSV_PATH)

# One array per column, in the same row order as the CSV:
#   source  <- 'parent'     (used later as sparse-matrix row labels)
#   dest    <- 'child'      (used later as sparse-matrix column labels)
#   wt      <- 'ref_int'    (used later as the matrix values)
#   is_base <- 'base_flag'
source, dest, wt, is_base = (
    np.array(df[column]) for column in ('parent', 'child', 'ref_int', 'base_flag')
)

# For a quick look while developing, print a short slice of any array,
# e.g. print(source[:15]) or print(dest[0][-2:]).



In [None]:
# Create mappings (both ways) between the strings in the 'parent' / 'child'
# columns and contiguous integer node indices.

NodesWithDups = np.concatenate((source, dest), axis=0)
# np.unique returns the distinct values in sorted order, so position i in
# AllNodes is the index of that node; no further sorted(set(...)) pass is needed.
AllNodes = np.unique(NodesWithDups)

num_nodes = len(AllNodes)
print("Number of distinct nodes:", num_nodes)


# index -> string, then invert it for string -> index (built from one enumeration
# so the two dictionaries cannot disagree).
NodeToString = dict(enumerate(AllNodes))
StringToNode = {name: idx for idx, name in NodeToString.items()}

# Every distinct string must own exactly one index.
assert len(StringToNode) == len(NodeToString) == num_nodes, "node mappings are not one-to-one"

print("\nChecking the mappings:")

# Spot-check a few identifiers (string -> index) ...
print(StringToNode.get('000080c0dc8267f4b588d8c52c5157498e05f9de'))
print(StringToNode.get('6904f4bcd9572918f8c618c38f0915f135322d9e'))
print(StringToNode.get('c5e2d66939c2df6e4ef7f2bec4d9879e9b2151b2'))
print(StringToNode.get('1'))

# ... and the reverse direction (index -> string); these indices are the ones
# the lookups above returned in the recorded run.
print(NodeToString.get(0))
print(NodeToString.get(16844))
print(NodeToString.get(31306))
print(NodeToString.get(2329))

Number of distinct nodes: 40304

Checking the mappings:
0
16844
31306
2329
000080c0dc8267f4b588d8c52c5157498e05f9de
6904f4bcd9572918f8c618c38f0915f135322d9e
c5e2d66939c2df6e4ef7f2bec4d9879e9b2151b2
1


In [None]:
# Translate the string endpoints into integer indices and cast the weights to
# float64, giving the (data, (rows, cols)) triplets of the sparse matrix.

rows = np.array(list(map(StringToNode.__getitem__, source)))
cols = np.array(list(map(StringToNode.__getitem__, dest)))
data = wt.astype(np.float64)

# Preview the first ten entries of each array.
for preview in (rows, cols, data):
    print(preview[:10])

[2329 2329 2329 2329 2329 2329 2329 2329 2329 2329]
[39134 19586  8425 29791 10553 20528  2464 31306 36467 13377]
[0.07692308 0.07692308 0.07692308 0.         0.07692308 0.15384615
 0.         0.         0.         0.07692308]


In [None]:
# Compute the similarities between base articles and their "derivatives of derivatives"

# One-hop weights as a sparse num_nodes x num_nodes matrix.
OneHopSim = coo_matrix((data, (rows, cols)), shape=(num_nodes, num_nodes))
# Matrix product (not element-wise): entry (i, j) sums w(i, k) * w(k, j) over
# all intermediate nodes k. `@` is used because `*` means element-wise
# multiplication on the newer sparse *array* types.
TwoHopSim = OneHopSim @ OneHopSim
Sims = OneHopSim + TwoHopSim

# rows, columns and similarities of the final sparse similarity matrix.
# All three arrays are taken from one COO view and filtered with one mask so
# they are aligned by construction. (Sims.nonzero() skips stored zeros while
# Sims.data keeps them, so pairing those two is only correct when the matrix
# happens to store no explicit zero.)
SimsCoo = Sims.tocoo()
stored_nonzero = SimsCoo.data != 0
row = SimsCoo.row[stored_nonzero]
col = SimsCoo.col[stored_nonzero]
wt_new = SimsCoo.data[stored_nonzero]

# obtaining string representation of nodes
ancestor = np.array([NodeToString[x] for x in row])
descendant = np.array([NodeToString[x] for x in col])
similarity = np.array(wt_new, dtype=np.float64)

print(ancestor[:5])
print(descendant[:5])
print(similarity[:5], "\n")


# Saving the final similarities
# 3 x N string array (the similarities are rendered as text); kept for inspection.
Final = np.asarray([ancestor, descendant, similarity.astype(str)])
# print(Final)

FinalData = {'ancestor': ancestor, 'descendant': descendant, 'similarity': similarity}
# NOTE(review): this rebinds `df`, which held the raw input frame loaded earlier.
df = pd.DataFrame(FinalData)

# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory (unlike the absolute Drive path used for the input) --
# confirm the file lands where intended.
df.to_csv("../2.ProcessedData/Similarity.csv")

['000080c0dc8267f4b588d8c52c5157498e05f9de'
 '000080c0dc8267f4b588d8c52c5157498e05f9de'
 '000080c0dc8267f4b588d8c52c5157498e05f9de'
 '000080c0dc8267f4b588d8c52c5157498e05f9de'
 '000080c0dc8267f4b588d8c52c5157498e05f9de']
['1678789a14ef705a40950d3cb1ce060c4fba8cdb'
 '174f4d775bbe08dc3e61164605292c6c09bce0d4'
 '3e7b41d1a042c293f68192b6d149228415387722'
 '9d799c24bd6a0489c77fa4a1299baae79b8e21d8'
 'bf302b7a3c1ad43a4bc8e4d19724fc6b150b75d9']
[0.07220666 0.06169203 0.01398755 0.01131928 0.03479144] 



In [None]:
# Sanity check: run the same pipeline on a 5-node chain
# (zero -> one -> two -> three -> four) whose result can be verified by hand.

# libraries
import csv
import os
import pandas as pd
import numpy as np

from scipy.sparse import coo_matrix


# dummy string-node mapping
StringToNode = {'zero': 0, 'one': 1,'two': 2, 'three': 3, 'four': 4}
NodeToString = {0:'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four'}

# dummy data
source = np.array(['zero', 'one','two', 'three'])
dest = np.array(['one','two', 'three', 'four'])
wt = np.array([1., 2., 3., 4.], dtype=np.float64)
num_nodes = 5

# create sparse matrices for computation
rows = np.array([StringToNode[x] for x in source])
cols = np.array([StringToNode[x] for x in dest])
data = np.array(wt, dtype=np.float64)


# similarity computations
OneHopSim = coo_matrix((data, (rows, cols)), shape=(num_nodes, num_nodes))
# Matrix product (not element-wise): two-hop weight = product of the two edge weights.
TwoHopSim = OneHopSim @ OneHopSim
Sims = OneHopSim + TwoHopSim

# Hand-computed result: the one-hop weights sit on the first super-diagonal
# (1, 2, 3, 4) and the two-hop products on the second (1*2, 2*3, 3*4).
ExpectedSims = np.array([[0., 1., 2., 0., 0.],
                         [0., 0., 2., 6., 0.],
                         [0., 0., 0., 3., 12.],
                         [0., 0., 0., 0., 4.],
                         [0., 0., 0., 0., 0.]])
assert np.array_equal(Sims.toarray(), ExpectedSims), "similarity matrix differs from the hand-computed result"

# rows, columns and similarities of the final sparse similarity matrix.
# All three arrays come from one COO view filtered with one mask, so they are
# aligned by construction. (Sims.nonzero() skips stored zeros while Sims.data
# keeps them, so pairing those two is only correct when no explicit zero is stored.)
SimsCoo = Sims.tocoo()
stored_nonzero = SimsCoo.data != 0
row = SimsCoo.row[stored_nonzero]
col = SimsCoo.col[stored_nonzero]
wt_new = SimsCoo.data[stored_nonzero]

print(row)
print(col)
print(wt_new,"\n")
print(Sims.toarray(),"\n")

# Each extracted (row, col, value) triplet must match the dense matrix entry.
assert np.array_equal(ExpectedSims[row, col], wt_new), "indices and values are misaligned"


# obtaining string representation of nodes
ancestor = np.array([NodeToString[x] for x in row])
descendant = np.array([NodeToString[x] for x in col])
similarity = np.array(wt_new, dtype=np.float64)

print(ancestor)
print(descendant)
print(similarity, "\n")


# Saving the final similarities
# 3 x N string array (the similarities are rendered as text).
Final = np.asarray([ancestor, descendant, similarity.astype(str)])
print(Final,"\n")

FinalData = {'ancestor': ancestor, 'descendant': descendant, 'similarity': similarity}
df = pd.DataFrame(FinalData)
print(df)

# Create the output folder if it is missing so the sanity check can run from a
# fresh checkout; to_csv does not create directories itself.
dummy_csv_path = "../2.ProcessedData/Similarity_dummy.csv"
os.makedirs(os.path.dirname(dummy_csv_path), exist_ok=True)
df.to_csv(dummy_csv_path)

[0 0 1 1 2 2 3]
[1 2 2 3 3 4 4]
[ 1.  2.  2.  6.  3. 12.  4.] 

[[ 0.  1.  2.  0.  0.]
 [ 0.  0.  2.  6.  0.]
 [ 0.  0.  0.  3. 12.]
 [ 0.  0.  0.  0.  4.]
 [ 0.  0.  0.  0.  0.]] 

['zero' 'zero' 'one' 'one' 'two' 'two' 'three']
['one' 'two' 'two' 'three' 'three' 'four' 'four']
[ 1.  2.  2.  6.  3. 12.  4.] 

[['zero' 'zero' 'one' 'one' 'two' 'two' 'three']
 ['one' 'two' 'two' 'three' 'three' 'four' 'four']
 ['1.0' '2.0' '2.0' '6.0' '3.0' '12.0' '4.0']] 

  ancestor descendant  similarity
0     zero        one         1.0
1     zero        two         2.0
2      one        two         2.0
3      one      three         6.0
4      two      three         3.0
5      two       four        12.0
6    three       four         4.0
