In [80]:
%matplotlib notebook
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Load candidate edge scores.

In [81]:
# Candidate edge scores: read columns 0, 1, 2, 7 and 8 of the tab-separated file.
# header=0 consumes the file's own header row; `names` supplies the labels used
# throughout the rest of the notebook.
candEdgesScores = pd.read_csv(
    "edgesScoresRandom.txt",
    sep='\t',
    header=0,
    usecols=[0, 1, 2, 7, 8],
    names=['source', 'target', 'node2vecScore', 'gainScore', 'expGainScore'],
)

In [82]:
# Candidate edge distances: tab-separated table whose own header row provides the
# column names (the join below relies on it having 'source' and 'target').
candEdgesDist = pd.read_csv(
    "edgesDistancesRandomSources.txt",
    sep="\t",
)

In [83]:
# Attach the columns of candEdgesDist to every candidate edge, matching rows on
# the (source, target) pair. DataFrame.join is a left join, so every row of
# candEdgesScores is kept.
# NOTE: this cell overwrites candEdgesScores with the joined frame, so re-running
# it on its own joins onto an already-joined frame; re-run the load cell first.
edgeKey = ['source', 'target']
distancesByEdge = candEdgesDist.set_index(edgeKey)
candEdgesScores = candEdgesScores.join(distancesByEdge, on=edgeKey)

### Describe candidate edges.

In [115]:
#candEdgesScores.describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

In [85]:
#candEdgesScores.describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])['distance']

### Load nodes' quality characteristics.

In [86]:
# Per-node feature table (tab-separated, column names taken from the file header).
# Later cells read its 'nodeId', 'inDegree', 'outDegree', 'pagerank',
# 'redPagerank', 'redNeighborsOutRatio' and 'redNeighborsInRatio' columns.
nodes = pd.read_csv(
    'nodeQualityFeatures.txt',
    sep='\t',
)

### Describe nodes.

In [109]:
#nodes.describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

### Load selected edges for the Node2vec, Gain, ExpGain and Random policies.

In [88]:
policies = ['Node2vec', "Gain", 'ExpGain', 'Random']


def _readSelectedEdges(policyName):
    """Return the edges selected by one policy, with their distance columns attached.

    Reads the policy's score file (columns 0, 1, 2, 7, 8, relabelled) and its
    distance file, then left-joins the distances onto the scores using the
    (source, target) pair as key.
    """
    scores = pd.read_csv(
        "edgesSelectedBy%sRandomSources.txt" % policyName,
        sep='\t',
        header=0,
        usecols=[0, 1, 2, 7, 8],
        names=['source', 'target', 'node2vecScore', 'gainScore', 'expGainScore'],
    )
    distances = pd.read_csv("selectedEdgesBy%sRandomSourcesDistances.txt" % policyName, sep='\t')
    return scores.join(distances.set_index(['source', 'target']), on=['source', 'target'])


# One joined frame per policy, keyed by policy name.
selectedEdges = {policy: _readSelectedEdges(policy) for policy in policies}

### Drop outliers: keep only edges whose target is selected more often than a per-policy threshold.

#### For node2vec

In [90]:
# Number of Node2vec-selected edges pointing at each target.
# value_counts() sorts the counts in descending order, so position on the x axis
# is the target's rank by selection frequency.
temp = selectedEdges['Node2vec']['target'].value_counts()
temp1 = temp.to_numpy()

# Draw on a fresh, explicit figure so the curve cannot land on a figure left
# active by an earlier cell, and label it so the plot stands on its own.
fig, ax = plt.subplots()
# Dashed red line marks the cut-off (100) used by the filter cell that follows.
ax.axhline(y=100, color='r', linestyle='--', alpha=0.4, linewidth=0.6)
ax.plot(np.arange(temp1.size), temp1)
ax.set(
    title="Node2vec: selected edges per target",
    xlabel="target rank (most selected first)",
    ylabel="number of selected edges",
);

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x20874348160>]

In [91]:
# Keep only the Node2vec edges whose target is selected more than 100 times.
# The counts are recomputed here instead of being read from the scratch variable
# `temp`, which the Gain and ExpGain sections reassign; the result therefore does
# not depend on which counting cell ran last. Re-running this cell is a no-op,
# because every surviving target keeps all of its edges.
node2vecEdges = selectedEdges['Node2vec']
node2vecTargetCounts = node2vecEdges['target'].value_counts()
node2vecKeptTargets = node2vecTargetCounts[node2vecTargetCounts > 100].index
selectedEdges['Node2vec'] = node2vecEdges[node2vecEdges['target'].isin(node2vecKeptTargets)]

#### For maximum gain.

In [92]:
# Number of Gain-selected edges pointing at each target.
# value_counts() sorts the counts in descending order, so position on the x axis
# is the target's rank by selection frequency.
temp = selectedEdges['Gain']['target'].value_counts()
temp1 = temp.to_numpy()

# Draw on a fresh, explicit figure so the curve cannot land on a figure left
# active by an earlier cell, and label it so the plot stands on its own.
fig, ax = plt.subplots()
# Dashed red line marks the cut-off (100) used by the filter cell that follows.
ax.axhline(y=100, color='r', linestyle='--', alpha=0.4, linewidth=0.6)
ax.plot(np.arange(temp1.size), temp1)
ax.set(
    title="Gain: selected edges per target",
    xlabel="target rank (most selected first)",
    ylabel="number of selected edges",
);

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x208743a9bb0>]

In [93]:
# Keep only the Gain edges whose target is selected more than 100 times.
# The counts are recomputed here instead of being read from the scratch variable
# `temp`, which the Node2vec and ExpGain sections also assign; the result
# therefore does not depend on which counting cell ran last. Re-running this cell
# is a no-op, because every surviving target keeps all of its edges.
gainEdges = selectedEdges['Gain']
gainTargetCounts = gainEdges['target'].value_counts()
gainKeptTargets = gainTargetCounts[gainTargetCounts > 100].index
selectedEdges['Gain'] = gainEdges[gainEdges['target'].isin(gainKeptTargets)]

#### For maximum expected gain.

In [94]:
# Number of ExpGain-selected edges pointing at each target.
# value_counts() sorts the counts in descending order, so position on the x axis
# is the target's rank by selection frequency.
temp = selectedEdges['ExpGain']['target'].value_counts()
temp1 = temp.to_numpy()

# Draw on a fresh, explicit figure so the curve cannot land on a figure left
# active by an earlier cell, and label it so the plot stands on its own.
fig, ax = plt.subplots()
# Dashed red line marks the cut-off (10, not 100) used by the filter cell that follows.
ax.axhline(y=10, color='r', linestyle='--', alpha=0.4, linewidth=0.6)
ax.plot(np.arange(temp1.size), temp1)
ax.set(
    title="ExpGain: selected edges per target",
    xlabel="target rank (most selected first)",
    ylabel="number of selected edges",
);

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x2087440b520>]

In [95]:
# Keep only the ExpGain edges whose target is selected more than 10 times
# (this policy uses a cut-off of 10, whereas Node2vec and Gain use 100).
# The counts are recomputed here instead of being read from the scratch variable
# `temp`, which the Node2vec and Gain sections also assign; the result therefore
# does not depend on which counting cell ran last. Re-running this cell is a
# no-op, because every surviving target keeps all of its edges.
expGainEdges = selectedEdges['ExpGain']
expGainTargetCounts = expGainEdges['target'].value_counts()
expGainKeptTargets = expGainTargetCounts[expGainTargetCounts > 10].index
selectedEdges['ExpGain'] = expGainEdges[expGainEdges['target'].isin(expGainKeptTargets)]

### Describe selected edges.

#### Node2vec.

In [96]:
#selectedEdges['Node2vec'].describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

#### Gain.

In [97]:
#selectedEdges['Gain'].describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

#### Expected gain.

In [98]:
#selectedEdges['ExpGain'].describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

#### Random.

In [99]:
#selectedEdges['Random'].describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

### Describe target nodes.

In [100]:
# For each policy, the rows of `nodes` whose nodeId appears as the target of at
# least one of that policy's selected edges (one row per distinct node in `nodes`).
targetNodes = {
    policy: nodes[nodes['nodeId'].isin(selectedEdges[policy]['target'])]
    for policy in policies
}

#### Node2vec.

In [101]:
#targetNodes['Node2vec'].describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

#### Gain.

In [102]:
#targetNodes['Gain'].describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

#### Expected Gain.

In [103]:
#targetNodes['ExpGain'].describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

In [104]:
#targetNodes['Random'].describe(percentiles= [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

### Results.

In [111]:
print("----------------Distances---------------:")
# Side-by-side decile summary of the 'distance' column for the candidate edges
# and for the edges selected by each policy.
# Only the 'distance' column is described; the earlier version described every
# column of each frame and then discarded all but one.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
distanceFrames = {
    'candidate': candEdgesScores,  # lowercase label kept so existing column lookups still work
    'Random': selectedEdges['Random'],
    'Node2vec': selectedEdges['Node2vec'],
    'Gain': selectedEdges['Gain'],
    'ExpGain': selectedEdges['ExpGain'],
}
edgeDistances = pd.DataFrame({
    label: frame['distance'].describe(percentiles=deciles)
    for label, frame in distanceFrames.items()
})
edgeDistances

----------------Distances---------------:


Unnamed: 0,candidate,Random,Node2vec,Gain,ExpGain
count,147495.0,1220.0,1140.0,1192.0,1129.0
mean,2.81735,2.809016,2.313158,3.745805,2.644818
std,0.761287,0.814401,0.62097,0.940719,0.726019
min,1.0,1.0,1.0,2.0,1.0
10%,2.0,2.0,2.0,3.0,2.0
20%,2.0,2.0,2.0,3.0,2.0
30%,2.0,2.0,2.0,3.0,2.0
40%,3.0,3.0,2.0,3.0,2.0
50%,3.0,3.0,2.0,4.0,3.0
60%,3.0,3.0,2.0,4.0,3.0


In [114]:
print("----------------Gain Score---------------:")

# Side-by-side decile summary of the 'gainScore' column for the candidate edges
# and for the edges selected by each policy.
# Only the 'gainScore' column is described; the earlier version described every
# column of each frame and then discarded all but one.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
gainFrames = {
    'candidate': candEdgesScores,  # lowercase label kept so existing column lookups still work
    'Random': selectedEdges['Random'],
    'Node2vec': selectedEdges['Node2vec'],
    'Gain': selectedEdges['Gain'],
    'ExpGain': selectedEdges['ExpGain'],
}
edgeGain = pd.DataFrame({
    label: frame['gainScore'].describe(percentiles=deciles)
    for label, frame in gainFrames.items()
})
edgeGain

----------------Gain Score---------------:


Unnamed: 0,candidate,Random,Node2vec,Gain,ExpGain
count,147495.0,1220.0,1140.0,1192.0,1129.0
mean,2.9e-05,2.4e-05,1.9e-05,0.00045,0.000352
std,0.00055,0.000545,0.00046,0.000869,0.000661
min,-0.006126,-0.003618,-0.003217,2e-06,1e-06
10%,-0.000237,-0.000236,-0.000154,2.3e-05,1.9e-05
20%,-7.1e-05,-6.6e-05,-4.8e-05,4e-05,3.2e-05
30%,-2.8e-05,-2.7e-05,-2.4e-05,6.6e-05,4.8e-05
40%,-6e-06,-5e-06,-7e-06,8.4e-05,7.2e-05
50%,2e-06,2e-06,2e-06,0.000202,0.000138
60%,2e-05,1.8e-05,1.5e-05,0.000313,0.000236


In [120]:
print("----------------In Degree---------------:")

# Side-by-side decile summary of 'inDegree'. The 'Candidate' column is computed
# over the full `nodes` table; the other columns over each policy's target nodes.
# Only the 'inDegree' column is described; the earlier version described every
# column of each frame and then discarded all but one.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
inDegreeFrames = {
    'Candidate': nodes,
    'Random': targetNodes['Random'],
    'Node2vec': targetNodes['Node2vec'],
    'Gain': targetNodes['Gain'],
    'ExpGain': targetNodes['ExpGain'],
}
edgeInDegrees = pd.DataFrame({
    label: frame['inDegree'].describe(percentiles=deciles)
    for label, frame in inDegreeFrames.items()
})
edgeInDegrees

----------------In Degree---------------:


Unnamed: 0,Candidate,Random,Node2vec,Gain,ExpGain
count,1222.0,765.0,10.0,10.0,18.0
mean,13.680033,13.588235,88.7,1.7,35.222222
std,22.308387,21.517717,59.159014,4.029061,31.266512
min,0.0,0.0,0.0,0.0,0.0
10%,0.0,0.0,38.7,0.0,0.0
20%,1.0,1.0,48.6,0.0,0.8
30%,2.0,2.0,61.2,0.0,5.8
40%,3.0,3.0,73.8,0.0,17.0
50%,5.0,5.0,83.5,0.0,28.5
60%,8.0,9.0,90.4,0.4,56.0


In [124]:
print("----------------Out Degree---------------:")

# Side-by-side decile summary of 'outDegree'. The 'Candidate' column is computed
# over the full `nodes` table; the other columns over each policy's target nodes.
# Only the 'outDegree' column is described; the earlier version described every
# column of each frame and then discarded all but one.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
outDegreeFrames = {
    'Candidate': nodes,
    'Random': targetNodes['Random'],
    'Node2vec': targetNodes['Node2vec'],
    'Gain': targetNodes['Gain'],
    'ExpGain': targetNodes['ExpGain'],
}
edgeOutDegrees = pd.DataFrame({
    label: frame['outDegree'].describe(percentiles=deciles)
    for label, frame in outDegreeFrames.items()
})
edgeOutDegrees

----------------Out Degree---------------:


Unnamed: 0,Candidate,Random,Node2vec,Gain,ExpGain
count,1222.0,765.0,10.0,10.0,18.0
mean,13.680033,14.184314,76.9,1.3,41.222222
std,23.916828,26.18013,74.640546,0.483046,69.930685
min,0.0,0.0,8.0,1.0,1.0
10%,0.0,0.0,23.3,1.0,1.0
20%,1.0,1.0,25.8,1.0,1.0
30%,1.0,1.0,33.7,1.0,2.0
40%,2.0,2.0,44.8,1.0,2.0
50%,4.0,4.0,60.0,1.0,17.5
60%,7.0,7.4,70.4,1.0,22.0


In [121]:
print("----------------Pagerank---------------:")

# Side-by-side decile summary of 'pagerank'. The 'Candidate' column is computed
# over the full `nodes` table; the other columns over each policy's target nodes.
# Only the 'pagerank' column is described; the earlier version described every
# column of each frame and then discarded all but one.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
pagerankFrames = {
    'Candidate': nodes,
    'Random': targetNodes['Random'],
    'Node2vec': targetNodes['Node2vec'],
    'Gain': targetNodes['Gain'],
    'ExpGain': targetNodes['ExpGain'],
}
targetPagerank = pd.DataFrame({
    label: frame['pagerank'].describe(percentiles=deciles)
    for label, frame in pagerankFrames.items()
})
targetPagerank

----------------Pagerank---------------:


Unnamed: 0,Candidate,Random,Node2vec,Gain,ExpGain
count,1222.0,765.0,10.0,10.0,18.0
mean,0.000818,0.000791,0.004645,0.000284,0.001404
std,0.002225,0.002062,0.002701,0.000106,0.001404
min,0.000243,0.000243,0.000243,0.000243,0.000243
10%,0.000243,0.000243,0.001864,0.000243,0.000243
20%,0.00025,0.00025,0.002491,0.000243,0.00025
30%,0.000265,0.000266,0.003569,0.000243,0.000534
40%,0.000289,0.000292,0.004383,0.000243,0.000838
50%,0.000331,0.000331,0.004722,0.000243,0.001271
60%,0.000404,0.000402,0.005143,0.000249,0.001379


In [122]:
print("----------------Red Pagerank---------------:")

# Side-by-side decile summary of 'redPagerank'. The 'Candidate' column is computed
# over the full `nodes` table; the other columns over each policy's target nodes.
# Only the 'redPagerank' column is described; the earlier version described every
# column of each frame and then discarded all but one.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
redPagerankFrames = {
    'Candidate': nodes,
    'Random': targetNodes['Random'],
    'Node2vec': targetNodes['Node2vec'],
    'Gain': targetNodes['Gain'],
    'ExpGain': targetNodes['ExpGain'],
}
targetRedPagerank = pd.DataFrame({
    label: frame['redPagerank'].describe(percentiles=deciles)
    for label, frame in redPagerankFrames.items()
})
targetRedPagerank

----------------Red Pagerank---------------:


Unnamed: 0,Candidate,Random,Node2vec,Gain,ExpGain
count,1222.0,765.0,10.0,10.0,18.0
mean,0.332798,0.328954,0.323023,0.622608,0.561188
std,0.164824,0.164405,0.14048,0.00646,0.047647
min,0.0,0.0,0.161032,0.615104,0.490224
10%,0.149289,0.149344,0.163151,0.618413,0.51174
20%,0.158458,0.158405,0.169762,0.619517,0.519254
30%,0.172811,0.172847,0.263759,0.619806,0.52413
40%,0.204379,0.204379,0.305237,0.620531,0.530376
50%,0.282878,0.282878,0.313908,0.620985,0.555863
60%,0.432878,0.423129,0.323959,0.621168,0.570028


In [123]:
print("----------------Red Neighbors Out Ratio---------------:")

# Side-by-side decile summary of 'redNeighborsOutRatio'. The 'Candidate' column is
# computed over the full `nodes` table; the other columns over each policy's
# target nodes.
# Only that one column is described; the earlier version described every column
# of each frame and then discarded all but one.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
redOutRatioFrames = {
    'Candidate': nodes,
    'Random': targetNodes['Random'],
    'Node2vec': targetNodes['Node2vec'],
    'Gain': targetNodes['Gain'],
    'ExpGain': targetNodes['ExpGain'],
}
targetRedOutRatio = pd.DataFrame({
    label: frame['redNeighborsOutRatio'].describe(percentiles=deciles)
    for label, frame in redOutRatioFrames.items()
})
targetRedOutRatio

----------------Red Neighbors Out Ratio---------------:


Unnamed: 0,Candidate,Random,Node2vec,Gain,ExpGain
count,1222.0,765.0,10.0,10.0,18.0
mean,0.432953,0.427127,0.277833,1.0,0.952916
std,0.419469,0.418462,0.378486,0.0,0.057653
min,0.0,0.0,0.0,1.0,0.834951
10%,0.0,0.0,0.0,1.0,0.857509
20%,0.0,0.0,0.003101,1.0,0.909231
30%,0.0,0.0,0.02021,1.0,0.938988
40%,0.02451,0.020343,0.034261,1.0,0.956191
50%,0.5,0.5,0.09948,1.0,0.970612
60%,0.5,0.5,0.171676,1.0,1.0


In [126]:
print("----------------Red Neighbors In Ratio---------------:")

# Side-by-side decile summary of 'redNeighborsInRatio'. The 'Candidate' column is
# computed over the full `nodes` table; the other columns over each policy's
# target nodes.
# Only that one column is described; the earlier version described every column
# of each frame and then discarded all but one.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
redInRatioFrames = {
    'Candidate': nodes,
    'Random': targetNodes['Random'],
    'Node2vec': targetNodes['Node2vec'],
    'Gain': targetNodes['Gain'],
    'ExpGain': targetNodes['ExpGain'],
}
targetRedInRatio = pd.DataFrame({
    label: frame['redNeighborsInRatio'].describe(percentiles=deciles)
    for label, frame in redInRatioFrames.items()
})
targetRedInRatio

----------------Red Neighbors In Ratio---------------:


Unnamed: 0,Candidate,Random,Node2vec,Gain,ExpGain
count,1222.0,765.0,10.0,10.0,18.0
mean,0.518632,0.510423,0.709222,0.7,0.883832
std,0.425197,0.427298,0.338252,0.258199,0.211346
min,0.0,0.0,0.069767,0.5,0.5
10%,0.0,0.0,0.297602,0.5,0.5
20%,0.0,0.0,0.448583,0.5,0.687097
30%,0.094277,0.077225,0.494,0.5,0.975364
40%,0.333333,0.285714,0.7,0.5,0.984356
50%,0.5,0.5,0.899621,0.5,1.0
60%,0.746774,0.666667,0.967641,0.7,1.0


### Unique targets by policy.

In [135]:
# Policies compared in this section; each name is substituted into the distance
# file name below, so the spellings must match the files on disk.
policiesB = [
    'Node2vec', 'ResourceAllocation', 'PreferencialAttachment', 'JaccardCoefficient',
    'Gain', 'ExpGain', 'AdamicAdar', 'Random',
]

# Selected-edge distance table per policy (tab-separated, header row from the file).
selEdgesB = {
    policy: pd.read_csv("selectedEdgesBy%sRandomSourcesDistances.txt" % policy, sep='\t')
    for policy in policiesB
}

# Number of distinct 'target' values among each policy's selected edges.
uniqueNodes = {policy: edges['target'].nunique() for policy, edges in selEdgesB.items()}
uniqueNodes

{'Node2vec': 15,
 'ResourceAllocation': 351,
 'PreferencialAttachment': 17,
 'JaccardCoefficient': 662,
 'Gain': 20,
 'ExpGain': 40,
 'AdamicAdar': 315,
 'Random': 765}