In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Red ratio by different policies - Random source nodes.

In [2]:
# Names of every policy that is compared. Each name is substituted verbatim
# into the result file names, so the spelling must match the files on disk.
policies = [
    'Node2vec',
    'ResourceAllocation',
    'PreferencialAttachment',
    'JaccardCoefficient',
    'Gain',
    'ExpGain',
    'AdamicAdar',
    'Random',
]

In [3]:
# Subset of policies used for the average acceptance probability plots
# and for the selected-edge files.
policiesB = [
    'Node2vec',
    'Gain',
    'ExpGain',
    'Random',
]

In [4]:
# Red ratio per epoch, keyed by policy name (random source nodes).
redRatio = {
    policy: np.loadtxt('redRatioBy%sRandomSources.txt' % policy)
    for policy in policies
}

In [50]:
# One red-ratio curve per policy, epochs on the x axis (random source nodes).
fig = plt.figure(figsize=(12, 8))
fig.suptitle('Red Ratio By Policy - Random Source Nodes.', fontsize=16)
plt.xlabel('Epoch', fontsize=16)
plt.ylabel('Red Ratio', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The Node2vec series fixes the number of epochs used for every curve.
xAxis = np.arange(len(redRatio['Node2vec']))
# Colours C1, C2, ... are assigned in the order of `policies`.
for i, policy in enumerate(policies, start=1):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("redRatioByPolicyRandomSourceNodesBlogs.pdf")
plt.savefig("redRatioByPolicyRandomSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Red ratio by different policies - Best red source nodes.

In [51]:
# Red ratio per epoch, keyed by policy name (red source nodes).
redRatio = {
    policy: np.loadtxt('redRatioBy%sRedSources.txt' % policy)
    for policy in policies
}

In [52]:
# One red-ratio curve per policy, epochs on the x axis (red source nodes).
fig = plt.figure(figsize=(12, 8))
fig.suptitle('Red Ratio By Policy - Red Source Nodes.', fontsize=16)
plt.xlabel('Epoch', fontsize=16)
plt.ylabel('Red Ratio', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The Node2vec series fixes the number of epochs used for every curve.
xAxis = np.arange(len(redRatio['Node2vec']))
# Colours C1, C2, ... are assigned in the order of `policies`.
for i, policy in enumerate(policies, start=1):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("redRatioByPolicyRedSourceNodesBlogs.pdf")
plt.savefig("redRatioByPolicyRedSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Red ratio by different policies - Best blue source nodes.

In [53]:
# Red ratio per epoch, keyed by policy name (blue source nodes).
redRatio = {
    policy: np.loadtxt('redRatioBy%sBlueSources.txt' % policy)
    for policy in policies
}

In [54]:
# One red-ratio curve per policy, epochs on the x axis (blue source nodes).
fig = plt.figure(figsize=(12, 8))
fig.suptitle('Red Ratio By Policy - Blue Source Nodes.', fontsize=16)
plt.xlabel('Epoch', fontsize=16)
plt.ylabel('Red Ratio', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The Node2vec series fixes the number of epochs used for every curve.
xAxis = np.arange(len(redRatio['Node2vec']))
# Colours C1, C2, ... are assigned in the order of `policies`.
for i, policy in enumerate(policies, start=1):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("redRatioByPolicyBlueSourceNodesBlogs.pdf")
plt.savefig("redRatioByPolicyBlueSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Average acceptance probability (node2vec recommendation) by different policies - Random source nodes.

In [55]:
# Load the per-epoch series of the node2vecBy*RandomSources files for each
# policy in policiesB (plotted below as average acceptance probability).
# The dict keeps the name `redRatio` because the plotting cell reads it.
redRatio = {
    policy: np.loadtxt('node2vecBy%sRandomSources.txt' % policy)
    for policy in policiesB
}

In [56]:
# One acceptance-probability curve per policy in policiesB (random source nodes).
# The series are read from the dict named `redRatio`.
fig = plt.figure(figsize=(12, 8))
fig.suptitle('Average Acceptance Probability By Policy - Random Source Nodes.', fontsize=16)
plt.xlabel('Epoch', fontsize=16)
plt.ylabel('Acceptance Probability', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The Node2vec series fixes the number of epochs used for every curve.
xAxis = np.arange(len(redRatio['Node2vec']))
# Colours C0, C1, ... are assigned in the order of `policiesB`.
for i, policy in enumerate(policiesB):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("averageAcceptanceProbabilityByPolicyRandomSourceNodesBlogs.pdf")
plt.savefig("averageAcceptanceProbabilityByPolicyRandomSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Average acceptance probability (node2vec recommendation) by different policies - Best red source nodes.

In [57]:
# Load the per-epoch series of the node2vecBy*RedSources files for each
# policy in policiesB (plotted below as average acceptance probability).
# The dict keeps the name `redRatio` because the plotting cell reads it.
redRatio = {
    policy: np.loadtxt('node2vecBy%sRedSources.txt' % policy)
    for policy in policiesB
}

In [58]:
# One acceptance-probability curve per policy in policiesB (red source nodes).
# The series are read from the dict named `redRatio`.
fig = plt.figure(figsize=(12, 8))
fig.suptitle('Average Acceptance Probability By Policy - Red Source Nodes.', fontsize=16)
plt.xlabel('Epoch', fontsize=16)
plt.ylabel('Acceptance Probability', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The Node2vec series fixes the number of epochs used for every curve.
xAxis = np.arange(len(redRatio['Node2vec']))
# Colours C0, C1, ... are assigned in the order of `policiesB`.
for i, policy in enumerate(policiesB):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("averageAcceptanceProbabilityByPolicyRedSourceNodesBlogs.pdf")
plt.savefig("averageAcceptanceProbabilityByPolicyRedSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Average acceptance probability (node2vec recommendation) by different policies - Best blue source nodes.

In [59]:
# Load the per-epoch series of the node2vecBy*BlueSources files for each
# policy in policiesB (plotted below as average acceptance probability).
# The dict keeps the name `redRatio` because the plotting cell reads it.
redRatio = {
    policy: np.loadtxt('node2vecBy%sBlueSources.txt' % policy)
    for policy in policiesB
}

In [60]:
# One acceptance-probability curve per policy in policiesB (blue source nodes).
# The series are read from the dict named `redRatio`.
fig = plt.figure(figsize=(12, 8))
fig.suptitle('Average Acceptance Probability By Policy - Blue Source Nodes.', fontsize=16)
plt.xlabel('Epoch', fontsize=16)
plt.ylabel('Acceptance Probability', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The Node2vec series fixes the number of epochs used for every curve.
xAxis = np.arange(len(redRatio['Node2vec']))
# Colours C0, C1, ... are assigned in the order of `policiesB`.
for i, policy in enumerate(policiesB):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("averageAcceptanceProbabilityByPolicyBlueSourceNodesBlogs.pdf")
plt.savefig("averageAcceptanceProbabilityByPolicyBlueSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Final PageRank vs initial PageRank - Random source nodes.

In [87]:
# Initial PageRank vector plus one final PageRank vector per policy
# (random source nodes). The first row of each file is skipped and only
# column 1 is read.
initialPagerank = np.loadtxt('out_pagerank.txt', skiprows=1, usecols=1)
finalPagerank = {
    policy: np.loadtxt('finalPagerankBy%sRandomSources.txt' % policy, skiprows=1, usecols=1)
    for policy in policies
}

In [88]:
# Load the community table: column 0 is used as the node id and column 1
# as that node's community label (first row of the file is skipped).
tempCommunities = np.loadtxt('out_community.txt', skiprows=1, dtype=int)

# Re-index the labels by node id so that communities[node] is the label of
# `node`, whatever the row order in the file. Assumes ids lie in
# [0, number of rows) -- TODO confirm against the file.
communities = np.zeros(tempCommunities[:, 1].size, dtype=int)
communities[tempCommunities[:, 0]] = tempCommunities[:, 1]

In [89]:
# Red ratio of the network at every `step`-th node of the PageRank ranking.
def getRedRatio(pagerank, communities, step=10):
    """Cumulative red share of PageRank along the descending PageRank ranking.

    Nodes are visited in order of decreasing PageRank. After every `step`
    nodes, the PageRank accumulated so far by nodes with a truthy community
    label is divided by the PageRank accumulated by all nodes visited so far.

    Parameters
    ----------
    pagerank : array-like
        PageRank value of each node.
    communities : array-like
        Community label of each node, aligned with `pagerank`. Nodes whose
        label is truthy are counted as red.
    step : int, optional
        Number of nodes between two check points. Defaults to 10.

    Returns
    -------
    list of float
        One ratio per complete group of `step` nodes; a trailing partial
        group produces no entry. Empty if there are fewer than `step` nodes.

    Raises
    ------
    ValueError
        If `step` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    pagerank = np.asarray(pagerank)
    order = np.argsort(-pagerank)
    ranked = pagerank[order]
    isRed = np.asarray(communities)[order].astype(bool)

    # Running sums replace the per-node Python loop. Non-red nodes add 0,
    # which leaves the red running total unchanged.
    totalCumulative = np.cumsum(ranked)
    redCumulative = np.cumsum(np.where(isRed, ranked, 0.))

    # Position step - 1 is the step-th node, i.e. where (i + 1) % step == 0.
    checkpoints = slice(step - 1, None, step)
    return (redCumulative[checkpoints] / totalCumulative[checkpoints]).tolist()

In [90]:
# Calculate red ratio per 10 nodes for the initial PageRank and for the
# final PageRank of every policy (random source nodes).
initialRedRatio = getRedRatio(initialPagerank, communities)
# Rebind `redRatio` to a fresh dict. The plotting cell reads `redRatio`;
# previously an unused `redRatios` dict was created here and the results
# were written into whatever `redRatio` an earlier cell had left behind.
redRatio = dict()
for policy in policies:
    redRatio[policy] = getRedRatio(finalPagerank[policy], communities)

In [91]:
# Red ratio at each check point of the PageRank ranking: the initial network
# first (colour C0), then one curve per policy (random source nodes).
fig = plt.figure(figsize=(12, 8))
fig.suptitle("Red PageRank by Order - 10 Nodes Check Points - Random Source Nodes.", fontsize=16)
plt.xlabel("Nodes", fontsize=16)
plt.ylabel("Red Ratio", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# x values are 1-based check-point numbers, not node counts.
xAxis = np.arange(1, len(initialRedRatio) + 1)
plt.plot(xAxis, initialRedRatio, "C0", label='Initial', linewidth=3.5)
# Policies take colours C1, C2, ... in the order of `policies`.
for i, policy in enumerate(policies, start=1):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("fairnessByPolicyRandomSourceNodesBlogs.pdf")
plt.savefig("fairnessByPolicyRandomSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Final PageRank vs initial PageRank - Best red source nodes.

In [92]:
# Final PageRank vector per policy (red source nodes). The first row of
# each file is skipped and only column 1 is read.
finalPagerank = {
    policy: np.loadtxt('finalPagerankBy%sRedSources.txt' % policy, skiprows=1, usecols=1)
    for policy in policies
}

In [93]:
# Calculate red ratio per 10 nodes for the initial PageRank and for the
# final PageRank of every policy (red source nodes).
initialRedRatio = getRedRatio(initialPagerank, communities)
# Rebind `redRatio` to a fresh dict. The plotting cell reads `redRatio`;
# previously an unused `redRatios` dict was created here and the results
# were written into whatever `redRatio` an earlier cell had left behind.
redRatio = dict()
for policy in policies:
    redRatio[policy] = getRedRatio(finalPagerank[policy], communities)

In [94]:
# Red ratio at each check point of the PageRank ranking: the initial network
# first (colour C0), then one curve per policy (red source nodes).
fig = plt.figure(figsize=(12, 8))
fig.suptitle("Red PageRank by Order - 10 Nodes Check Points - Red Source Nodes.", fontsize=16)
plt.xlabel("Nodes", fontsize=16)
plt.ylabel("Red Ratio", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# x values are 1-based check-point numbers, not node counts.
xAxis = np.arange(1, len(initialRedRatio) + 1)
plt.plot(xAxis, initialRedRatio, "C0", label='Initial', linewidth=3.5)
# Policies take colours C1, C2, ... in the order of `policies`.
for i, policy in enumerate(policies, start=1):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("fairnessByPolicyRedSourceNodesBlogs.pdf")
plt.savefig("fairnessByPolicyRedSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Final PageRank vs initial PageRank - Best blue source nodes.

In [95]:
# Final PageRank vector per policy (blue source nodes). The first row of
# each file is skipped and only column 1 is read.
finalPagerank = {
    policy: np.loadtxt('finalPagerankBy%sBlueSources.txt' % policy, skiprows=1, usecols=1)
    for policy in policies
}

In [96]:
# Calculate red ratio per 10 nodes for the initial PageRank and for the
# final PageRank of every policy (blue source nodes).
initialRedRatio = getRedRatio(initialPagerank, communities)
# Rebind `redRatio` to a fresh dict. The plotting cell reads `redRatio`;
# previously an unused `redRatios` dict was created here and the results
# were written into whatever `redRatio` an earlier cell had left behind.
redRatio = dict()
for policy in policies:
    redRatio[policy] = getRedRatio(finalPagerank[policy], communities)

In [97]:
# Red ratio at each check point of the PageRank ranking: the initial network
# first (colour C0), then one curve per policy (blue source nodes).
fig = plt.figure(figsize=(12, 8))
fig.suptitle("Red PageRank by Order - 10 Nodes Check Points - Blue Source Nodes.", fontsize=16)
plt.xlabel("Nodes", fontsize=16)
plt.ylabel("Red Ratio", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# x values are 1-based check-point numbers, not node counts.
xAxis = np.arange(1, len(initialRedRatio) + 1)
plt.plot(xAxis, initialRedRatio, "C0", label='Initial', linewidth=3.5)
# Policies take colours C1, C2, ... in the order of `policies`.
for i, policy in enumerate(policies, start=1):
    plt.plot(xAxis, redRatio[policy], "C%d" % i, label=policy, linewidth=3.5)
plt.legend(fontsize=14, loc=1)
plt.savefig("fairnessByPolicyBlueSourceNodesBlogs.pdf")
plt.savefig("fairnessByPolicyBlueSourceNodesBlogs.png")

<IPython.core.display.Javascript object>

### Selected Edges Analysis.

#### Load scores and distances 

In [5]:
# Scores of every candidate edge (tab-separated file), one row per
# (sourceNode, targetNode) pair; preview the first rows.
candEdgesScores = pd.read_csv("edgesScoresRandom.txt", sep= '\t')
candEdgesScores.head()

Unnamed: 0,sourceNode,targetNode,node2vecRecommendationScore,resourceAllocationScore,jaccardCoefficientScore,preferencialAttachmentScore,adamicAddarScore,gain,expectedGain
0,1204,0,0.986308,0.004484,0.025,390.0,0.18494,0.003571,0.003522
1,1204,1,0.305006,0.0,0.0,675.0,0.0,0.003609,0.001101
2,1204,2,0.322489,0.0,0.0,60.0,0.0,0.00135,0.000436
3,1204,3,0.356627,0.0,0.0,15.0,0.0,0.00221,0.000788
4,1204,4,0.325398,0.0,0.0,15.0,0.0,0.004001,0.001302


In [6]:
# Distance between source and target for every candidate edge
# (tab-separated file); preview the first rows.
candEdgesDist = pd.read_csv("edgesDistancesRandomSources.txt", sep= "\t")
candEdgesDist.head()

Unnamed: 0,source,target,distance
0,1204,0,2
1,1204,1,3
2,1204,2,3
3,1204,3,3
4,1204,4,3


In [7]:
# Edges selected by each policy in policiesB (random source nodes):
# selEdgesScores holds their scores, selEdgesDist their distances.
selEdgesScores = {
    policy: pd.read_csv("edgesSelectedBy%sRandomSources.txt" % policy, sep='\t')
    for policy in policiesB
}
selEdgesDist = {
    policy: pd.read_csv("selectedEdgesBy%sRandomSourcesDistances.txt" % policy, sep='\t')
    for policy in policiesB
}

In [8]:
# Preview the scores of the edges selected by the Node2vec policy.
selEdgesScores['Node2vec'].head()

Unnamed: 0,Source,Target,node2vecScore,resAllocScore,jaccCoefScore,prefAttScore,adamicAdarScore,gain,expGain
0,1204,0,0.986308,0.004484,0.025,390,0.18494,0.003571,0.003522
1,1190,0,0.990956,0.0,0.0,286,0.0,0.00054,0.000535
2,701,0,0.802687,0.0,0.0,52,0.0,0.000283,0.000227
3,69,0,0.20696,0.0,0.0,26,0.0,0.000324,6.7e-05
4,1059,0,0.813536,0.0,0.0,104,0.0,0.000271,0.000221


In [9]:
# Preview the distances of the edges selected by the Node2vec policy.
selEdgesDist['Node2vec'].head()

Unnamed: 0,source,target,distance
0,1204,0,2
1,1204,357,2
2,1204,496,2
3,1204,565,2
4,1204,575,2


#### Candidate edges analysis.

In [10]:
# Summary statistics of every score column over all candidate edges.
candEdgesScores.describe()

Unnamed: 0,sourceNode,targetNode,node2vecRecommendationScore,resourceAllocationScore,jaccardCoefficientScore,preferencialAttachmentScore,adamicAddarScore,gain,expectedGain
count,147495.0,147495.0,147495.0,147495.0,147495.0,147495.0,147495.0,147495.0,147495.0
mean,613.177321,609.230652,0.344138,0.016098,0.023169,596.382406,0.299189,2.9e-05,1e-05
std,370.138296,352.77421,0.196897,0.06426,0.052818,1540.136639,0.859061,0.00055,0.000227
min,2.0,0.0,0.112102,0.0,0.0,1.0,0.0,-0.006126,-0.004277
25%,280.0,303.0,0.204226,0.0,0.0,23.0,0.0,-4.3e-05,-1.3e-05
50%,613.0,609.0,0.28316,0.0,0.0,98.0,0.0,2e-06,0.0
75%,922.0,915.0,0.435173,0.008,0.025641,465.0,0.212744,8.5e-05,2.3e-05
max,1215.0,1221.0,0.999986,4.991271,1.0,47736.0,35.425737,0.005702,0.003944


In [11]:
# Summary statistics of the distance table over all candidate edges.
candEdgesDist.describe()

Unnamed: 0,source,target,distance
count,147495.0,147495.0,147495.0
mean,613.177321,609.230652,2.81735
std,370.138296,352.77421,0.761287
min,2.0,0.0,1.0
25%,280.0,303.0,2.0
50%,613.0,609.0,3.0
75%,922.0,915.0,3.0
max,1215.0,1221.0,7.0


In [12]:
# Box plot of the distance column over all candidate edges.
candEdgesDist.boxplot(column=['distance'])

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x18f5ce74bb0>

#### Node2vec edge analysis.

In [13]:
# Summary statistics of the scores of the edges selected by Node2vec.
selEdgesScores['Node2vec'].describe()

Unnamed: 0,Source,Target,node2vecScore,resAllocScore,jaccCoefScore,prefAttScore,adamicAdarScore,gain,expGain
count,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0
mean,612.065574,553.645902,0.733923,0.092886,0.024972,3913.698361,1.154076,1.5e-05,1.7e-05
std,369.986115,231.405041,0.229587,0.294619,0.042296,5989.174747,2.477526,0.000445,0.000389
min,2.0,0.0,0.154882,0.0,0.0,26.0,0.0,-0.003217,-0.002705
25%,280.0,496.0,0.639256,0.0,0.0,298.0,0.0,-4e-05,-2.5e-05
50%,616.0,581.0,0.76964,0.013514,0.008621,1317.0,0.270184,-1e-06,-1e-06
75%,922.0,671.0,0.927451,0.067095,0.028765,4998.5,1.185773,6e-05,4.2e-05
max,1215.0,1005.0,0.999986,4.99127,0.319783,47736.0,35.4257,0.004334,0.003944


In [14]:
# Summary statistics of the distances of the edges selected by Node2vec.
selEdgesDist['Node2vec'].describe()

Unnamed: 0,source,target,distance
count,1220.0,1220.0,1220.0
mean,612.065574,553.645902,2.302459
std,369.986115,231.405041,0.616556
min,2.0,0.0,1.0
25%,280.0,496.0,2.0
50%,616.0,581.0,2.0
75%,922.0,671.0,3.0
max,1215.0,1005.0,4.0


In [15]:
# Containers for the edges that point to the most frequently selected
# targets; both are keyed by policy name and filled in by later cells.
topEdgesDist = {}
topEdgesScores = {}

In [16]:
# Number of times each node appears as the target of an edge selected by
# Node2vec. `temp` is reused by the filtering cells that follow.
temp = selEdgesDist['Node2vec']['target'].value_counts()
temp

0       122
781     119
581     118
618     117
575     116
767     114
357     113
565     109
671     106
496     106
889      34
1005     20
583      15
126       9
837       2
Name: target, dtype: int64

In [31]:
# Plot the target occurrence counts held in `temp`, in the order in which
# value_counts() returned them.
temp1 = temp.to_numpy()
fig = plt.figure()
plt.xlabel("Nodes Ordered by #Occurencies")
plt.ylabel("#Occurencies")
# Dashed reference line at 100 occurrences, the cut-off applied when the
# top targets are filtered with `temp > 100`.
plt.axhline(y=100, color="r", linestyle='--', alpha=0.3)
plt.plot(np.arange(temp1.size), temp1, linewidth=2.0)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x18f5d57c880>]

In [185]:
# Keep the Node2vec-selected edges whose target occurs more than 100 times
# in `temp` (the Node2vec target counts).
frequentTargets = temp[temp > 100].index
node2vecScores = selEdgesScores['Node2vec']
topEdgesScores['Node2vec'] = node2vecScores[node2vecScores['Target'].isin(frequentTargets)]
topEdgesScores['Node2vec'].head()

Unnamed: 0,Source,Target,node2vecScore,resAllocScore,jaccCoefScore,prefAttScore,adamicAdarScore,gain,expGain
0,1204,0,0.986308,0.004484,0.025,390,0.18494,0.003571,0.003522
1,1190,0,0.990956,0.0,0.0,286,0.0,0.00054,0.000535
2,701,0,0.802687,0.0,0.0,52,0.0,0.000283,0.000227
3,69,0,0.20696,0.0,0.0,26,0.0,0.000324,6.7e-05
4,1059,0,0.813536,0.0,0.0,104,0.0,0.000271,0.000221


In [186]:
# Same filter on the distance table: keep the Node2vec-selected edges whose
# target occurs more than 100 times in `temp`.
frequentTargets = temp[temp > 100].index
node2vecDist = selEdgesDist['Node2vec']
topEdgesDist['Node2vec'] = node2vecDist[node2vecDist['target'].isin(frequentTargets)]
topEdgesDist['Node2vec'].head()

Unnamed: 0,source,target,distance
0,1204,0,2
1,1204,357,2
2,1204,496,2
3,1204,565,2
4,1204,575,2


In [188]:
# Summary statistics of the scores of the edges to the frequent Node2vec targets.
topEdgesScores['Node2vec'].describe()

Unnamed: 0,Source,Target,node2vecScore,resAllocScore,jaccCoefScore,prefAttScore,adamicAdarScore,gain,expGain
count,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0
mean,628.237719,538.218421,0.737986,0.087853,0.02481,3617.453509,1.100576,1.9e-05,2e-05
std,369.336865,220.31679,0.227664,0.263727,0.041876,5620.230699,2.271609,0.00046,0.000402
min,2.0,0.0,0.155535,0.0,0.0,26.0,0.0,-0.003217,-0.002705
25%,318.0,496.0,0.642768,0.0,0.0,286.0,0.0,-3.7e-05,-2.3e-05
50%,639.0,581.0,0.775974,0.012658,0.008065,1215.0,0.249543,2e-06,1e-06
75%,926.75,671.0,0.929767,0.065879,0.028926,4515.0,1.143788,7.4e-05,5e-05
max,1215.0,781.0,0.999986,4.99127,0.296774,40936.0,28.7183,0.004334,0.003944


In [189]:
# Summary statistics of the distances of the edges to the frequent Node2vec targets.
topEdgesDist['Node2vec'].describe()

Unnamed: 0,source,target,distance
count,1140.0,1140.0,1140.0
mean,628.237719,538.218421,2.313158
std,369.336865,220.31679,0.62097
min,2.0,0.0,1.0
25%,318.0,496.0,2.0
50%,639.0,581.0,2.0
75%,926.75,671.0,3.0
max,1215.0,781.0,4.0


In [191]:
# How many of the edges to the frequent Node2vec targets fall at each distance.
topEdgesDist['Node2vec']['distance'].value_counts()

2    679
3    364
1     67
4     30
Name: distance, dtype: int64

In [192]:
# Summary statistics of the distance column alone for those edges.
topEdgesDist['Node2vec']['distance'].describe()

count    1140.000000
mean        2.313158
std         0.620970
min         1.000000
25%         2.000000
50%         2.000000
75%         3.000000
max         4.000000
Name: distance, dtype: float64

In [201]:
# Box plot of the distances of the edges to the frequent Node2vec targets.
topEdgesDist['Node2vec'].boxplot(column=['distance'])

<IPython.core.display.Javascript object>

In [209]:
# Distances of the edges to the frequent Node2vec targets, as a NumPy array
# (input of the comparison box plot below).
topN2V = topEdgesDist['Node2vec']['distance'].to_numpy()

In [210]:
# Distances of all candidate edges, as a NumPy array (baseline of the
# comparison box plot below).
cand = candEdgesDist['distance'].to_numpy()

In [246]:
# Compare the distance distribution of the edges to the frequent Node2vec
# targets (topN2V) with that of all candidate edges (cand).
fig = plt.figure()
fig.suptitle("Node2vec Selection Criterion", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = {'linestyle': '-', 'linewidth': 2}
medianprops = {'linestyle': '-', 'linewidth': 2.5}
meanpointprops = {'marker': 'D', 'markeredgecolor': 'black'}
flierprops = {'marker': 'o', 'markerfacecolor': 'green', 'markersize': 6,
              'linestyle': 'none'}

plt.boxplot(
    x=[topN2V, cand],
    labels=['node2vec', 'all candidates'],
    widths=[0.5, 0.5],
    showmeans=True,
    meanline=True,
    boxprops=boxprops,
    medianprops=medianprops,
    meanprops=meanpointprops,
    flierprops=flierprops,
)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel("Distance", fontsize=16)
plt.savefig("node2vecDistanceMatterBlogs.pdf")
plt.savefig("node2vecDistanceMatterBlogs.png")

<IPython.core.display.Javascript object>

In [252]:
# Inputs of the two Node2vec gain box plots below.
# temp1 / temp3: gain and expected gain of all candidate edges.
# temp2 / temp4: gain and expected gain of the edges to the frequent
# Node2vec targets.
temp1 = candEdgesScores['gain'].to_numpy()
temp2 = topEdgesScores['Node2vec']['gain'].to_numpy()
temp3 = candEdgesScores['expectedGain'].to_numpy()
temp4 = topEdgesScores['Node2vec']['expGain'].to_numpy()

In [261]:
# Compare the gain of all candidate edges (temp1) with the gain of the edges
# to the frequent Node2vec targets (temp2).
fig = plt.figure()
fig.suptitle("Node2vec Affects fairness", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = dict(linestyle='-', linewidth=2)
meanpointprops = dict(marker='D', markeredgecolor='black')
medianprops = dict(linestyle='-', linewidth=2.5)
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none')

plt.boxplot(x=[temp1, temp2], flierprops=flierprops, widths=[0.5, 0.5],
            labels=['gain all candidates', 'gain node2vec'], boxprops=boxprops,
            meanprops=meanpointprops, meanline=True,
            showmeans=True, medianprops=medianprops)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The plotted values are gains; the axis was previously labelled "Distance".
plt.ylabel("Gain", fontsize=16)
plt.savefig("node2vecReducesGainRangeBlogs.pdf")
plt.savefig("node2vecReducesGainRangeBlogs.png")

<IPython.core.display.Javascript object>

In [263]:
# Compare the expected gain of all candidate edges (temp3) with the expected
# gain of the edges to the frequent Node2vec targets (temp4).
fig = plt.figure()
fig.suptitle("Node2vec Affects Expected fairness", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = dict(linestyle='-', linewidth=2)
meanpointprops = dict(marker='D', markeredgecolor='black')
medianprops = dict(linestyle='-', linewidth=2.5)
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none')

plt.boxplot(x=[temp3, temp4], flierprops=flierprops, widths=[0.5, 0.5],
            labels=['expGain all candidates', 'expGain node2vec'], boxprops=boxprops,
            meanprops=meanpointprops, meanline=True,
            showmeans=True, medianprops=medianprops)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The plotted values are expected gains; the axis was previously labelled
# "Distance".
plt.ylabel("Expected Gain", fontsize=16)
plt.savefig("node2vecPreserveSomeExpGainBlogs.pdf")
plt.savefig("node2vecPreserveSomeExpGainBlogs.png")

<IPython.core.display.Javascript object>

#### Gain edges analysis.

In [264]:
# Preview the scores of the edges selected by the Gain policy.
selEdgesScores['Gain'].head()

Unnamed: 0,Source,Target,node2vecScore,resAllocScore,jaccCoefScore,prefAttScore,adamicAdarScore,gain,expGain
0,1204,54,0.406981,0.0,0.0,30,0.0,0.004701,0.001913
1,1190,54,0.423401,0.0,0.0,22,0.0,0.000657,0.000278
2,701,54,0.238389,0.0,0.0,4,0.0,0.000335,8e-05
3,69,54,0.137086,0.0,0.0,2,0.0,0.000427,5.9e-05
4,1059,54,0.241652,0.0,0.0,8,0.0,0.000329,7.9e-05


In [265]:
# Preview the distances of the edges selected by the Gain policy.
selEdgesDist['Gain'].head()

Unnamed: 0,source,target,distance
0,1204,54,4
1,1204,55,4
2,1204,88,4
3,1204,97,4
4,1204,110,4


In [266]:
# Summary statistics of the distances of the edges selected by Gain.
selEdgesDist['Gain'].describe()

Unnamed: 0,source,target,distance
count,1220.0,1220.0,1220.0
mean,612.065574,230.640164,3.718852
std,369.986115,187.783119,0.951044
min,2.0,4.0,2.0
25%,280.0,88.0,3.0
50%,616.0,110.0,4.0
75%,922.0,332.0,4.0
max,1215.0,615.0,7.0


In [267]:
# Number of times each node appears as the target of an edge selected by Gain.
selEdgesDist['Gain']['target'].value_counts()

55     122
54     122
88     121
332    121
305    121
97     121
514    119
110    119
179    116
615    110
432     12
46       6
115      2
33       2
50       1
44       1
40       1
35       1
22       1
4        1
Name: target, dtype: int64

In [272]:
# Keep the Gain-selected edges whose target was chosen more than 100 times,
# in both the score table and the distance table.
temp = selEdgesDist['Gain']['target'].value_counts()
frequentTargets = temp[temp > 100].index
gainScores = selEdgesScores['Gain']
gainDist = selEdgesDist['Gain']
topEdgesScores['Gain'] = gainScores[gainScores['Target'].isin(frequentTargets)]
topEdgesDist['Gain'] = gainDist[gainDist['target'].isin(frequentTargets)]

In [273]:
# Preview the scores of the edges to the frequent Gain targets.
topEdgesScores['Gain'].head()

Unnamed: 0,Source,Target,node2vecScore,resAllocScore,jaccCoefScore,prefAttScore,adamicAdarScore,gain,expGain
0,1204,54,0.406981,0.0,0.0,30,0.0,0.004701,0.001913
1,1190,54,0.423401,0.0,0.0,22,0.0,0.000657,0.000278
2,701,54,0.238389,0.0,0.0,4,0.0,0.000335,8e-05
3,69,54,0.137086,0.0,0.0,2,0.0,0.000427,5.9e-05
4,1059,54,0.241652,0.0,0.0,8,0.0,0.000329,7.9e-05


In [274]:
# Preview the distances of the edges to the frequent Gain targets.
topEdgesDist['Gain'].head()

Unnamed: 0,source,target,distance
0,1204,54,4
1,1204,55,4
2,1204,88,4
3,1204,97,4
4,1204,110,4


In [275]:
# Summary statistics of the scores of the edges to the frequent Gain targets.
topEdgesScores['Gain'].describe()

Unnamed: 0,Source,Target,node2vecScore,resAllocScore,jaccCoefScore,prefAttScore,adamicAdarScore,gain,expGain
count,1192.0,1192.0,1192.0,1192.0,1192.0,1192.0,1192.0,1192.0,1192.0
mean,619.901846,231.065436,0.312296,0.001833,0.005806,65.721477,0.031217,0.00045,0.000146
std,368.083996,187.680662,0.140115,0.013096,0.03988,161.41512,0.153718,0.000869,0.000308
min,2.0,54.0,0.124181,0.0,0.0,1.0,0.0,2e-06,1e-06
25%,287.0,88.0,0.225124,0.0,0.0,4.0,0.0,4.8e-05,1.6e-05
50%,619.0,110.0,0.270624,0.0,0.0,18.0,0.0,0.000202,5e-05
75%,923.0,332.0,0.368779,0.0,0.0,62.0,0.0,0.000422,9.9e-05
max,1215.0,615.0,0.955565,0.333333,1.0,1845.0,1.72366,0.005702,0.00269


In [276]:
# Summary statistics of the distances of the edges to the frequent Gain targets.
topEdgesDist['Gain'].describe()

Unnamed: 0,source,target,distance
count,1192.0,1192.0,1192.0
mean,619.901846,231.065436,3.745805
std,368.083996,187.680662,0.940719
min,2.0,54.0,2.0
25%,287.0,88.0,3.0
50%,619.0,110.0,4.0
75%,923.0,332.0,4.0
max,1215.0,615.0,7.0


In [282]:
# Inputs of the three Gain box plots below. Odd-numbered arrays cover all
# candidate edges, even-numbered ones the edges to the frequent Gain targets.
# temp1 / temp2: distance.
# temp3 / temp4: gain.
# temp5 / temp6: expected gain.
temp1 = candEdgesDist['distance'].to_numpy()
temp2 = topEdgesDist['Gain']['distance'].to_numpy()
temp3 = candEdgesScores['gain'].to_numpy()
temp4 = topEdgesScores['Gain']['gain'].to_numpy()
temp5 = candEdgesScores['expectedGain'].to_numpy()
temp6 = topEdgesScores['Gain']['expGain'].to_numpy()

In [283]:
# Compare the distance distribution of all candidate edges (temp1) with
# that of the edges to the frequent Gain targets (temp2).
fig = plt.figure()
fig.suptitle("Gain Selection Criterion", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = {'linestyle': '-', 'linewidth': 2}
medianprops = {'linestyle': '-', 'linewidth': 2.5}
meanpointprops = {'marker': 'D', 'markeredgecolor': 'black'}
flierprops = {'marker': 'o', 'markerfacecolor': 'green', 'markersize': 6,
              'linestyle': 'none'}

plt.boxplot(
    x=[temp1, temp2],
    labels=['all candidates', 'maxGain'],
    widths=[0.5, 0.5],
    showmeans=True,
    meanline=True,
    boxprops=boxprops,
    medianprops=medianprops,
    meanprops=meanpointprops,
    flierprops=flierprops,
)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel("Distance", fontsize=16)
plt.savefig("maxGainDistanceDoesntMatterBlogs.pdf")
plt.savefig("maxGainDistanceDoesntMatterBlogs.png")

<IPython.core.display.Javascript object>

In [307]:
# Compare the gain of all candidate edges (temp3) with the gain of the edges
# to the frequent Gain targets (temp4).
fig = plt.figure()
fig.suptitle("Gain Affects fairness", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = dict(linestyle='-', linewidth=2)
meanpointprops = dict(marker='D', markeredgecolor='black')
medianprops = dict(linestyle='-', linewidth=2.5)
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none')

plt.boxplot(x=[temp3, temp4], flierprops=flierprops, widths=[0.5, 0.5],
            labels=['all candidates', 'maxGain'], boxprops=boxprops,
            meanprops=meanpointprops, meanline=True,
            showmeans=True, medianprops=medianprops)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The plotted values are gains; the axis was previously labelled "Distance".
plt.ylabel("Gain", fontsize=16)
plt.savefig("maxGainMatterGainBlogs.pdf")
plt.savefig("maxGainMatterGainBlogs.png")

<IPython.core.display.Javascript object>

In [308]:
# Compare the expected gain of all candidate edges (temp5) with the expected
# gain of the edges to the frequent Gain targets (temp6).
fig = plt.figure()
fig.suptitle("Gain Affects Expected Fairness", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = dict(linestyle='-', linewidth=2)
meanpointprops = dict(marker='D', markeredgecolor='black')
medianprops = dict(linestyle='-', linewidth=2.5)
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none')

plt.boxplot(x=[temp5, temp6], flierprops=flierprops, widths=[0.5, 0.5],
            labels=['all candidates', 'maxGain'], boxprops=boxprops,
            meanprops=meanpointprops, meanline=True,
            showmeans=True, medianprops=medianprops)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The plotted values are expected gains; the axis was previously labelled
# "Distance".
plt.ylabel("Expected Gain", fontsize=16)
plt.savefig("maxGainAffectsExpGainBlogs.pdf")
plt.savefig("maxGainAffectsExpGainBlogs.png")

<IPython.core.display.Javascript object>

In [309]:
# Ad-hoc inspection of temp1. In top-to-bottom order it holds the distances
# of all candidate edges, but its content depends on which cell ran last.
temp1

array([2, 3, 3, ..., 4, 3, 4], dtype=int64)

#### Expected gain edge analysis.

In [310]:
# Summary statistics of the distances of the edges selected by ExpGain.
selEdgesDist['ExpGain'].describe()

Unnamed: 0,source,target,distance
count,1220.0,1220.0,1220.0
mean,612.065574,296.07377,2.654918
std,369.986115,200.543248,0.753292
min,2.0,0.0,1.0
25%,280.0,126.0,2.0
50%,616.0,282.0,3.0
75%,922.0,441.0,3.0
max,1215.0,906.0,5.0


In [312]:
# Number of times each node appears as the target of an edge selected by ExpGain.
selEdgesDist['ExpGain']['target'].value_counts()

0      122
282    110
570     97
196     94
444     93
357     91
126     89
398     79
339     75
514     58
54      37
252     34
906     31
55      31
88      25
441     24
179     21
553     18
305     10
97      10
118     10
46       8
435      7
332      7
278      6
110      5
385      5
383      4
22       3
35       2
61       2
50       2
33       2
40       2
4        1
115      1
44       1
103      1
521      1
47       1
Name: target, dtype: int64

In [315]:
# Inputs of the three ExpGain box plots below. Odd-numbered arrays cover all
# candidate edges, even-numbered ones the edges selected by ExpGain.
# Every ExpGain-selected edge is used here (selEdges*, not topEdges*), so
# no "more than 100 occurrences" filter is applied for this policy.
# temp1 / temp2: distance.
# temp3 / temp4: gain.
# temp5 / temp6: expected gain.
temp1 = candEdgesDist['distance'].to_numpy()
temp2 = selEdgesDist['ExpGain']['distance'].to_numpy()
temp3 = candEdgesScores['gain'].to_numpy()
temp4 = selEdgesScores['ExpGain']['gain'].to_numpy()
temp5 = candEdgesScores['expectedGain'].to_numpy()
temp6 = selEdgesScores['ExpGain']['expGain'].to_numpy()

In [316]:
# Compare the distance distribution of all candidate edges (temp1) with
# that of the edges selected by ExpGain (temp2).
fig = plt.figure()
fig.suptitle("ExpGain Selection Criterion", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = {'linestyle': '-', 'linewidth': 2}
medianprops = {'linestyle': '-', 'linewidth': 2.5}
meanpointprops = {'marker': 'D', 'markeredgecolor': 'black'}
flierprops = {'marker': 'o', 'markerfacecolor': 'green', 'markersize': 6,
              'linestyle': 'none'}

plt.boxplot(
    x=[temp1, temp2],
    labels=['all candidates', 'maxExpGain'],
    widths=[0.5, 0.5],
    showmeans=True,
    meanline=True,
    boxprops=boxprops,
    medianprops=medianprops,
    meanprops=meanpointprops,
    flierprops=flierprops,
)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel("Distance", fontsize=16)
plt.savefig("maxExpGainDistanceMatterBlogs.pdf")
plt.savefig("maxExpGainDistanceMatterBlogs.png")

<IPython.core.display.Javascript object>

In [317]:
# Compare the gain of all candidate edges (temp3) with the gain of the edges
# selected by ExpGain (temp4).
fig = plt.figure()
fig.suptitle("ExpGain Affects fairness", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = dict(linestyle='-', linewidth=2)
meanpointprops = dict(marker='D', markeredgecolor='black')
medianprops = dict(linestyle='-', linewidth=2.5)
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none')

plt.boxplot(x=[temp3, temp4], flierprops=flierprops, widths=[0.5, 0.5],
            labels=['all candidates', 'maxExpGain'], boxprops=boxprops,
            meanprops=meanpointprops, meanline=True,
            showmeans=True, medianprops=medianprops)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The plotted values are gains; the axis was previously labelled "Distance".
plt.ylabel("Gain", fontsize=16)
plt.savefig("maxExpGainMatterGainBlogs.pdf")
plt.savefig("maxExpGainMatterGainBlogs.png")

<IPython.core.display.Javascript object>

In [318]:
# Compare the expected gain of all candidate edges (temp5) with the expected
# gain of the edges selected by ExpGain (temp6).
fig = plt.figure()
fig.suptitle("ExpGain Affects Expected Fairness", fontsize=16)

# Shared styling of the box-plot elements.
boxprops = dict(linestyle='-', linewidth=2)
meanpointprops = dict(marker='D', markeredgecolor='black')
medianprops = dict(linestyle='-', linewidth=2.5)
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none')

plt.boxplot(x=[temp5, temp6], flierprops=flierprops, widths=[0.5, 0.5],
            labels=['all candidates', 'maxExpGain'], boxprops=boxprops,
            meanprops=meanpointprops, meanline=True,
            showmeans=True, medianprops=medianprops)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# The plotted values are expected gains; the axis was previously labelled
# "Distance".
plt.ylabel("Expected Gain", fontsize=16)
plt.savefig("maxExpGainAffectsExpGainBlogs.pdf")
plt.savefig("maxExpGainAffectsExpGainBlogs.png")

<IPython.core.display.Javascript object>

In [34]:
# Load node characteristics (tab-separated file, one row per nodeId) and
# preview the first rows.
nodes = pd.read_csv('nodeQualityFeatures.txt', sep= '\t')
nodes.head()

Unnamed: 0,nodeId,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio
0,0,0.000243,0.564971,1,0,26,0.5,0.923077
1,1,0.000251,0.567501,1,1,44,1.0,1.0
2,2,0.000243,0.420519,1,0,4,0.5,0.75
3,3,0.000243,0.476046,1,0,1,0.5,1.0
4,4,0.000243,0.593033,1,0,1,0.5,1.0


In [35]:
# Node features of the targets that Node2vec selected more than 100 times.
temp = selEdgesDist['Node2vec']['target'].value_counts()
# A copy-pasted line used to overwrite topEdgesScores['Gain'] here with the
# Gain edges filtered by these Node2vec counts. It corrupted the Gain
# result and was not needed for node2vecTargets, so it is removed.
node2vecTargets = nodes[nodes['nodeId'].isin(temp[temp > 100].index)]

In [36]:
# Preview the feature rows of the frequent Node2vec targets.
node2vecTargets.head()

Unnamed: 0,nodeId,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio
0,0,0.000243,0.564971,1,0,26,0.5,0.923077
357,357,0.002603,0.521215,1,79,70,0.974684,0.957143
496,496,0.006406,0.389185,1,203,71,0.975369,0.478873
565,565,0.010006,0.321328,1,168,50,0.970238,0.16
575,575,0.00605,0.327906,1,88,37,0.965909,0.189189


In [42]:
# Summary statistics of the features of all nodes, reported at every decile.
nodes.describe(percentiles =[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])


Unnamed: 0,nodeId,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio
count,1222.0,1222.0,1222.0,1222.0,1222.0,1222.0,1222.0,1222.0
mean,610.5,0.000818,0.332798,0.48527,13.680033,13.680033,0.518632,0.432953
std,352.905323,0.002225,0.164824,0.499988,22.308387,23.916828,0.425197,0.419469
min,0.0,0.000243,0.0,0.0,0.0,0.0,0.0,0.0
10%,122.1,0.000243,0.149289,0.0,0.0,0.0,0.0,0.0
20%,244.2,0.00025,0.158458,0.0,1.0,1.0,0.0,0.0
30%,366.3,0.000265,0.172811,0.0,2.0,1.0,0.094277,0.0
40%,488.4,0.000289,0.204379,0.0,3.0,2.0,0.333333,0.02451
50%,610.5,0.000331,0.282878,0.0,5.0,4.0,0.5,0.5
60%,732.6,0.000404,0.432878,1.0,8.0,7.0,0.746774,0.5


In [41]:
# Same decile summary restricted to the frequent Node2vec targets, for
# comparison with the all-nodes table.
node2vecTargets.describe(percentiles =[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])

Unnamed: 0,nodeId,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,541.1,0.004645,0.323023,0.7,88.7,76.9,0.709222,0.277833
std,226.911363,0.002701,0.14048,0.483046,59.159014,74.640546,0.338252,0.378486
min,0.0,0.000243,0.161032,0.0,0.0,8.0,0.069767,0.0
10%,321.3,0.001864,0.163151,0.0,38.7,23.3,0.297602,0.0
20%,468.2,0.002491,0.169762,0.0,48.6,25.8,0.448583,0.003101
30%,544.3,0.003569,0.263759,0.7,61.2,33.7,0.494,0.02021
40%,571.0,0.004383,0.305237,1.0,73.8,44.8,0.7,0.034261
50%,578.0,0.004722,0.313908,1.0,83.5,60.0,0.899621,0.09948
60%,595.8,0.005143,0.323959,1.0,90.4,70.4,0.967641,0.171676
