In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import os.path
from statistics import median
import sys
from os import path
sns.set_theme(style="whitegrid")

In [2]:
def get_target_nodes_features(dataset, policy):
    # To get distance ##################
    f = open(f"{dataset}/out_graph.txt")
    lines = f.readlines()
    f.close()
    G = nx.parse_edgelist(lines[1:], nodetype=int, create_using=nx.DiGraph)
    trgs = dict()
    min_d = dict()
    sum_d = dict()
    max_d = dict()
    no_exist = dict()
    median_d = dict()
    ####################################
    
    round_stats = pd.read_csv(f"{dataset}/sc_{policy}.csv").drop(axis=0, labels=[0])
    target_nodes = dict()
    for index, row in round_stats.iterrows():
        edges = eval(row["Edges"])
        # ---------------------------------------------------
        for edge in edges:
            target = edge[1]
            if target not in target_nodes:
                target_nodes[target] = [0 for i in range(10)]
            target_nodes[target][row["Rounds"]] += 1
        # ---------------------------------------------------
        # To get distance #####################
        for e in edges:
            trgs.setdefault(e[1], 0)
            min_d.setdefault(e[1], sys.maxsize)
            sum_d.setdefault(e[1], 0)
            max_d.setdefault(e[1], 0)
            no_exist.setdefault(e[1], 0)
            median_d.setdefault(e[1], list())
            trgs[e[1]] += 1
            try:
                length = len(nx.shortest_path(G, source=e[1], target=e[0])) - 1
                sum_d[e[1]] += length
                if min_d[e[1]] > length: min_d[e[1]] = length
                if max_d[e[1]] < length: max_d[e[1]] = length
                median_d[e[1]].append(length)
            except nx.NetworkXNoPath:
                no_exist[e[1]] += 1
        #######################################
    totals = dict()
    for node in target_nodes:
        totals[node] = sum(target_nodes[node])
    nodes = list(totals.keys())
    values = list(totals.values())
    dd = {
        "Nodes": nodes,
        "times": values
    }
    df = pd.DataFrame(dd)
    nodes_info = pd.read_csv(f"{dataset}/nodeQualityFeatures.txt", sep="\t")
    nodes_info['out_homophily'] = [row['redNeighborsOutRatio'] if row['group'] == 1 else 1 - row['redNeighborsOutRatio']
                               for i, row in nodes_info.iterrows()]
    df = df.join(nodes_info.set_index("nodeId"), on="Nodes")
    
    # To get distance ########################
    data = list()
    for k,v in trgs.items():
        temp_median = median(median_d[k]) if median_d[k] else float("NaN")
        data.append([k, min_d[k], max_d[k], round(sum_d[k]/trgs[k],3), temp_median, no_exist[k]])
    df_d = pd.DataFrame(data, columns=['Nodes', 'MinDistance', "MaxDistance", "AverageDistance", "MedianDistance", "NoExistsDistance"])
    df = df.join(df_d.set_index("Nodes"), on="Nodes")
    ##########################################
    
    df = df.sort_values(axis=0, by="times", ascending=False)
    
    return df

# Node2vec

In [5]:
books = get_target_nodes_features("books", "node2vec")
blogs = get_target_nodes_features("blogs", "node2vec")
dblp = get_target_nodes_features("dblp_course", "node2vec")
dblp_n2v = get_target_nodes_features("dblp_course", "node2vec")
twitter = get_target_nodes_features("twitter", "node2vec")

In [4]:
blogs_n2v.head()

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
0,0,120,0.000244,0.565289,1,0,26,0.5,0.923077,0.923077,2,4,2.217,2.0,19
6,252,94,0.001305,0.505282,1,67,103,0.985075,0.834951,0.834951,2,5,1.734,3.0,31
4,542,86,0.002178,0.431412,1,57,5,1.0,1.0,1.0,2,7,1.407,3.0,51
8,583,81,0.00551,0.295485,1,109,24,0.990826,0.0,0.0,2,4,0.951,2.0,50
10,282,81,0.001782,0.551821,1,79,92,1.0,0.978261,0.978261,2,6,1.765,3.0,28


In [5]:
books.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,52.26087,3.913043,0.019081,0.518181,0.521739,15.26087,15.26087,0.521251,0.521251,0.956033,2.173913,3.478261,2.796174,2.76087,0.0
std,26.377017,2.043016,0.005917,0.445531,0.510754,5.370097,5.370097,0.478876,0.478876,0.111241,0.491026,0.790257,0.54391,0.720946,0.0
min,2.0,1.0,0.008221,0.01994,0.0,5.0,5.0,0.0,0.0,0.5,2.0,2.0,2.0,2.0,0.0
50%,50.0,4.0,0.018066,0.772748,1.0,15.0,15.0,0.8,0.8,1.0,2.0,4.0,2.833,2.5,0.0
max,91.0,7.0,0.028284,0.970356,1.0,24.0,24.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [6]:
books_top = books.loc[books['times'] >= 6]
temp = books_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((books_top.times * books_top.group) / sum(books_top.times)):.6f}")

2.989857 & 3.071429 & 3.714286 & 0.000000 & 0.019558 & 0.018066 & 0.028284 & 0.352294 & 0.030159 & 0.966886 & 0.919048 & 1.000000 & 1.000000 & 0.279070


In [7]:
blogs.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,667.122449,24.897959,0.005242,0.368015,0.673469,62.204082,33.122449,0.700698,0.511324,0.792839,4.51757e+18,1.734694,0.540551,2.3,15.244898
std,338.924379,32.338216,0.009037,0.132603,0.473804,36.609413,56.840652,0.407337,0.422285,0.30152,4.658506e+18,2.069375,0.70799,0.456435,16.476664
min,0.0,1.0,0.000244,0.151416,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
50%,546.0,8.0,0.001986,0.406004,1.0,61.0,16.0,0.985075,0.642857,0.931034,3.0,2.0,0.136,2.0,7.0
max,1214.0,120.0,0.045289,0.565289,1.0,178.0,287.0,1.0,1.0,1.0,9.223372e+18,7.0,2.333,3.0,55.0


In [8]:
blogs_top = blogs.loc[blogs['times'] >= 32]
temp = blogs_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((blogs_top.times * blogs_top.group) / sum(blogs_top.times)):.6f}")

1.048538 & 2.500000 & 4.000000 & 38.769231 & 0.010570 & 0.002295 & 0.045289 & 0.398312 & 0.431412 & 0.565289 & 0.723372 & 0.834951 & 1.000000 & 0.893122


In [9]:
dblp.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,6981.016647,15.453032,0.000234,0.153044,0.147444,19.142687,19.142687,0.154443,0.154443,0.753306,3.208086,3.608799,3.498027,3.545779,0.0
std,2860.974881,67.600387,0.000188,0.101888,0.354758,18.709906,18.709906,0.12552,0.12552,0.266584,0.828499,0.69376,0.683112,0.716296,0.0
min,5.0,1.0,2.6e-05,0.006216,0.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0
50%,6968.0,2.0,0.000187,0.126355,0.0,13.0,13.0,0.142857,0.142857,0.833333,3.0,4.0,3.889,4.0,0.0
max,13005.0,718.0,0.002166,0.601685,1.0,205.0,205.0,0.666667,0.666667,1.0,4.0,4.0,4.0,4.0,0.0


In [10]:
dblp_top = dblp.loc[dblp['times'] >= 9]
temp = dblp_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((dblp_top.times * dblp_top.group) / sum(dblp_top.times)):.6f}")

3.779097 & 3.986726 & 4.000000 & 0.000000 & 0.000476 & 0.000404 & 0.002166 & 0.147623 & 0.128347 & 0.447617 & 0.770683 & 0.834146 & 1.000000 & 0.119201


In [11]:
twitter.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,8973.142523,43.10514,0.00025,0.608257,0.719626,21.065421,18.432243,0.670496,0.629914,0.855373,7.908826e+18,1.053738,0.295119,5.819672,27.964953
std,5291.087694,210.627379,0.000338,0.163111,0.449708,29.550186,59.461066,0.422388,0.401387,0.226811,3.228136e+18,2.884897,0.993105,2.059597,126.605164
min,0.0,1.0,3.5e-05,0.176127,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
50%,8883.0,2.0,0.000108,0.639552,1.0,7.0,2.0,1.0,0.5,1.0,9.223372e+18,0.0,0.0,6.0,2.0
max,18461.0,1678.0,0.003275,0.837937,1.0,204.0,785.0,1.0,1.0,1.0,9.223372e+18,15.0,9.0,9.0,1032.0


In [12]:
twitter_top = twitter.loc[twitter['times'] >= 37]
temp = twitter_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((twitter_top.times * twitter_top.group) / sum(twitter_top.times)):.6f}")

1.746700 & 6.696429 & 9.333333 & 365.266667 & 0.000873 & 0.000671 & 0.003275 & 0.567982 & 0.662834 & 0.817765 & 0.910210 & 1.000000 & 1.000000 & 0.527677


# Fair

In [6]:
books = get_target_nodes_features("books", "fair")
blogs = get_target_nodes_features("blogs", "fair")
dblp = get_target_nodes_features("dblp_course", "fair")
dblp_fair = get_target_nodes_features("dblp_course", "fair")
twitter = get_target_nodes_features("twitter", "fair")

In [14]:
books.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,50.222222,2.5,0.011711,0.918747,0.972222,8.888889,8.888889,0.948541,0.948541,0.939282,3.027778,3.5,3.24125,3.194444,0.0
std,34.419356,2.299068,0.006092,0.082212,0.166667,5.548631,5.548631,0.098344,0.098344,0.134636,0.9706,0.696932,0.785752,0.847499,0.0
min,0.0,1.0,0.005531,0.591574,0.0,3.0,3.0,0.666667,0.666667,0.333333,2.0,2.0,2.0,2.0,0.0
50%,70.5,1.0,0.008927,0.952098,1.0,6.5,6.5,1.0,1.0,1.0,3.0,4.0,3.0,3.0,0.0
max,91.0,7.0,0.026273,0.973552,1.0,22.0,22.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [15]:
books_top = books.loc[books['times'] >= 5]
temp = books_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((books_top.times * books_top.group) / sum(books_top.times)):.6f}")

2.568500 & 2.400000 & 3.300000 & 0.000000 & 0.007763 & 0.007787 & 0.010310 & 0.969345 & 0.969804 & 0.973552 & 1.000000 & 1.000000 & 1.000000 & 1.000000


In [16]:
blogs.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,178.137931,42.068966,0.000272,0.603617,1.0,1.862069,9.896552,0.793103,0.999425,0.999425,2.862426e+18,3.793103,2.118966,4.375,19.689655
std,158.940178,54.429464,6.6e-05,0.016804,0.0,3.215189,18.908322,0.250615,0.003095,0.003095,4.342581e+18,2.87078,1.804557,0.958,30.552184
min,12.0,1.0,0.000244,0.57435,1.0,0.0,1.0,0.5,0.983333,0.983333,2.0,0.0,0.0,2.0,0.0
50%,115.0,3.0,0.000248,0.600076,1.0,1.0,2.0,1.0,1.0,1.0,4.0,5.0,2.24,4.0,1.0
max,615.0,121.0,0.000584,0.639208,1.0,13.0,76.0,1.0,1.0,1.0,9.223372e+18,8.0,6.0,6.0,121.0


In [17]:
blogs_top = blogs.loc[blogs['times'] >= 114]
temp = blogs_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((blogs_top.times * blogs_top.group) / sum(blogs_top.times)):.6f}")

2.024625 & 4.285714 & 5.625000 & 57.750000 & 0.000305 & 0.000259 & 0.000584 & 0.622179 & 0.620824 & 0.639208 & 1.000000 & 1.000000 & 1.000000 & 1.000000


In [18]:
dblp.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,6419.792513,8.246193,9.6e-05,0.332178,0.713198,7.334391,7.334391,0.339489,0.339489,0.43018,3.357234,3.769036,3.65676,3.697652,0.0
std,3705.253461,24.154062,0.000106,0.136281,0.452412,10.31383,10.31383,0.283825,0.283825,0.318526,0.779042,0.556593,0.568889,0.589959,0.0
min,5.0,1.0,1.9e-05,0.007652,0.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0
50%,6386.0,2.0,6.8e-05,0.340842,1.0,4.0,4.0,0.307692,0.307692,0.4,4.0,4.0,4.0,4.0,0.0
max,12997.0,327.0,0.002166,0.797311,1.0,205.0,205.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [19]:
dblp_top = dblp.loc[dblp['times'] >= 24]
temp = dblp_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((dblp_top.times * dblp_top.group) / sum(dblp_top.times)):.6f}")

3.886843 & 4.000000 & 4.000000 & 0.000000 & 0.000094 & 0.000065 & 0.000770 & 0.506018 & 0.511957 & 0.682384 & 0.644511 & 0.600000 & 1.000000 & 0.995308


In [20]:
twitter.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,8901.881449,20.251372,9.3e-05,0.711314,0.913282,6.177827,10.594951,0.706473,0.86347,0.868057,7.208607e+18,1.789243,0.670958,7.261307,14.50933
std,5413.434385,107.548392,0.000161,0.093865,0.281576,14.090336,41.453317,0.321914,0.225761,0.218193,3.813084e+18,3.78074,1.693959,2.801886,73.382182
min,1.0,1.0,3.5e-05,0.348939,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
50%,9083.0,1.0,4e-05,0.734956,1.0,1.0,1.0,0.833333,1.0,1.0,9.223372e+18,0.0,0.0,8.0,1.0
max,18452.0,1352.0,0.002654,1.0,1.0,150.0,785.0,1.0,1.0,1.0,9.223372e+18,16.0,13.0,13.0,983.0


In [21]:
twitter_top = twitter.loc[twitter['times'] >= 11]
temp = twitter_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((twitter_top.times * twitter_top.group) / sum(twitter_top.times)):.6f}")

1.975828 & 8.596154 & 9.086022 & 125.387097 & 0.000161 & 0.000078 & 0.001412 & 0.815022 & 0.808553 & 1.000000 & 1.000000 & 1.000000 & 1.000000 & 1.000000


# Hybrid

In [7]:
books = get_target_nodes_features("books", "hybrid_node2vec")
blogs = get_target_nodes_features("blogs", "hybrid_node2vec")
dblp = get_target_nodes_features("dblp_course", "hybrid_node2vec")
dblp_h = get_target_nodes_features("dblp_course", "hybrid_node2vec")
twitter = get_target_nodes_features("twitter", "hybrid_node2vec")

In [23]:
books.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,48.363636,2.727273,0.011903,0.916996,0.969697,9.0,9.0,0.943863,0.943863,0.933762,2.666667,3.333333,2.981879,2.939394,0.0
std,34.545096,1.92472,0.006334,0.085739,0.174078,5.787918,5.787918,0.101525,0.101525,0.139461,0.853913,0.692219,0.709005,0.807751,0.0
min,0.0,1.0,0.005531,0.591574,0.0,3.0,3.0,0.666667,0.666667,0.333333,2.0,2.0,2.0,2.0,0.0
50%,70.0,2.0,0.008832,0.955357,1.0,6.0,6.0,1.0,1.0,1.0,2.0,3.0,3.0,3.0,0.0
max,91.0,7.0,0.026273,0.973552,1.0,22.0,22.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [24]:
books_top = books.loc[books['times'] >= 4]
temp = books_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((books_top.times * books_top.group) / sum(books_top.times)):.6f}")

2.445636 & 2.318182 & 3.000000 & 0.000000 & 0.009177 & 0.008084 & 0.016523 & 0.968127 & 0.969590 & 0.973552 & 1.000000 & 1.000000 & 1.000000 & 1.000000


In [25]:
blogs.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,313.222222,33.888889,0.000839,0.564549,1.0,28.916667,31.555556,0.900133,0.971926,0.971926,1.024819e+18,3.361111,1.524694,2.765625,15.472222
std,163.736218,34.172485,0.000649,0.041893,0.0,29.444742,53.164674,0.199508,0.045447,0.045447,2.939744e+18,1.742786,0.894345,0.659721,17.830129
min,0.0,1.0,0.000244,0.504504,1.0,0.0,1.0,0.5,0.834951,0.834951,2.0,0.0,0.0,2.0,0.0
50%,342.0,24.0,0.000548,0.559754,1.0,21.0,11.5,1.0,1.0,1.0,2.0,3.5,1.8475,3.0,12.5
max,615.0,120.0,0.002627,0.639208,1.0,88.0,287.0,1.0,1.0,1.0,9.223372e+18,6.0,4.0,4.0,80.0


In [26]:
blogs_top = blogs.loc[blogs['times'] >= 52]
temp = blogs_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((blogs_top.times * blogs_top.group) / sum(blogs_top.times)):.6f}")

1.499667 & 2.714286 & 4.000000 & 37.333333 & 0.001275 & 0.001305 & 0.002627 & 0.552998 & 0.551821 & 0.621525 & 0.939249 & 0.952381 & 1.000000 & 1.000000


In [27]:
dblp.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,6562.735007,9.062762,0.000115,0.314419,0.660391,8.981172,8.981172,0.299622,0.299622,0.442503,3.317992,3.691771,3.58452,3.622734,0.0
std,3587.824403,36.205588,0.000126,0.14214,0.473742,12.110462,12.110462,0.266189,0.266189,0.328219,0.802559,0.632766,0.632742,0.656918,0.0
min,2.0,1.0,1.9e-05,0.007652,0.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0
50%,6579.5,1.0,7.6e-05,0.318966,1.0,5.0,5.0,0.25,0.25,0.4,4.0,4.0,4.0,4.0,0.0
max,13005.0,554.0,0.002166,0.797311,1.0,205.0,205.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [28]:
dblp_top = dblp.loc[dblp['times'] >= 9]
temp = dblp_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((dblp_top.times * dblp_top.group) / sum(dblp_top.times)):.6f}")

3.835180 & 3.981481 & 4.000000 & 0.000000 & 0.000189 & 0.000148 & 0.000770 & 0.442697 & 0.425074 & 0.706439 & 0.391300 & 0.333333 & 1.000000 & 1.000000


In [29]:
twitter.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,9055.825714,26.355714,0.000136,0.689268,0.885714,9.872857,12.255714,0.736977,0.819303,0.827074,7.378698e+18,1.598571,0.57214,6.910714,18.828571
std,5325.885292,146.989856,0.000245,0.096703,0.318385,19.763468,47.431439,0.338806,0.245931,0.235482,3.691987e+18,3.583502,1.504112,2.799838,101.810179
min,2.0,1.0,3.5e-05,0.348939,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
50%,9009.5,1.0,5.3e-05,0.698361,1.0,2.0,1.0,1.0,1.0,1.0,9.223372e+18,0.0,0.0,7.5,1.0
max,18461.0,1607.0,0.003275,1.0,1.0,204.0,785.0,1.0,1.0,1.0,9.223372e+18,15.0,12.0,14.0,1023.0


In [30]:
twitter_top = twitter.loc[twitter['times'] >= 24]
temp = twitter_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((twitter_top.times * twitter_top.group) / sum(twitter_top.times)):.6f}")

2.234157 & 8.100000 & 9.607843 & 230.450980 & 0.000388 & 0.000241 & 0.002654 & 0.812772 & 0.794099 & 1.000000 & 1.000000 & 1.000000 & 1.000000 & 1.000000


# Balanced

In [8]:
books = get_target_nodes_features("books", "dyadic_fair")
blogs = get_target_nodes_features("blogs", "dyadic_fair")
dblp = get_target_nodes_features("dblp_course", "dyadic_fair")
twitter = get_target_nodes_features("twitter", "dyadic_fair")

In [32]:
books.describe()

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,48.295082,1.47541,0.01224,0.48681,0.47541,9.393443,9.393443,0.479194,0.479194,0.946407,3.803279,3.852459,3.827869,3.827869,0.0
std,27.830046,0.744011,0.006006,0.432452,0.503539,5.438378,5.438378,0.468067,0.468067,0.130101,0.400819,0.357588,0.363837,0.363837,0.0
min,0.0,1.0,0.004968,0.018669,0.0,3.0,3.0,0.0,0.0,0.333333,3.0,3.0,3.0,3.0,0.0
25%,24.0,1.0,0.008084,0.029775,0.0,6.0,6.0,0.0,0.0,1.0,4.0,4.0,4.0,4.0,0.0
50%,50.0,1.0,0.010117,0.46624,0.0,8.0,8.0,0.5,0.5,1.0,4.0,4.0,4.0,4.0,0.0
75%,73.0,2.0,0.016242,0.944209,1.0,13.0,13.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0
max,91.0,3.0,0.028284,0.973552,1.0,24.0,24.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [33]:
books_top = books.loc[books["times"] >= 2]
temp = books_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((books_top.times * books_top.group) / sum(books_top.times)):.6f}")

3.825000 & 3.825000 & 3.900000 & 0.000000 & 0.012906 & 0.010509 & 0.028284 & 0.233525 & 0.056736 & 0.772748 & 0.875714 & 1.000000 & 1.000000 & 0.122449


In [34]:
blogs.describe()

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
count,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0,26.0,47.0
mean,297.553191,25.957447,0.000267,0.423853,0.617021,1.319149,6.617021,0.617021,0.616667,0.999645,4.121081e+18,2.914894,1.547447,4.442308,16.297872
std,250.959187,26.383425,5.9e-05,0.231049,0.491369,2.638399,15.364489,0.333179,0.491092,0.002431,4.63509e+18,2.819419,1.954632,0.668235,22.096542
min,12.0,1.0,0.000244,0.109263,0.0,0.0,1.0,0.0,0.0,0.983333,2.0,0.0,0.0,4.0,0.0
25%,92.5,1.0,0.000244,0.137264,0.0,0.0,1.0,0.5,0.0,1.0,4.0,0.0,0.0,4.0,1.0
50%,217.0,10.0,0.000244,0.589539,1.0,0.0,1.0,0.5,1.0,1.0,5.0,4.0,0.068,4.0,6.0
75%,461.5,55.0,0.000258,0.605719,1.0,1.5,2.0,1.0,1.0,1.0,9.223372e+18,5.0,3.3795,5.0,14.0
max,910.0,62.0,0.000584,0.639208,1.0,13.0,76.0,1.0,1.0,1.0,9.223372e+18,8.0,6.0,6.0,62.0


In [35]:
blogs_top = blogs.loc[blogs["times"] >= 55]
temp = blogs_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((blogs_top.times * blogs_top.group) / sum(blogs_top.times)):.6f}")

1.896615 & 4.300000 & 4.384615 & 32.923077 & 0.000284 & 0.000251 & 0.000584 & 0.435071 & 0.615364 & 0.639208 & 1.000000 & 1.000000 & 1.000000 & 0.614173


In [36]:
blogs_top.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,384.461538,58.615385,0.000284,0.435071,0.615385,2.0,1.230769,0.538462,0.615385,1.0,2.12847e+18,4.384615,1.896615,4.3,32.923077
std,305.082124,2.256046,9.3e-05,0.246416,0.50637,3.488075,0.438529,0.431158,0.50637,0.0,4.044716e+18,2.873397,1.81775,0.674949,25.078083
min,54.0,55.0,0.000244,0.132363,0.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,4.0,8.0
50%,332.0,59.0,0.000251,0.615364,1.0,1.0,1.0,0.5,1.0,1.0,4.0,5.0,3.089,4.0,14.0
max,910.0,62.0,0.000584,0.639208,1.0,13.0,2.0,1.0,1.0,1.0,9.223372e+18,8.0,4.255,6.0,62.0


In [37]:
dblp.describe()

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
count,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0,2722.0
mean,6619.372153,4.774431,9.1e-05,0.191442,0.360397,6.86701,6.86701,0.172938,0.172938,0.720301,3.634093,3.841293,3.780159,3.801616,0.0
std,3731.213528,12.978419,8.5e-05,0.171816,0.480204,8.343504,8.343504,0.257996,0.257996,0.353582,0.62284,0.453432,0.473701,0.480556,0.0
min,5.0,1.0,1.9e-05,0.003426,0.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0
25%,3348.75,1.0,5.1e-05,0.04746,0.0,3.0,3.0,0.0,0.0,0.4,3.0,4.0,3.875,4.0,0.0
50%,6715.0,2.0,6.9e-05,0.098112,0.0,4.0,4.0,0.0,0.0,1.0,4.0,4.0,4.0,4.0,0.0
75%,9861.5,3.0,9.7e-05,0.344811,1.0,8.0,8.0,0.3125,0.3125,1.0,4.0,4.0,4.0,4.0,0.0
max,12997.0,228.0,0.001032,0.730027,1.0,117.0,117.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [38]:
dblp_top = dblp.loc[dblp["times"] > 7]
temp = dblp_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((dblp_top.times * dblp_top.group) / sum(dblp_top.times)):.6f}")

3.922380 & 4.000000 & 4.000000 & 0.000000 & 0.000103 & 0.000075 & 0.000770 & 0.321801 & 0.431565 & 0.706439 & 0.739119 & 0.750000 & 1.000000 & 0.763812


In [39]:
dblp_top.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,6355.199377,25.205607,0.000103,0.321801,0.635514,7.663551,7.663551,0.389003,0.389003,0.739119,3.345794,4.0,3.92238,4.0,0.0
std,3626.466219,30.658728,9.7e-05,0.228817,0.482037,8.417553,8.417553,0.347387,0.347387,0.2751,0.638888,0.0,0.101026,0.0,0.0
min,31.0,8.0,1.9e-05,0.009049,0.0,1.0,1.0,0.0,0.0,0.0,2.0,4.0,3.524,4.0,0.0
50%,6394.0,15.0,7.5e-05,0.431565,1.0,5.0,5.0,0.4,0.4,0.75,3.0,4.0,3.95,4.0,0.0
max,12856.0,228.0,0.00077,0.706439,1.0,61.0,61.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [40]:
twitter.describe()

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
count,1463.0,1463.0,1463.0,1463.0,1463.0,1463.0,1463.0,1463.0,1463.0,1463.0,1463.0,1463.0,1463.0,347.0,1463.0
mean,9272.10458,12.61039,0.000103,0.522286,0.507861,7.820916,11.48257,0.53248,0.412112,0.807724,7.035737e+18,2.124402,0.831466,8.021614,8.693096
std,5410.791478,49.758087,0.000167,0.189979,0.500109,15.879356,39.531865,0.411113,0.381218,0.24145,3.924558e+18,3.954543,1.941218,1.670111,32.955378
min,1.0,1.0,3.5e-05,0.149708,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
25%,4510.0,1.0,3.6e-05,0.336549,0.0,1.0,0.0,0.023306,0.0,0.5,9.223372e+18,0.0,0.0,7.0,1.0
50%,9217.0,2.0,4.3e-05,0.602443,1.0,2.0,1.0,0.5,0.5,1.0,9.223372e+18,0.0,0.0,8.0,1.0
75%,14090.5,5.0,8.7e-05,0.639552,1.0,7.0,5.0,1.0,0.5,1.0,9.223372e+18,0.0,0.0,9.0,4.0
max,18465.0,709.0,0.002654,1.0,1.0,150.0,785.0,1.0,1.0,1.0,9.223372e+18,16.0,13.0,13.0,621.0


In [41]:
twitter_top = twitter.loc[twitter["times"] >= 11]
temp = twitter_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((twitter_top.times * twitter_top.group) / sum(twitter_top.times)):.6f}")

2.301828 & 8.097561 & 8.390625 & 52.802083 & 0.000186 & 0.000076 & 0.001822 & 0.461351 & 0.324677 & 1.000000 & 0.975451 & 1.000000 & 1.000000 & 0.430407


In [42]:
twitter_top.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,9166.057292,81.088542,0.000186,0.461351,0.333333,15.953125,7.145833,0.372754,0.330104,0.975451,1.345075e+18,8.390625,2.301828,8.097561,52.802083
std,5421.470963,116.185524,0.000259,0.258634,0.472637,24.450246,19.86735,0.410515,0.459455,0.113475,3.263799e+18,4.034993,1.866008,1.486006,77.722169
min,10.0,11.0,3.5e-05,0.149708,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,4.0
50%,9067.0,33.0,7.6e-05,0.324677,0.0,5.0,1.0,0.118055,0.0,1.0,7.0,9.0,2.0945,8.0,23.5
max,18465.0,709.0,0.001822,1.0,1.0,137.0,145.0,1.0,1.0,1.0,9.223372e+18,16.0,7.433,12.0,621.0


# Hybrid Balanced.

In [9]:
books = get_target_nodes_features("books", "hybrid_balanced_node2vec")
blogs = get_target_nodes_features("blogs", "hybrid_balanced_node2vec")
dblp = get_target_nodes_features("dblp_course", "hybrid_balanced_node2vec")
twitter = get_target_nodes_features("twitter", "hybrid_balanced_node2vec")

In [44]:
books.describe().iloc[[1, 2, 3, 5, 7]]


Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,52.585366,2.195122,0.014719,0.485332,0.463415,11.365854,11.365854,0.469045,0.469045,0.920264,3.268293,3.853659,3.603659,3.658537,0.0
std,26.525436,1.100443,0.007033,0.412346,0.504854,6.374779,6.374779,0.450897,0.450897,0.152488,0.671729,0.357839,0.404745,0.439304,0.0
min,1.0,1.0,0.004968,0.01994,0.0,3.0,3.0,0.0,0.0,0.333333,2.0,3.0,3.0,3.0,0.0
50%,52.0,2.0,0.014335,0.46624,0.0,10.0,10.0,0.5,0.5,1.0,3.0,4.0,3.75,4.0,0.0
max,91.0,4.0,0.028284,0.970356,1.0,24.0,24.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [45]:
books_top = books.loc[books["times"] >= 3]
temp = books_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((books_top.times * books_top.group) / sum(books_top.times)):.6f}")

3.659091 & 3.863636 & 4.000000 & 0.000000 & 0.020819 & 0.020224 & 0.028284 & 0.119366 & 0.030159 & 0.917100 & 0.993939 & 1.000000 & 1.000000 & 0.071429


In [46]:
blogs.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,578.706897,21.034483,0.002499,0.35964,0.5,43.517241,44.482759,0.628432,0.488783,0.979816,3.816568e+18,2.034483,1.054155,2.676471,13.034483
std,307.797895,20.489591,0.005518,0.199118,0.504367,37.956689,56.656317,0.381027,0.485259,0.036642,4.582299e+18,2.034488,1.206297,0.588814,15.037923
min,0.0,1.0,0.000244,0.128545,0.0,0.0,1.0,0.0,0.0,0.834951,2.0,0.0,0.0,2.0,0.0
50%,561.5,13.0,0.001303,0.372818,0.5,35.0,28.0,0.688041,0.451958,1.0,3.0,2.0,0.1095,3.0,10.0
max,1212.0,62.0,0.03905,0.621525,1.0,178.0,287.0,1.0,1.0,1.0,9.223372e+18,6.0,4.0,4.0,54.0


In [47]:
blogs_top = blogs.loc[blogs["times"] >= 37]
temp = blogs_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((blogs_top.times * blogs_top.group) / sum(blogs_top.times)):.6f}")

1.284800 & 2.769231 & 3.600000 & 27.400000 & 0.002025 & 0.001377 & 0.006449 & 0.358408 & 0.505282 & 0.565289 & 0.953308 & 0.969697 & 1.000000 & 0.570505


In [48]:
dblp.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,6884.400814,7.560209,0.000156,0.206821,0.380454,12.440372,12.440372,0.178832,0.178832,0.652506,3.450262,3.757417,3.673721,3.707388,0.0
std,3267.350154,25.160712,0.00015,0.159633,0.48564,14.721624,14.721624,0.211089,0.211089,0.35284,0.720358,0.555344,0.562106,0.576669,0.0
min,2.0,1.0,2.2e-05,0.003426,0.0,1.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0
50%,6881.0,2.0,0.000109,0.152665,0.0,8.0,8.0,0.122449,0.122449,0.8,4.0,4.0,4.0,4.0,0.0
max,13005.0,383.0,0.002166,0.706439,1.0,205.0,205.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,0.0


In [49]:
dblp_top = dblp.loc[dblp["times"] > 11]
temp = dblp_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((dblp_top.times * dblp_top.group) / sum(dblp_top.times)):.6f}")

3.865278 & 4.000000 & 4.000000 & 0.000000 & 0.000261 & 0.000212 & 0.001032 & 0.272132 & 0.336892 & 0.682384 & 0.616482 & 0.600000 & 1.000000 & 0.691926


In [50]:
twitter.describe().iloc[[1, 2, 3, 5, 7]]

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
mean,9265.300664,15.32309,0.000138,0.551941,0.602159,11.098007,12.608804,0.605886,0.442391,0.729517,7.216293e+18,1.871262,0.753255,7.646947,10.494186
std,5388.03648,66.664888,0.000213,0.164139,0.489656,20.046339,44.454398,0.433296,0.331982,0.2466,3.807323e+18,3.704039,1.90049,1.698086,43.51395
min,8.0,1.0,3.5e-05,0.149708,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
50%,9250.5,1.0,4.7e-05,0.639552,1.0,2.0,1.0,0.914216,0.5,0.620193,9.223372e+18,0.0,0.0,8.0,1.0
max,18465.0,709.0,0.002654,1.0,1.0,150.0,785.0,1.0,1.0,1.0,9.223372e+18,15.0,13.0,14.0,621.0


In [51]:
twitter_top = twitter.loc[twitter["times"] >= 10]
temp = twitter_top.describe().iloc[[1, 2, 3, 5, 7]]
print(f"{temp.AverageDistance['mean']:.6f} & {temp.MedianDistance['mean']:.6f} & {temp.MaxDistance['mean']:.6f} & {temp.NoExistsDistance['mean']:.6f}\
 & {temp['pagerank']['mean']:.6f} & {temp['pagerank']['50%']:.6f} & {temp['pagerank']['max']:.6f}\
 & {temp['redPagerank']['mean']:.6f} & {temp['redPagerank']['50%']:.6f} & {temp['redPagerank']['max']:.6f}\
 & {temp['out_homophily']['mean']:.6f} & {temp['out_homophily']['50%']:.6f} & {temp['out_homophily']['max']:.6f}\
 & {sum((twitter_top.times * twitter_top.group) / sum(twitter_top.times)):.6f}")

2.268079 & 7.809353 & 8.042683 & 65.567073 & 0.000353 & 0.000240 & 0.002654 & 0.484619 & 0.357163 & 1.000000 & 0.975267 & 1.000000 & 1.000000 & 0.421515


In [52]:
twitter_top

Unnamed: 0,Nodes,times,pagerank,redPagerank,group,inDegree,outDegree,redNeighborsInRatio,redNeighborsOutRatio,out_homophily,MinDistance,MaxDistance,AverageDistance,MedianDistance,NoExistsDistance
3,1320,709,0.000671,0.176127,0,70,1,0.014286,0.000000,1.000000,3,10,2.329,7.0,471
19,4826,670,0.000766,0.817765,1,92,1,0.967391,1.000000,1.000000,7,15,5.157,10.0,330
11,6452,663,0.001822,0.207209,0,102,5,0.019608,0.000000,1.000000,2,9,2.157,6.0,423
2,16720,621,0.000298,1.000000,1,5,1,0.600000,1.000000,1.000000,9223372036854775807,0,0.000,,621
15,17325,598,0.000577,0.286950,0,71,1,0.014085,0.000000,1.000000,7,13,2.532,10.0,439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,5455,10,0.000759,0.348784,0,112,84,0.017857,0.011905,0.988095,6,6,0.600,6.0,9
230,15440,10,0.000344,0.728516,1,41,7,0.878049,1.000000,1.000000,9223372036854775807,0,0.000,,10
332,6548,10,0.000086,0.335146,0,8,2,0.250000,0.000000,1.000000,9223372036854775807,0,0.000,,10
212,6813,10,0.000075,0.304402,0,9,1,0.111111,0.000000,1.000000,9223372036854775807,0,0.000,,10


# Names

In [18]:
dblp_names.head()

Unnamed: 0,Node_id,Author_Name
0,0,Frank Hing-Wah Luk
1,1,David Nilsson
2,2,Emelie Kullmann
3,3,Gabriel Isheden
4,4,Martin Bohman


In [22]:
dblp_names = pd.read_csv("dblp_course/names.txt", sep="\t")
dblp_n2v_top = dblp_n2v.head(20).join(dblp_names.set_index("Node_id"), on="Nodes")[["Author_Name", "times"]]
dblp_fair_top = dblp_fair.head(20).join(dblp_names.set_index("Node_id"), on="Nodes")[["Author_Name", "times"]]
dblp_h_top = dblp_h.head(20).join(dblp_names.set_index("Node_id"), on="Nodes")[["Author_Name", "times"]]
dblp_b_top = dblp_b.head(20).join(dblp_names.set_index("Node_id"), on="Nodes")[["Author_Name", "times"]]
dblp_hb_top = dblp_hb.head(20).join(dblp_names.set_index("Node_id"), on="Nodes")[["Author_Name", "times"]]

In [39]:
print("\\hline")
print("Node2vec & Fair & Hybrid n2v & Balanced & Hybrid bal. n2v")
print("\\hline")
values = list()
for n, f, h, b, hb in zip(dblp_n2v_top.iterrows(), dblp_fair_top.iterrows(), dblp_h_top.iterrows(), dblp_b_top.iterrows(), dblp_hb_top.iterrows()):
    temp_value = n[1].Author_Name + " (" + str(n[1].times) + ") & "
    temp_value += f[1].Author_Name + " (" + str(f[1].times) + ") & "    
    temp_value += h[1].Author_Name + " (" + str(h[1].times) + ") & "    
    temp_value += b[1].Author_Name + " (" + str(b[1].times) + ") & "    
    temp_value += hb[1].Author_Name + " (" + str(hb[1].times) + ") "
    values.append(temp_value)

for row in values:
    print(row + "\\\\")
    print("\\hline")

\hline
Node2vec & Fair & Hybrid n2v & Balanced & Hybrid bal. n2v
\hline
Douglas W. Oard (718) & Elena Ferrari (327) & Elke A. Rundensteiner (554) & Elena Ferrari (228) & Elke A. Rundensteiner (383) \\
\hline
Wolfgang Lehner (638) & Barbara Carminati (277) & Elena Baralis (497) & Elena Baralis (190) & Elena Baralis (354) \\
\hline
Karl Aberer (610) & Elena Baralis (274) & Ana Paula Appel (372) & Barbara Carminati (185) & Ana Paula Appel (257) \\
\hline
Maarten de Rijke (597) & Bahar Ghadiri Bashardoost (252) & Elena Ferrari (347) & Bahar Ghadiri Bashardoost (168) & Elena Ferrari (241) \\
\hline
Clement T. Yu (588) & Kelly Lyons (239) & Elisa Bertino (311) & Marina Buzzi (159) & Elisa Bertino (236) \\
\hline
Volker Markl (536) & Marina Buzzi (228) & Lois M. L. Delcambre (299) & Kelly Lyons (157) & Isabel F. Cruz (211) \\
\hline
Amit P. Sheth (529) & Christina Christodoulakis (217) & Silvana Castano (298) & Christina Christodoulakis (151) & Shashi Shekhar (205) \\
\hline
Martin Ester (487

In [34]:
for index, row in dblp_h_top.iterrows():
    print(row.Author_Name + str(row.times))

Elke A. Rundensteiner554
Elena Baralis497
Ana Paula Appel372
Elena Ferrari347
Elisa Bertino311
Lois M. L. Delcambre299
Silvana Castano298
Shashi Shekhar294
Barbara Carminati277
Sonia Bergamaschi277
Isabel F. Cruz275
Claudia Plant218
Barbara Leporini210
Barbara Catania194
Melanie Herschel184
Angela Bonifati181
Christina Christodoulakis168
Tova Milo164
Bhavani M. Thuraisingham148
Noriko Kando147
