# Part 1

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
from collections import Counter
import random
import time

In [2]:
a_q=pd.read_csv('a2q',sep=' ', names=['u','v','t'])  #answers to questions
c_a=pd.read_csv('c2a',sep=' ', names=['u','v','t'])  #comments to answers
c_q=pd.read_csv('c2q',sep=' ', names=['u','v','t'])  #comments to questions

In [3]:
print(a_q.shape,c_a.shape,c_q.shape)

(17823525, 3) (25405374, 3) (20268151, 3)


In [4]:
#change time to hours
a_q['t'] = a_q['t'].div(3600)
c_a['t'] = c_a['t'].div(3600)
c_q['t'] = c_q['t'].div(3600)

#add type column to dataframes
a_q['type'] = 'a_q'
c_a['type'] = 'c_a'
c_q['type'] = 'c_q'

#add weights column
a_q['weight'] = 1
c_a['weight'] = 1
c_q['weight'] = 1

#sort by time
a_q=a_q.sort_values('t')
c_a=c_a.sort_values('t')
c_q=c_q.sort_values('t')

In [5]:
def time_maxmin(data,min,max): #Retrieve maximum and minimum timestamps
    m = data['t'].min()
    M = data['t'].max()
    if m < min:
        min = m 
    if M > max:
        max = M
    return min, max

In [6]:
def create_graph(data1,time_window,self_loops=0):
    #remove rows with u=v
    if self_loops == 0: 
        data1 = data1[data1['u'] != data1['v']]
    
    #remove rows not having time 't' in given time interval
    data1 = data1[data1['t'].between(time_window[0], time_window[1])]
    
    #assign weight --> 1 for every edge ,+1 every time u,v (interact)
    D = data1[['u','v']]
    D = D[D.duplicated(keep=False)]
    D = D.groupby(list(D)).apply(lambda x: tuple(x.index)).tolist()
    for d in D:
        ind = list(d)
        val = len(ind)
        drop = ind[:-1]
        data1.at[ind[-1], 'weight'] = val
        data1 = data1.drop(index = drop)
    
    G = nx.from_pandas_edgelist(data1, 'u', 'v', ['t', 'type', 'weight'], create_using = nx.MultiDiGraph())
    return G

In [7]:
m = 10**(10)
M = 0
m, M = time_maxmin(a_q, m, M)
m, M = time_maxmin(c_a, m, M)
m, M = time_maxmin(c_q, m, M)
print(m, M)

338213.2991666667 404798.1744444444


In [8]:
#obviously choose a time window that's inside (m,M)
time_window = (m, m+1000)
A = create_graph(a_q, time_window)
B = create_graph(c_a, time_window)
C = create_graph(c_q, time_window)

In [9]:
#merge the graphs
G = nx.compose_all([A, B, C])

# Part 2

## function 1 

It takes in input:

    One of the 3 graphs

The output should return:

    Whether the graph is directed or not
    Number of users
    Number of answers/comments
    Average number of links per user
    Density degree of the graph
    Whether the graph is sparse or dense


In [10]:
def getFeatures(G):
    feats=[] 
    #m = G.number_of_edges()    = n_interactions
    #n = sum(G.nodes)           = n_users
    
    direct = False
    n_interactions = 0
    check_dir = {}
    users=[]
    for u,v,attr in (G.edges(data=True)):
        if ((u,v) or (v,u)) not in check_dir:
            check_dir[(u,v)]=0
        else:
            #print(u,v)
            direct=True
        
        if u not in users: users.append(u)
        if v not in users: users.append(v)

        n_interactions += 1
    
    n_users = len(users)
     
    avg_links = 0
    density = 0
    g = None
    if n_users>0: 
        avg_links = n_interactions/n_users
        
        if direct:
            density = 2*n_interactions/(n_users*(n_users-1))
        else:
            density = n_interactions/(n_users*(n_users-1))
                                
        if density>0.5:
            g='dense'
        else:
            g='sparse'
        
    feats.extend((direct, n_users, n_interactions, avg_links, density, g))
    
    return feats

In [11]:
getFeatures(A)

[False, 3172, 21871, 6.895018915510719, 0.002174398901138669, 'sparse']

## function 2 

It takes in input:

    A user/node
    An interval of time
    One of the following metrics: Betweeness 1, PageRank, ClosenessCentrality 3, DegreeCentrality

The output should return:

    The value of the given metric applied over the complete graph for the given interval of time

Give an explanaition regarding the features of the user based on all of the metrics (e.g. if the betweeness metric is high, what does this mean in practice, what if the betweeness is low but it has a high PageRank value, etc.)

In [12]:
#costruisco funzione per grafi in cui è gia stato preso in considerazione l'intervallo di tempo
def dist(G, u, metric, time_window, alpha=0.2): 
    values = getFeatures(G)
    n = values[1]
    
    if metric == 'Betweeness':
        return 
    
    elif metric == 'PageRank':
        return RandomSurfer(G, alpha, T=100)
    
    elif metric == 'Closeness':
        return #nx.closeness_centrality(G, u=int(u), wf_improved=False)
    
    elif metric == 'Degree':
        return (G.degree(int(u))/(n-1)
    
    else:
        raise('Metric not allowed')

SyntaxError: invalid syntax (3398047580.py, line 18)

### PageRank (Random surfer model)

In [13]:
def build_adj_matrix(G):
    n = len(G)
    if n == 0: raise('No nodes found')
    else:
        ones = {}
        
        for u,v in sorted(list(G.edges())):
            ones[(u,v)] = 1 
    return ones

In [14]:
def mapper(G): #return dictionary with nodes as keys and index in amtrix as value
    conv={}
    counter=0
    for u in sorted(G.nodes):
        conv[u]=counter
        counter+=1
    return conv

In [15]:
def no_tp_P(G,mat,mapper):
    keys = list(mat.keys())
    c = Counter(u for u,v in keys)
    for source in sorted(list(G)):
        for (u,v) in keys:
            if source == u and c[source] > 0:
                mat[(u,v)] = 1/c[source]
    keys = list(mat.keys())
    M=np.zeros(shape=(len(G),len(G)))
    for (u,v) in keys:
        value=mat[(u,v)]
        i=mapper[u]
        j=mapper[v]
        M[i][j]=value
    return M

In [80]:
def build_P(M, alpha, n): #size of matrix
    P = (alpha/n * np.ones((n,n))) + ((1-alpha)*(M))   
    return P

In [182]:
def RandomSurfer(G, alpha, T=100): #T=time, aka n_iterations
    n = len(G)
    print('sit back and relax, this is going to take some time')
    mat = build_adj_matrix(G) 
    mapp = mapper(G)
    mat = no_tp_P(G,mat,mapp)
    P = build_P(mat, alpha, n)
    
    start = random.randint(0,n)
    #retrieve node given its index in amtrix
    key_list = list(mapp.keys())
    val_list = list(mapp.values())
    s = key_list[val_list.index(start)] #s=starting node
    
    print(('starting from node {}').format(s))
    
    q0=np.zeros((n,1))
    q0[start]=1
    store_value=[]
    conv=False
    for t in tqdm(range(T)):
        q = np.matmul(q0.T, np.linalg.matrix_power(P, t))
        if (t>0) and (np.array_equal(q, store_value[-1])):
            print(('converged in {} steps').format(t-1))
            q=store_value[-2] #else we are taking a all zeros array
            conv=True
            break
        store_value.append(q)
    if conv == False: print("didn't converge")
    return q

In [183]:
t=time.time()
q3=RandomSurfer(G,0.2)
fin=(time.time()-t)/60
print(('You just wasted {} minutes of your life').format(round(fin,2)))

sit back and relax, this is going to take some time
starting from node 2328


100%|█████████████████████████████████| 100/100 [05:08<00:00,  3.09s/it]

didn't converge
You just wasted 5.23 minutes of your life





In [None]:
store[0]

In [179]:
q3

array([[5.95836229e-07, 5.35250795e-08, 1.41067609e-08, ...,
        1.88336197e-07, 7.95595014e-08, 3.70365674e-08]])

In [180]:
s

1523

In [158]:
trial=a_q.head(100)

In [159]:
trial=trial.sort_values('t')

In [160]:
trial

Unnamed: 0,u,v,t,type,weight
0,9,8,338213.299167,a_q,1
1,1,1,338214.944722,a_q,1
2,13,1,338223.957500,a_q,1
3,17,1,338227.121944,a_q,1
4,48,2,338227.272778,a_q,1
...,...,...,...,...,...
9995,13,1597,338793.893611,a_q,1
9996,2199,1199387,338793.950833,a_q,1
9997,1862,2754,338793.956389,a_q,1
9998,1196,2754,338793.961111,a_q,1


In [161]:
T=create_graph(trial, (338213,338229))

In [162]:
t=time.time()
q3,s,store=RandomSurfer(T,0.2,3200)
fin=(time.time()-t)/60
print(('If you got here you just wasted {} minutes of your life').format(fin))

100%|█████████████████████████████| 3200/3200 [00:00<00:00, 5188.78it/s]

If you got here you just wasted 0.010363380114237467 minutes of your life





In [163]:
q3

array([[6.20667037e-231, 5.79212139e-231, 1.12513186e-230,
        5.79212139e-231, 4.80148694e-231, 5.79212139e-231,
        2.98175453e-231, 2.98175453e-231, 5.79212139e-231,
        4.80148694e-231, 4.38693796e-231, 7.19730483e-231,
        2.98175453e-231, 8.89079022e-230, 2.98175453e-231,
        8.81845210e-230, 2.98175453e-231, 2.98175453e-231,
        2.98175453e-231, 2.98175453e-231, 2.98175453e-231]])