# Part 1

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx

In [2]:
a_q=pd.read_csv('a2q',sep=' ', names=['u','v','t'])  #answers to questions
c_a=pd.read_csv('c2a',sep=' ', names=['u','v','t'])  #comments to answers
c_q=pd.read_csv('c2q',sep=' ', names=['u','v','t'])  #comments to questions

In [3]:
print(a_q.shape,c_a.shape,c_q.shape)

(17823525, 3) (25405374, 3) (20268151, 3)


In [4]:
#change time to hours
a_q['t'] = a_q['t'].div(3600)
c_a['t'] = c_a['t'].div(3600)
c_q['t'] = c_q['t'].div(3600)

#add type column to dataframes
a_q['type'] = 'a_q'
c_a['type'] = 'c_a'
c_q['type'] = 'c_q'

#add weights column
a_q['weight'] = 1
c_a['weight'] = 1
c_q['weight'] = 1

#sort by time
a_q=a_q.sort_values('t')
c_a=c_a.sort_values('t')
c_q=c_q.sort_values('t')

In [5]:
def time_maxmin(data,min,max): #Retrieve maximum and minimum timestamps
    m = data['t'].min()
    M = data['t'].max()
    if m < min:
        min = m 
    if M > max:
        max = M
    return min, max

In [6]:
def create_graph(data1,time_window,self_loops=0):
    #remove rows with u=v
    if self_loops == 0: 
        data1 = data1[data1['u'] != data1['v']]
    
    #remove rows not having time 't' in given time interval
    data1 = data1[data1['t'].between(time_window[0], time_window[1])]
    
    #assign weight --> 1 for every edge ,+1 every time u,v (interact)
    D = data1[['u','v']]
    D = D[D.duplicated(keep=False)]
    D = D.groupby(list(D)).apply(lambda x: tuple(x.index)).tolist()
    for d in D:
        ind = list(d)
        val = len(ind)
        drop = ind[:-1]
        data1.at[ind[-1], 'weight'] = val
        data1 = data1.drop(index = drop)
    
    G = nx.from_pandas_edgelist(data1, 'u', 'v', ['t', 'type', 'weight'], create_using = nx.MultiDiGraph())
    return G

In [7]:
m = 10**(10)
M = 0
m, M = time_maxmin(a_q, m, M)
m, M = time_maxmin(c_a, m, M)
m, M = time_maxmin(c_q, m, M)
print(m, M)

338213.2991666667 404798.1744444444


In [8]:
#obviously choose a time window that's inside (m,M)
time_window = (m, m+2000)
A = create_graph(a_q, time_window)
B = create_graph(c_a, time_window)
C = create_graph(c_q, time_window)

In [9]:
#merge the graphs
G = nx.compose_all([A, B, C])

# Part 2

## function 1 

It takes in input:

    One of the 3 graphs

The output should return:

    Whether the graph is directed or not
    Number of users
    Number of answers/comments
    Average number of links per user
    Density degree of the graph
    Whether the graph is sparse or dense


In [10]:
def getFeatures(G):
    feats=[] 
    #m = G.number_of_edges() = n_interactions
    #n = sum(G.nodes) = n_users
    
    direct = False
    n_interactions = 0
    check_dir = {}
    users=[]
    for u,v,attr in (G.edges(data=True)):
        if ((u,v) or (v,u)) not in check_dir:
            check_dir[(u,v)]=0
        else:
            direct=True
        
        if u not in users: users.append(u)
        if v not in users: users.append(v)

        n_interactions += 1
    
    n_users = len(users)
     
    avg_links = 0
    density = 0
    g = None
    if n_users>0: 
        avg_links = n_interactions/n_users
        
        if direct:
            density = 2*n_interactions/(n_users*(n_users-1))
        else:
            density = n_interactions/(n_users*(n_users-1))
                                
        if density>0.5:
            g='dense'
        else:
            g='sparse'
        
    feats.extend((direct, n_users, n_interactions, avg_links, density, g))
    
    return feats

In [11]:
getFeatures(A)

[False, 12605, 101675, 8.066243554145181, 0.0006399748932200238, 'sparse']

## function 2 

It takes in input:

    A user/node
    An interval of time
    One of the following metrics: Betweeness 1, PageRank, ClosenessCentrality 3, DegreeCentrality

The output should return:

    The value of the given metric applied over the complete graph for the given interval of time

Give an explanaition regarding the features of the user based on all of the metrics (e.g. if the betweeness metric is high, what does this mean in practice, what if the betweeness is low but it has a high PageRank value, etc.)

In [12]:
#costruisco funzione per grafi in cui è gia stato preso in considerazione l'intervallo di tempo
def dist(G, u, metric, time_window): 
    values = getFeatures(G)
    n = values[1]
    
    if metric == 'Betweeness':
        return 
    
    elif metric == 'PageRank':
        return 
    
    elif metric == 'Closeness':
        return #nx.closeness_centrality(G, u=int(u), wf_improved=False)
    
    elif metric == 'Degree':
        return (G.degree(int(u))/(n-1)
    
    else:
        raise('Metric not allowed')

SyntaxError: invalid syntax (3398047580.py, line 18)