# Embeddings and User - User Similarities

### Import Packages 📦

In [29]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import networkx as nx
import random
import matplotlib.pyplot as plt

%pip install node2vec
from node2vec import Node2Vec

%pip install gensim
import gensim.downloader
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Select the event to be analyzed, and import the graph 

In [30]:
chosen_event = 'earthquake'

# path = f'C:/Users/Utilisateur/Documents/M2/Web Mining/Projet/subgraph/{chosen_event}_subgraph.graphml'
path = f'subgraphs_data/{chosen_event}_subgraph.graphml'
g = nx.read_graphml(path)

### 2.1 - Sampling N users

In [31]:
users = []
nodes_users = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':User':
                nodes_users.append(node)
                users.append({node : data})

#We choose the size of the sample
n=100

# Sample n/2 users with highest degree centrality
degree_centralities = nx.degree_centrality(g) #We first compute the degree coef fo all nodes
degree_filtred = {key: value for key, value in degree_centralities.items() if key in nodes_users} #We then only select the users nodes
top_degree_users = sorted(degree_filtred, key=lambda x: degree_filtred[x], reverse=True)[0:int(n/2)] #We take the 50 users that have the highest degree centrality


filtered_nodes = [node for node in nodes_users if node not in top_degree_users] #We create a list of all the users except the ones already in the top 50 of degree centrality

# Sample n/2 users with highest closeness centrality (taking away the nodes that are already in the highest degree)
closeness_centralities = nx.closeness_centrality(g) #We first compute the closeness coef fo all nodes
closeness_filtred = {key: value for key, value in closeness_centralities.items() if key in filtered_nodes} #We then only select the users nodes (that are not in the top 50 degree centrality)
top_closeness_users = sorted(closeness_filtred, key=lambda x: closeness_filtred[x], reverse=True)[0:int(n/2)]  #We take the 50 users that have the highest closeness centrality

# List of the selected users
list_user = list(set(top_degree_users + top_closeness_users))

users_sample = []

for node_dict in users:
    node_key = list(node_dict.keys())[0]  
    if node_key in list_user:  
        users_sample.append({node_key: node_dict[node_key]})

# exemple of user 
users_sample[0]

{'n96392': {'labels': ':User',
  'isVerified': True,
  'followers_count': 41496,
  'listed_count': 1098,
  'statuses_count': 6046,
  'favourites_count': 292,
  'id': '105153029',
  'screen_name': 'CNNImpact',
  'friends_count': 420,
  'name': 'Impact Your World',
  'tweets_count': 1}}

## 2.ii.a - Embeddings on Graph Structure

In [32]:
# first, we need the associated nodes ids to the sample we definded : 
node_sample_ids = []
for i in range(len(users_sample)):
    for key, _ in users_sample[i].items():
        node_sample_ids.append(key)

# define the subgraph with the sample of 100 nodes
sample_graph = g.subgraph(node_sample_ids)

In [33]:
# we can then build the embeddings of the sampled graph with Node2Vec
node2vec = Node2Vec(sample_graph, dimensions=50)
fitted_model_n2v = node2vec.fit(window=10, min_count=1) 

Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 10791.43it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 158.75it/s]




We can have access to the 10 most similar nodes of a node, based on cosine similarity

In [34]:
fitted_model_n2v.wv.most_similar('n96392')

[('n75904', 0.9822466969490051),
 ('n70567', 0.9814179539680481),
 ('n63697', 0.9792734384536743),
 ('n96427', 0.979026198387146),
 ('n89546', 0.978333055973053),
 ('n96452', 0.9771556854248047),
 ('n80314', 0.9757769703865051),
 ('n96155', 0.9748070240020752),
 ('n58317', 0.9738568663597107),
 ('n96126', 0.9738232493400574)]

Or we can build a cosine similarity matrix :

In [35]:
# get the embeddings for every node
node_embeddings_n2v = {node: fitted_model_n2v.wv[node] for node in sample_graph.nodes()}

# list element is easier to handle
list_of_embeddings_n2v = []
for key, value in node_embeddings_n2v.items():
    list_of_embeddings_n2v.append({key : value})

In [36]:
# build the cosine similarity matrix
cos_sim_n2v = [[0 for _ in range(100)] for _ in range(100)]

for i, emb_i_dict in enumerate(list_of_embeddings_n2v):
    for j, emb_j_dict in enumerate(list_of_embeddings_n2v):
        emb_i = next(iter(emb_i_dict.values())) 
        emb_j = next(iter(emb_j_dict.values()))

        cosine_sim = cosine_similarity([emb_i], [emb_j])[0][0]
        cos_sim_n2v[i][j] = cosine_sim

In [37]:
# from this, we can get the more similar users in our sample graph :
arr_cos_sim_n2v = np.array(cos_sim_n2v)
np.fill_diagonal(arr_cos_sim_n2v, -np.inf) # we replace the 1 elements of the diagonal by -inf

# 10 most similar nodes
v = []

nb_users_to_print = 10
for _ in range(nb_users_to_print):
    max_index = np.argmax(arr_cos_sim_n2v)
    max_sim = np.max(arr_cos_sim_n2v)

    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim_n2v).shape)

    arr_cos_sim_n2v[max_row_index][max_col_index] = -np.inf
    arr_cos_sim_n2v[max_col_index][max_row_index] = -np.inf

    v.append((max_row_index, max_col_index, max_sim))

print(f"The {nb_users_to_print} most similar pairs of users are :")
v

The 10 most similar pairs of users are :


[(28, 60, 0.997758),
 (28, 32, 0.9970872),
 (32, 60, 0.99693966),
 (2, 28, 0.9957464),
 (54, 76, 0.995324),
 (20, 28, 0.99503094),
 (2, 60, 0.9949315),
 (2, 20, 0.9948659),
 (2, 32, 0.9945479),
 (20, 60, 0.99374366)]

## 2.ii.b - Embeddings on Post Content

In [38]:
# we already have our sampled graph and the users, we now need the tweets
tweets = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':Tweet':
                tweets.append({node : data})

# exemple of tweet
tweets[0]

{'n21102': {'labels': ':Tweet',
  'is_quote_status': True,
  'possibly_sensitive': False,
  'retweet_count': 0,
  'favorite_count': 0,
  'id_str': '592834972265934849',
  'isTruncated': False,
  'annotation_postPriority': 'Low',
  'created_at': '2015-04-27T00:00Z',
  'id': '592834972265934849',
  'annotation_annotated': True,
  'annotation_num_judgements': 1,
  'text': 'clever... https://t.co/kl6DVrZX0w',
  'topic': 'TRECIS-CTIT-H-019'}}

In [39]:
# build a list containing dictionnaries with the user_node_id and every tweet made by this user
users_posts = []

for u in users_sample:
    user_node_id = [key for key, _ in u.items()][0]
    tweets_by_user = []
    
    for t in tweets:
        tweet_node_id = [key for key, _ in t.items()][0]
        text_tweet = [value for _, value in t.items()][0]['text']

        if tweet_node_id in g[user_node_id]:
            tweets_by_user.append(text_tweet)

    users_posts.append({'user':user_node_id,
                        'tweets':tweets_by_user})
    
# exemple of users posts :
users_posts[:2]

[{'user': 'n96392',
  'tweets': ['These @USAID search and rescue dogs are heading to #Nepal. How you can help #NepalQuakeRelief: http://t.co/XgKnjvvNup http://t.co/JoPISpNIr4']},
 {'user': 'n64670',
  'tweets': ['PDRRMC said that eight bodies had already been recovered from the debris of the supermarket that was torn down by the earthquake in Pampanga, while at least 20 others had been dug up alive but with injuries, including a woman whose leg had to be amputated. | via Ding Cervantes https://t.co/no0xTEAgCx',
   'The Philippine Institute of Volcanology and Seismology dismissed rumors saying that a magnitude 7.1 earthquake may hit Metro Manila soon. https://t.co/XifnFpQhOy',
   'The entire Pampanga province is now in a state of calamity amid ruins and deaths caused by the 6.1 magnitude earthquake on Monday afternoon. | via Ding Cervantes https://t.co/TWz49BYz9A',
   'The Department of Education has ordered a thorough inspection of all school buildings and facilities after a magnitude 

In [40]:
# let us build the empbeddings of our tweets, using a pretrained Word2Vec model on twitter data, using embeddings of length 50 to match the graph structure embeddings
w2v = gensim.downloader.load('glove-twitter-50')

In [41]:
# we define a function that allows us to tokenize a tweet/sentence, and we take the average of each embedding
def sentence_embedding(sentence):
    if len(sentence)>0:
        tokens = sentence.split()
        embeddings = [w2v[token] for token in tokens if token in w2v]
        
        if len(tokens) > 0 and embeddings:
                avg_embedding = np.mean(embeddings, axis=0)
        else:
            avg_embedding = np.zeros(w2v.vector_size)
    else:
         avg_embedding = np.zeros(w2v.vector_size)
    return avg_embedding

In [42]:
# we then compute and add the embedded tweets to the users_posts list, by adding a key to each dictionnary in the list
for i in users_posts:
    tweets = i['tweets']
    embedded_tweets = []

    if len(tweets) == 0:
        avg_emb_tweet = np.zeros(w2v.vector_size)

    else :
        for j in i['tweets']:
            emb_tweet = sentence_embedding(j)
            embedded_tweets.append(emb_tweet)
            avg_emb_tweet = np.mean(embedded_tweets, axis=0)
    i['embedded_tweets'] = avg_emb_tweet

In [43]:
# exemple of new user post :
users_posts[0]

{'user': 'n96392',
 'tweets': ['These @USAID search and rescue dogs are heading to #Nepal. How you can help #NepalQuakeRelief: http://t.co/XgKnjvvNup http://t.co/JoPISpNIr4'],
 'embedded_tweets': array([ 1.0964541e-01,  3.8855222e-01, -4.4583836e-01, -4.6119708e-01,
        -2.1502979e-01, -1.4820620e-01,  8.6893910e-01, -1.5973497e-02,
        -3.8214959e-03, -3.9347604e-01, -1.0693520e-01, -3.1848807e-02,
        -4.8738604e+00, -8.9479998e-02,  2.5382513e-02, -3.4874976e-03,
        -8.6597502e-02, -5.6521899e-01, -1.8982593e-02, -4.0906811e-01,
         1.3463980e-01, -1.3434881e-01, -2.9168600e-01,  3.1854680e-01,
        -2.5954548e-01,  4.4522700e-01,  2.2805188e-02,  4.2455134e-01,
         6.7810601e-01,  9.6944995e-02,  1.9368540e-01, -2.1784198e-02,
        -6.7391307e-03,  1.2858097e-01,  4.7495198e-01, -1.7855896e-01,
         1.8710601e-01,  1.2585229e-01, -1.7249830e-01, -4.7245007e-02,
        -6.7430609e-01, -5.6812032e-03,  1.1368439e-01,  1.7586289e-01,
         4.17

In [44]:
# we can now compute the cosine similarity matrix like with our graph structure embeddings
cos_sim_w2v = [[0 for _ in range(100)] for _ in range(100)]

for i in range(len(users_posts)):
    for j in range(len(users_posts)):
        if i == j:
            cos_sim_w2v[i][j] = 1
        else:
            emb_i = users_posts[i]['embedded_tweets'].reshape(1, -1)
            emb_j = users_posts[j]['embedded_tweets'].reshape(1, -1)
            cos_sim_w2v[i][j] = cosine_similarity(emb_i, emb_j)[0][0]

In [45]:
# finally, we can print the most similar users based on this matrix
arr_cos_sim_w2v = np.array(cos_sim_w2v)
np.fill_diagonal(arr_cos_sim_w2v, -np.inf) # replace the diagonal full of 1s by -inf 

v = []

nb_users_to_print = 10
for _ in range(nb_users_to_print):
    max_index = np.argmax(arr_cos_sim_w2v)
    max_sim = np.max(arr_cos_sim_w2v)

    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim_w2v).shape)

    arr_cos_sim_w2v[max_row_index][max_col_index] = -np.inf
    arr_cos_sim_w2v[max_col_index][max_row_index] = -np.inf
    
    v.append((max_row_index, max_col_index, max_sim))

print(f"The {nb_users_to_print} more similar pairs of users are :")
v

The 10 more similar pairs of users are :


[(1, 25, 0.9969907402992249),
 (1, 32, 0.9966518878936768),
 (25, 32, 0.996433436870575),
 (63, 70, 0.9958035945892334),
 (32, 70, 0.9955213069915771),
 (8, 62, 0.9954821693926238),
 (1, 8, 0.9953297365470238),
 (12, 25, 0.9948004484176636),
 (1, 62, 0.9947628974914551),
 (25, 62, 0.9947424530982971)]

In [46]:
# let us look at users 71 and 92 posts (post 57 is very long so bit hard to analyze)
print(users_posts[4]['tweets'])
print()
print(users_posts[5]['tweets'])

['Nepal Govt thanks @PMOIndia for going all out to help with post-earthquake story https://t.co/xEBdCeaCLJ', 'Starts to rain in Kathmandu. Rescue operations affected. Officials fear wet bodies will start rotting, stinking. Fear of mass epidemic.', 'Nepal begins 3 days of mourning to condole loss of lives caused by earthquake. Flag at half mast across nation. PM Koirala thanks @PMOIndia', 'The fearless Gorkha. Respect. http://t.co/KcMwZIIw8p']

['Reduced cost of calls to #Nepal to 1�/min (from 19�/min) to help loved ones connect? http://t.co/RjfHY8bCri @GoogleCR http://t.co/fWEb70Pp8P', 'People worldwide are looking for ways to help #Nepal. Searches visualized: http://t.co/dWBufW30yP via @GoogleTrends http://t.co/ZkD7lycTnb', 'Updated satellite imagery of #Nepal. Here, a busy intersection and temple, before and after ? http://t.co/RjfHY8bCri http://t.co/nyY3b1ADy9']


For the earthquake subgraph for example, it seems that both tweets are talking mainly about events that happened in Nepal

## 2.iii - Trends in Correlations

In [47]:
# Let us import a pretrained Word2Vec model that we will use in the looped following function 
w2v = gensim.downloader.load('glove-twitter-50')

def similarity_matrix(selected_graph):

    path = f'subgraphs_data/{chosen_event}_subgraph.graphml'
    g = nx.read_graphml(path)

    users = []
    nodes_users = []
    for node, data in g.nodes(data=True):
        for key, value in data.items():
            if key == 'labels':
                if value == ':User':
                    nodes_users.append(node)
                    users.append({node : data})

    #We choose the size of the sample
    n=100

    # Sample n/2 users with highest degree centrality
    degree_centralities = nx.degree_centrality(g) #We first compute the degree coef fo all nodes
    degree_filtred = {key: value for key, value in degree_centralities.items() if key in nodes_users} #We then only select the users nodes
    top_degree_users = sorted(degree_filtred, key=lambda x: degree_filtred[x], reverse=True)[0:int(n/2)] #We take the 50 users that have the highest degree centrality


    filtered_nodes = [node for node in nodes_users if node not in top_degree_users] #We create a list of all the users except the ones already in the top 50 of degree centrality

    # Sample n/2 users with highest closeness centrality (taking away the nodes that are already in the highest degree)
    closeness_centralities = nx.closeness_centrality(g) #We first compute the closeness coef fo all nodes
    closeness_filtred = {key: value for key, value in closeness_centralities.items() if key in filtered_nodes} #We then only select the users nodes (that are not in the top 50 degree centrality)
    top_closeness_users = sorted(closeness_filtred, key=lambda x: closeness_filtred[x], reverse=True)[0:int(n/2)]  #We take the 50 users that have the highest closeness centrality

    # List of the selected users
    list_user = list(set(top_degree_users + top_closeness_users))

    users_sample = []

    for node_dict in users:
        node_key = list(node_dict.keys())[0]  
        if node_key in list_user:  
            users_sample.append({node_key: node_dict[node_key]})


    # first, we need the associated nodes ids to the sample we definded : 
    node_sample_ids = []
    for i in range(len(users_sample)):
        for key, _ in users_sample[i].items():
            node_sample_ids.append(key)

    # define the subgraph with the sample of 100 nodes
    sample_graph = g.subgraph(node_sample_ids)

    # we can then build the embeddings of the sampled graph with Node2Vec
    node2vec = Node2Vec(sample_graph, dimensions=50)
    fitted_model_n2v = node2vec.fit(window=10, min_count=1) 
    
    # get the embeddings for every node
    node_embeddings_n2v = {node: fitted_model_n2v.wv[node] for node in sample_graph.nodes()}

    # list element is easier to handle
    list_of_embeddings_n2v = []
    for key, value in node_embeddings_n2v.items():
        list_of_embeddings_n2v.append({key : value})

    # build the cosine similarity matrix
    cos_sim_n2v = [[0 for _ in range(100)] for _ in range(100)]

    for i, emb_i_dict in enumerate(list_of_embeddings_n2v):
        for j, emb_j_dict in enumerate(list_of_embeddings_n2v):
            emb_i = next(iter(emb_i_dict.values())) 
            emb_j = next(iter(emb_j_dict.values()))

            cosine_sim = cosine_similarity([emb_i], [emb_j])[0][0]
            cos_sim_n2v[i][j] = cosine_sim


    # we already have our sampled graph and the users, we now need the tweets
    tweets = []
    for node, data in g.nodes(data=True):
        for key, value in data.items():
            if key == 'labels':
                if value == ':Tweet':
                    tweets.append({node : data})


    # build a list containing dictionnaries with the user_node_id and every tweet made by this user
    users_posts = []

    for u in users_sample:
        user_node_id = [key for key, _ in u.items()][0]
        tweets_by_user = []
        
        for t in tweets:
            tweet_node_id = [key for key, _ in t.items()][0]
            text_tweet = [value for _, value in t.items()][0]['text']

            if tweet_node_id in g[user_node_id]:
                tweets_by_user.append(text_tweet)

        users_posts.append({'user':user_node_id,
                            'tweets':tweets_by_user})    


    # we then compute and add the embedded tweets to the users_posts list, by adding a key to each dictionnary in the list
    for i in users_posts:
        tweets = i['tweets']
        embedded_tweets = []

        if len(tweets) == 0:
            avg_emb_tweet = np.zeros(w2v.vector_size)

        else :
            for j in i['tweets']:
                emb_tweet = sentence_embedding(j)
                embedded_tweets.append(emb_tweet)
                avg_emb_tweet = np.mean(embedded_tweets, axis=0)
        i['embedded_tweets'] = avg_emb_tweet

    # we can now compute the cosine similarity matrix like with our graph structure embeddings
    cos_sim_w2v = [[0 for _ in range(100)] for _ in range(100)]

    for i in range(len(users_posts)):
        for j in range(len(users_posts)):
            if i == j:
                cos_sim_w2v[i][j] = 1
            else:
                emb_i = users_posts[i]['embedded_tweets'].reshape(1, -1)
                emb_j = users_posts[j]['embedded_tweets'].reshape(1, -1)
                cos_sim_w2v[i][j] = cosine_similarity(emb_i, emb_j)[0][0]
    return([cos_sim_n2v, cos_sim_w2v])

In [48]:
correlation_results = []

events = ['wildfire', 'earthquake', 'typhoon', 'bombing', 'flood', 'shooting']

for event_type in events:

    res = similarity_matrix(event_type)
    
    graph_similarities = [similarity for sublist in res[0] for similarity in sublist]
    content_similarities = [similarity for sublist in res[1] for similarity in sublist]

    # Compute the correlation between the graph structure and tweet content similarities
    correlation, p_value = spearmanr(graph_similarities, content_similarities)
    
    # Append the correlation coefficient and p-value to the results list
    correlation_results.append({'event_type': event_type, 'correlation': correlation, 'p_value': p_value})


Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 13723.92it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 161.24it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 12394.52it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 119.93it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 16804.10it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 200.57it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 16707.71it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 214.65it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 19916.92it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 217.86it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 15819.80it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 219.62it/s]


In [49]:
correlation_results

[{'event_type': 'wildfire',
  'correlation': 0.12185115780741214,
  'p_value': 2.1710614502036913e-34},
 {'event_type': 'earthquake',
  'correlation': 0.19815285054191645,
  'p_value': 4.3690508417981185e-89},
 {'event_type': 'typhoon',
  'correlation': 0.11543529291262557,
  'p_value': 5.155231672948199e-31},
 {'event_type': 'bombing',
  'correlation': 0.15581431825329534,
  'p_value': 2.2313674210928092e-55},
 {'event_type': 'flood',
  'correlation': 0.16812255257340036,
  'p_value': 2.6675657379862692e-64},
 {'event_type': 'shooting',
  'correlation': 0.18783889173316048,
  'p_value': 4.3777891854332924e-80}]

According to this results, we see that the similarity matrices for the graph structure and post contents are more correlated for the flood and shooting event (higher p-value) than for the earthquake and typhoon events. Moreover we see that in general we observe a positive correlation between the two matrices of similarity except for the shooting event.