# Embeddings and User - User Similarities

### Import Packages 📦

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import networkx as nx
import random
import matplotlib.pyplot as plt

%pip install node2vec
from node2vec import Node2Vec

%pip install gensim
import gensim.downloader
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Select the event to be analyzed, and import the graph 

In [89]:
chosen_event = 'earthquake'

path = f'C:/Users/Utilisateur/Documents/M2/Web Mining/Projet/subgraph/{chosen_event}_subgraph.graphml'
#path = f'subgraphs_data/{chosen_event}_subgraph.graphml'
g = nx.read_graphml(path)

### 2.1 - Sampling N users

In [90]:
users = []
nodes_users = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':User':
                nodes_users.append(node)
                users.append({node : data})

#We choose the size of the sample
n=100

# Sample n/2 users with highest degree centrality
degree_centralities = nx.degree_centrality(g) #We first compute the degree coef fo all nodes
degree_filtred = {key: value for key, value in degree_centralities.items() if key in nodes_users} #We then only select the users nodes
top_degree_users = sorted(degree_filtred, key=lambda x: degree_filtred[x], reverse=True)[0:int(n/2)] #We take the 50 users that have the highest degree centrality


filtered_nodes = [node for node in nodes_users if node not in top_degree_users] #We create a list of all the users except the ones already in the top 50 of degree centrality

# Sample n/2 users with highest closeness centrality (taking away the nodes that are already in the highest degree)
closeness_centralities = nx.closeness_centrality(g) #We first compute the closeness coef fo all nodes
closeness_filtred = {key: value for key, value in closeness_centralities.items() if key in filtered_nodes} #We then only select the users nodes (that are not in the top 50 degree centrality)
top_closeness_users = sorted(closeness_filtred, key=lambda x: closeness_filtred[x], reverse=True)[0:int(n/2)]  #We take the 50 users that have the highest closeness centrality

# List of the selected users
list_user = list(set(top_degree_users + top_closeness_users))

users_sample = []

for node_dict in users:
    node_key = list(node_dict.keys())[0]  
    if node_key in list_user:  
        users_sample.append({node_key: node_dict[node_key]})

In [96]:
users_sample

[{'n96392': {'labels': ':User',
   'isVerified': True,
   'followers_count': 41496,
   'listed_count': 1098,
   'statuses_count': 6046,
   'favourites_count': 292,
   'id': '105153029',
   'screen_name': 'CNNImpact',
   'friends_count': 420,
   'name': 'Impact Your World',
   'tweets_count': 1}},
 {'n64670': {'labels': ':User',
   'isVerified': True,
   'followers_count': 423445,
   'listed_count': 1458,
   'statuses_count': 362223,
   'favourites_count': 3109,
   'id': '472122299',
   'screen_name': 'PhilippineStar',
   'friends_count': 1392,
   'name': 'The Philippine Star',
   'tweets_count': 35}},
 {'n72356': {'labels': ':User',
   'isVerified': False,
   'followers_count': 13765,
   'listed_count': 416,
   'statuses_count': 6594,
   'favourites_count': 569,
   'id': '127790650',
   'screen_name': 'BBCSanjoyM',
   'friends_count': 709,
   'name': 'Sanjoy Majumder',
   'tweets_count': 2}},
 {'n71359': {'labels': ':User',
   'listed_count': 4450,
   'statuses_count': 304811,
   'favo

In [33]:
# # first, we need access to the list of users in the graph 
# users = []
# for node, data in g.nodes(data=True):
#     for key, value in data.items():
#         if key == 'labels':
#             if value == ':User':
#                 users.append({node : data})

# # then, we select a random sample of 100 users
# random.seed(55)
# N = 100
# users_sample = random.sample(users, N)

In [4]:
# example of user
users_sample[0]

{'n86625': {'labels': ':User',
  'isVerified': False,
  'followers_count': 129943,
  'listed_count': 742,
  'statuses_count': 1970,
  'favourites_count': 2,
  'id': '302187548',
  'screen_name': 'AB_EmergAlert',
  'friends_count': 1,
  'name': 'AB Emergencyalert',
  'tweets_count': 19}}

## 2.ii.a - Embeddings on Graph Structure

In [93]:
# first, we need the associated nodes ids to the sample we definded : 
node_sample_ids = []
for i in range(len(users_sample)):
    for key, _ in users_sample[i].items():
        node_sample_ids.append(key)

# define the subgraph with the sample of 100 nodes
sample_graph = g.subgraph(node_sample_ids)

In [94]:
# we can then build the embeddings of the sampled graph with Node2Vec
node2vec = Node2Vec(sample_graph, dimensions=50)
fitted_model_n2v = node2vec.fit(window=10, min_count=1) 

Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 10030.38it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 125.99it/s]


We can have access to the 10 most similar nodes of a node, based on cosine similarity

In [97]:
fitted_model_n2v.wv.most_similar('n96392')

[('n83848', 0.987080454826355),
 ('n96278', 0.9857897758483887),
 ('n63697', 0.9850401878356934),
 ('n96528', 0.9841932058334351),
 ('n70567', 0.9830182790756226),
 ('n96118', 0.9826561808586121),
 ('n96452', 0.9819634556770325),
 ('n88565', 0.9795251488685608),
 ('n95162', 0.9795204401016235),
 ('n64463', 0.9787771105766296)]

Or we can build a cosine similarity matrix :

In [98]:
# get the embeddings for every node
node_embeddings_n2v = {node: fitted_model_n2v.wv[node] for node in sample_graph.nodes()}

# list element is easier to handle
list_of_embeddings_n2v = []
for key, value in node_embeddings_n2v.items():
    list_of_embeddings_n2v.append({key : value})

In [99]:
# build the cosine similarity matrix
cos_sim_n2v = [[0 for _ in range(100)] for _ in range(100)]

for i, emb_i_dict in enumerate(list_of_embeddings_n2v):
    for j, emb_j_dict in enumerate(list_of_embeddings_n2v):
        emb_i = next(iter(emb_i_dict.values())) 
        emb_j = next(iter(emb_j_dict.values()))

        cosine_sim = cosine_similarity([emb_i], [emb_j])[0][0]
        cos_sim_n2v[i][j] = cosine_sim

In [100]:
# from this, we can get the more similar users in our sample graph :
arr_cos_sim_n2v = np.array(cos_sim_n2v)
np.fill_diagonal(arr_cos_sim_n2v, -np.inf) # we replace the 1 elements of the diagonal by -inf

# 10 most similar nodes
v = []

nb_users_to_print = 10
for _ in range(nb_users_to_print):
    max_index = np.argmax(arr_cos_sim_n2v)
    max_sim = np.max(arr_cos_sim_n2v)

    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim_n2v).shape)

    arr_cos_sim_n2v[max_row_index][max_col_index] = -np.inf
    arr_cos_sim_n2v[max_col_index][max_row_index] = -np.inf

    v.append((max_row_index, max_col_index, max_sim))

print(f"The {nb_users_to_print} most similar pairs of users are :")
v

The 10 most similar pairs of users are :


[(22, 48, 0.9975747),
 (22, 52, 0.99730986),
 (48, 52, 0.9972682),
 (79, 88, 0.9955195),
 (52, 88, 0.9939079),
 (22, 79, 0.99329036),
 (52, 79, 0.99305165),
 (22, 88, 0.99247557),
 (60, 82, 0.99186194),
 (48, 88, 0.9914471)]

## 2.ii.b - Embeddings on Post Content

In [101]:
# we already have our sampled graph and the users, we now need the tweets
tweets = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':Tweet':
                tweets.append({node : data})

# exemple of tweet
tweets[0]

{'n21102': {'labels': ':Tweet',
  'is_quote_status': True,
  'possibly_sensitive': False,
  'retweet_count': 0,
  'favorite_count': 0,
  'id_str': '592834972265934849',
  'isTruncated': False,
  'annotation_postPriority': 'Low',
  'created_at': '2015-04-27T00:00Z',
  'id': '592834972265934849',
  'annotation_annotated': True,
  'annotation_num_judgements': 1,
  'text': 'clever... https://t.co/kl6DVrZX0w',
  'topic': 'TRECIS-CTIT-H-019'}}

In [102]:
# build a list containing dictionnaries with the user_node_id and every tweet made by this user
users_posts = []

for u in users_sample:
    user_node_id = [key for key, _ in u.items()][0]
    tweets_by_user = []
    
    for t in tweets:
        tweet_node_id = [key for key, _ in t.items()][0]
        text_tweet = [value for _, value in t.items()][0]['text']

        if tweet_node_id in g[user_node_id]:
            tweets_by_user.append(text_tweet)

    users_posts.append({'user':user_node_id,
                        'tweets':tweets_by_user})
    
# exemple of users posts :
users_posts[:2]

[{'user': 'n96392',
  'tweets': ['These @USAID search and rescue dogs are heading to #Nepal. How you can help #NepalQuakeRelief: http://t.co/XgKnjvvNup http://t.co/JoPISpNIr4']},
 {'user': 'n64670',
  'tweets': ['PDRRMC said that eight bodies had already been recovered from the debris of the supermarket that was torn down by the earthquake in Pampanga, while at least 20 others had been dug up alive but with injuries, including a woman whose leg had to be amputated. | via Ding Cervantes https://t.co/no0xTEAgCx',
   'The Philippine Institute of Volcanology and Seismology dismissed rumors saying that a magnitude 7.1 earthquake may hit Metro Manila soon. https://t.co/XifnFpQhOy',
   'The entire Pampanga province is now in a state of calamity amid ruins and deaths caused by the 6.1 magnitude earthquake on Monday afternoon. | via Ding Cervantes https://t.co/TWz49BYz9A',
   'The Department of Education has ordered a thorough inspection of all school buildings and facilities after a magnitude 

In [103]:
# let us build the empbeddings of our tweets, using a pretrained Word2Vec model on twitter data, using embeddings of length 50 to match the graph structure embeddings
w2v = gensim.downloader.load('glove-twitter-50')

In [105]:
# we define a function that allows us to tokenize a tweet/sentence, and we take the average of each embedding
def sentence_embedding(sentence):

    tokens = sentence.split()
    embeddings = [w2v[token] for token in tokens if token in w2v]
    
    if embeddings:
            avg_embedding = np.mean(embeddings, axis=0)
    else:
        avg_embedding = np.zeros(w2v.vector_size)

    return avg_embedding

In [106]:
# we define a function that allows us to tokenize a tweet/sentence, and we take the average of each embedding
def sentence_embedding(sentence):
    if len(sentence)>0:
        tokens = sentence.split()
        embeddings = [w2v[token] for token in tokens if token in w2v]
        
        if len(tokens) > 0 and embeddings:
                avg_embedding = np.mean(embeddings, axis=0)
        else:
            avg_embedding = np.zeros(w2v.vector_size)
    else:
         avg_embedding = np.zeros(w2v.vector_size)
    return avg_embedding

In [107]:
# we then compute and add the embedded tweets to the users_posts list, by adding a key to each dictionnary in the list
for i in users_posts:
    tweets = i['tweets']
    embedded_tweets = []

    if len(tweets) == 0:
        avg_emb_tweet = np.zeros(w2v.vector_size)

    else :
        for j in i['tweets']:
            emb_tweet = sentence_embedding(j)
            embedded_tweets.append(emb_tweet)
            avg_emb_tweet = np.mean(embedded_tweets, axis=0)
    i['embedded_tweets'] = avg_emb_tweet

In [42]:
# exemple of new user post :
users_posts[0]

{'user': 'n86625',
 'tweets': ['Wildfire Alert Updated May30 1141AM Take necessary precautions. Lesser Slave River... https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert Ended May21 942AM Yellowhead County https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert Updated May29 1256PM Take necessary precautions. Mackenzie #23 https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert May30 1219PM Take necessary precautions. Lesser Slave River #124 https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert Updated May31 814AM Take necessary precautions. Peerless Lake First... https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert Updated May02 337AM Take necessary precautions. Regional Municipality of... https://t.co/B6uO3E9qa7 #ABfire #ABemerg',
  'Wildfire Alert Updated May03 301PM Take necessary precautions. Regional Municipality of... https://t.co/B6uO3E9qa7 #ABfire #ABemerg',
  'Wildfire Alert Updated May03 234PM Take necessary precautions. Regional Municipalit

In [108]:
# we can now compute the cosine similarity matrix like with our graph structure embeddings
cos_sim_w2v = [[0 for _ in range(100)] for _ in range(100)]

for i in range(len(users_posts)):
    for j in range(len(users_posts)):
        if i == j:
            cos_sim_w2v[i][j] = 1
        else:
            emb_i = users_posts[i]['embedded_tweets'].reshape(1, -1)
            emb_j = users_posts[j]['embedded_tweets'].reshape(1, -1)
            cos_sim_w2v[i][j] = cosine_similarity(emb_i, emb_j)[0][0]

In [109]:
# finally, we can print the most similar users based on this matrix
arr_cos_sim_w2v = np.array(cos_sim_w2v)
np.fill_diagonal(arr_cos_sim_w2v, -np.inf) # replace the diagonal full of 1s by -inf 

v = []

nb_users_to_print = 10
for _ in range(nb_users_to_print):
    max_index = np.argmax(arr_cos_sim_w2v)
    max_sim = np.max(arr_cos_sim_w2v)

    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim_w2v).shape)

    arr_cos_sim_w2v[max_row_index][max_col_index] = -np.inf
    arr_cos_sim_w2v[max_col_index][max_row_index] = -np.inf
    
    v.append((max_row_index, max_col_index, max_sim))

print(f"The {nb_users_to_print} more similar pairs of users are :")
v

The 10 more similar pairs of users are :


[(1, 25, 0.996990442276001),
 (1, 32, 0.9966517090797424),
 (25, 32, 0.9964333176612854),
 (63, 70, 0.9958032965660095),
 (32, 70, 0.9955211281776428),
 (8, 62, 0.9954821693926239),
 (1, 8, 0.9953297365470242),
 (12, 25, 0.994800329208374),
 (1, 62, 0.9947625994682312),
 (25, 62, 0.994742214679718)]

In [45]:
# let us look at users 71 and 92 posts (post 57 is very long so bit hard to analyze)
print(users_posts[4]['tweets'])
print()
print(users_posts[5]['tweets'])

["You'll have to put a damper on any firework plans this long weekend... Just a reminder that a fire ban is in place through the #RMWB #ymm \\n\\nhttps://t.co/yKNgQo1HOK", 'As we enter the long weekend, the province is issuing a fire ban and off-highway vehicle restrictions for most of northern Alberta. #ymm #rmwb https://t.co/yKNgQnK6qa', 'UPDATE: Next wildfire media conference to take place at MacDonald Island Park at 5:30 p.m. https://t.co/c5bxj9kHqh #ymm #rmwb @Mix1037radio']

['"Jenna Rizkalla was on fire today."\\n\\nA 9.9 on vault &amp; floor plus a team-high 9.750 on beam for the senior at New Ha? https://t.co/xah4f1uIlL']


For the wildifre subgraph for example, it seems that both tweets are talking mainly about air quality specifically !

## 2.iii - Trends in Correlations

In [112]:
def similarity_matrix(selected_graph):

    path = f'C:/Users/Utilisateur/Documents/M2/Web Mining/Projet/subgraph/{selected_graph}_subgraph.graphml'
    g = nx.read_graphml(path)

    users = []
    nodes_users = []
    for node, data in g.nodes(data=True):
        for key, value in data.items():
            if key == 'labels':
                if value == ':User':
                    nodes_users.append(node)
                    users.append({node : data})

    #We choose the size of the sample
    n=100

    # Sample n/2 users with highest degree centrality
    degree_centralities = nx.degree_centrality(g) #We first compute the degree coef fo all nodes
    degree_filtred = {key: value for key, value in degree_centralities.items() if key in nodes_users} #We then only select the users nodes
    top_degree_users = sorted(degree_filtred, key=lambda x: degree_filtred[x], reverse=True)[0:int(n/2)] #We take the 50 users that have the highest degree centrality


    filtered_nodes = [node for node in nodes_users if node not in top_degree_users] #We create a list of all the users except the ones already in the top 50 of degree centrality

    # Sample n/2 users with highest closeness centrality (taking away the nodes that are already in the highest degree)
    closeness_centralities = nx.closeness_centrality(g) #We first compute the closeness coef fo all nodes
    closeness_filtred = {key: value for key, value in closeness_centralities.items() if key in filtered_nodes} #We then only select the users nodes (that are not in the top 50 degree centrality)
    top_closeness_users = sorted(closeness_filtred, key=lambda x: closeness_filtred[x], reverse=True)[0:int(n/2)]  #We take the 50 users that have the highest closeness centrality

    # List of the selected users
    list_user = list(set(top_degree_users + top_closeness_users))

    users_sample = []

    for node_dict in users:
        node_key = list(node_dict.keys())[0]  
        if node_key in list_user:  
            users_sample.append({node_key: node_dict[node_key]})


    # first, we need the associated nodes ids to the sample we definded : 
    node_sample_ids = []
    for i in range(len(users_sample)):
        for key, _ in users_sample[i].items():
            node_sample_ids.append(key)

    # define the subgraph with the sample of 100 nodes
    sample_graph = g.subgraph(node_sample_ids)

    # we can then build the embeddings of the sampled graph with Node2Vec
    node2vec = Node2Vec(sample_graph, dimensions=50)
    fitted_model_n2v = node2vec.fit(window=10, min_count=1) 
    
    # get the embeddings for every node
    node_embeddings_n2v = {node: fitted_model_n2v.wv[node] for node in sample_graph.nodes()}

    # list element is easier to handle
    list_of_embeddings_n2v = []
    for key, value in node_embeddings_n2v.items():
        list_of_embeddings_n2v.append({key : value})

    # build the cosine similarity matrix
    cos_sim_n2v = [[0 for _ in range(100)] for _ in range(100)]

    for i, emb_i_dict in enumerate(list_of_embeddings_n2v):
        for j, emb_j_dict in enumerate(list_of_embeddings_n2v):
            emb_i = next(iter(emb_i_dict.values())) 
            emb_j = next(iter(emb_j_dict.values()))

            cosine_sim = cosine_similarity([emb_i], [emb_j])[0][0]
            cos_sim_n2v[i][j] = cosine_sim


    # we already have our sampled graph and the users, we now need the tweets
    tweets = []
    for node, data in g.nodes(data=True):
        for key, value in data.items():
            if key == 'labels':
                if value == ':Tweet':
                    tweets.append({node : data})


    # build a list containing dictionnaries with the user_node_id and every tweet made by this user
    users_posts = []

    for u in users_sample:
        user_node_id = [key for key, _ in u.items()][0]
        tweets_by_user = []
        
        for t in tweets:
            tweet_node_id = [key for key, _ in t.items()][0]
            text_tweet = [value for _, value in t.items()][0]['text']

            if tweet_node_id in g[user_node_id]:
                tweets_by_user.append(text_tweet)

        users_posts.append({'user':user_node_id,
                            'tweets':tweets_by_user})
    

    # let us build the empbeddings of our tweets, using a pretrained Word2Vec model on twitter data, using embeddings of length 50 to match the graph structure embeddings
    w2v = gensim.downloader.load('glove-twitter-50')


    # we then compute and add the embedded tweets to the users_posts list, by adding a key to each dictionnary in the list
    for i in users_posts:
        tweets = i['tweets']
        embedded_tweets = []

        if len(tweets) == 0:
            avg_emb_tweet = np.zeros(w2v.vector_size)

        else :
            for j in i['tweets']:
                emb_tweet = sentence_embedding(j)
                embedded_tweets.append(emb_tweet)
                avg_emb_tweet = np.mean(embedded_tweets, axis=0)
        i['embedded_tweets'] = avg_emb_tweet


    # we can now compute the cosine similarity matrix like with our graph structure embeddings
    cos_sim_w2v = [[0 for _ in range(100)] for _ in range(100)]

    for i in range(len(users_posts)):
        for j in range(len(users_posts)):
            if i == j:
                cos_sim_w2v[i][j] = 1
            else:
                emb_i = users_posts[i]['embedded_tweets'].reshape(1, -1)
                emb_j = users_posts[j]['embedded_tweets'].reshape(1, -1)
                cos_sim_w2v[i][j] = cosine_similarity(emb_i, emb_j)[0][0]
    return([cos_sim_n2v, cos_sim_w2v])



    

In [114]:
correlation_results = []

events = ['wildfire', 'earthquake', 'typhoon', 'bombing', 'flood', 'shooting']

for event_type in events:

    res = similarity_matrix(event_type)
    
    graph_similarities = [similarity for sublist in res[0] for similarity in sublist]
    content_similarities = [similarity for sublist in res[1] for similarity in sublist]

    # Compute the correlation between the graph structure and tweet content similarities
    correlation, p_value = spearmanr(graph_similarities, content_similarities)
    
    # Append the correlation coefficient and p-value to the results list
    correlation_results.append({'event_type': event_type, 'correlation': correlation, 'p_value': p_value})


Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 7223.71it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 162.89it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 10793.65it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 118.17it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 7714.37it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 39.68it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 14332.15it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 165.69it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 14316.50it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 174.11it/s]
Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 16727.70it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 148.24it/s]


In [115]:
correlation_results

[{'event_type': 'wildfire',
  'correlation': 0.09009608808072767,
  'p_value': 1.7669296557970214e-19},
 {'event_type': 'earthquake',
  'correlation': 0.09633970565715388,
  'p_value': 4.671064582764964e-22},
 {'event_type': 'typhoon',
  'correlation': 0.08564661866366925,
  'p_value': 9.540388188535034e-18},
 {'event_type': 'bombing',
  'correlation': 0.03394985982712948,
  'p_value': 0.0006849327950416989},
 {'event_type': 'flood',
  'correlation': 0.0349663288447138,
  'p_value': 0.0004700543857120087},
 {'event_type': 'shooting',
  'correlation': -0.02382863183604093,
  'p_value': 0.01717669060263994}]

According to this results, we see that the similarity matrices for the graph structure and tweet contants are more correlated for the flood and shooting event (higher p-value) than for the earthquake and typhoon events. Moreover we see that in general we observe a positive correlation between the two matrices of similarity except for the shooting event.