# Embeddings and User - User Similarities

### Import Packages 📦

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import networkx as nx
import random

%pip install node2vec
from node2vec import Node2Vec

%pip install gensim
import gensim.downloader
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Select the event to be analyzed, and import the graph 

In [2]:
chosen_event = 'wildfire'

path = f'subgraphs_data/{chosen_event}_subgraph.graphml'
g = nx.read_graphml(path)

### 2.1 - Sampling N users

In [29]:
users = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':User':
                users.append({node : data})

n=100
# Sample N/2 users with highest degree centrality
degree_centralities = nx.degree_centrality(g)
top_degree_users = sorted(degree_centralities, key=lambda x: degree_centralities[x], reverse=True)[0:int(n/2)]

# Sample N/2 users with highest closeness centrality
closeness_centralities = nx.closeness_centrality(g)
top_closeness_users = sorted(closeness_centralities, key=lambda x: closeness_centralities[x], reverse=True)[0:int(n/2)]
list_user = list(set(top_degree_users + top_closeness_users))

users_sample = []

for node_dict in users:
    node_key = list(node_dict.keys())[0]  
    if node_key in list_user:  
        users_sample.append({node_key: node_dict[node_key]})

In [33]:
# # first, we need access to the list of users in the graph 
# users = []
# for node, data in g.nodes(data=True):
#     for key, value in data.items():
#         if key == 'labels':
#             if value == ':User':
#                 users.append({node : data})

# # then, we select a random sample of 100 users
# random.seed(55)
# N = 100
# users_sample = random.sample(users, N)

In [4]:
# example of user
users_sample[0]

{'n86625': {'labels': ':User',
  'isVerified': False,
  'followers_count': 129943,
  'listed_count': 742,
  'statuses_count': 1970,
  'favourites_count': 2,
  'id': '302187548',
  'screen_name': 'AB_EmergAlert',
  'friends_count': 1,
  'name': 'AB Emergencyalert',
  'tweets_count': 19}}

## 2.ii.a - Embeddings on Graph Structure

In [5]:
# first, we need the associated nodes ids to the sample we definded : 
node_sample_ids = []
for i in range(len(users_sample)):
    for key, _ in users_sample[i].items():
        node_sample_ids.append(key)

# define the subgraph with the sample of 100 nodes
sample_graph = g.subgraph(node_sample_ids)

In [6]:
# we can then build the embeddings of the sampled graph with Node2Vec
node2vec = Node2Vec(sample_graph, dimensions=50)
fitted_model_n2v = node2vec.fit(window=10, min_count=1) 

Computing transition probabilities: 100%|██████████| 11/11 [00:00<00:00, 12077.84it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 3521.96it/s]


We can have access to the 10 most similar nodes of a node, based on cosine similarity

In [8]:
fitted_model_n2v.wv.most_similar('n86625')

[('n85174', 0.3614647686481476),
 ('n85146', 0.25389182567596436),
 ('n85177', 0.18587689101696014),
 ('n61678', 0.16183257102966309),
 ('n87456', 0.11143843084573746),
 ('n85147', 0.04336925223469734),
 ('n67357', 0.03535682335495949),
 ('n86646', -0.03276541829109192),
 ('n85214', -0.26173949241638184),
 ('n86626', -0.3077954351902008)]

Or we can build a cosine similarity matrix :

In [9]:
# get the embeddings for every node
node_embeddings_n2v = {node: fitted_model_n2v.wv[node] for node in sample_graph.nodes()}

# list element is easier to handle
list_of_embeddings_n2v = []
for key, value in node_embeddings_n2v.items():
    list_of_embeddings_n2v.append({key : value})

In [10]:
# build the cosine similarity matrix
cos_sim_n2v = [[0 for _ in range(100)] for _ in range(100)]

for i, emb_i_dict in enumerate(list_of_embeddings_n2v):
    for j, emb_j_dict in enumerate(list_of_embeddings_n2v):
        emb_i = next(iter(emb_i_dict.values())) 
        emb_j = next(iter(emb_j_dict.values()))

        cosine_sim = cosine_similarity([emb_i], [emb_j])[0][0]
        cos_sim_n2v[i][j] = cosine_sim

In [11]:
# from this, we can get the more similar users in our sample graph :
arr_cos_sim_n2v = np.array(cos_sim_n2v)
np.fill_diagonal(arr_cos_sim_n2v, -np.inf) # we replace the 1 elements of the diagonal by -inf

# 10 most similar nodes
v = []

nb_users_to_print = 10
for _ in range(nb_users_to_print):
    max_index = np.argmax(arr_cos_sim_n2v)
    max_sim = np.max(arr_cos_sim_n2v)

    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim_n2v).shape)

    arr_cos_sim_n2v[max_row_index][max_col_index] = -np.inf
    arr_cos_sim_n2v[max_col_index][max_row_index] = -np.inf

    v.append((max_row_index, max_col_index, max_sim))

print(f"The {nb_users_to_print} more similar pairs of users are :")
v

The 10 more similar pairs of users are :


[(4, 8, 0.5346932411193848),
 (1, 8, 0.36146482825279236),
 (5, 8, 0.3480015993118286),
 (3, 8, 0.34607675671577454),
 (1, 5, 0.25389179587364197),
 (0, 5, 0.23847448825836182),
 (3, 5, 0.23539096117019653),
 (6, 9, 0.21279771625995636),
 (6, 8, 0.18825682997703552),
 (1, 4, 0.18587687611579895)]

## 2.ii.b - Embeddings on Post Content

In [12]:
# we already have our sampled graph and the users, we now need the tweets
tweets = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':Tweet':
                tweets.append({node : data})

# exemple of tweet
tweets[0]

{'n37162': {'labels': ':Tweet',
  'is_quote_status': False,
  'possibly_sensitive': True,
  'retweet_count': 37,
  'favorite_count': 214,
  'id_str': '1131945414289006592',
  'isTruncated': False,
  'annotation_postPriority': 'High',
  'created_at': '2019-05-24T00:00Z',
  'id': '1131945414289006592',
  'annotation_annotated': True,
  'annotation_num_judgements': 3,
  'text': "I support High Level Mayor Crystal McAteer and her call for emergency relief funding in the form of debit cards for residents displaced by the fire. I'll be reaching out to Minister Madu today and call on him to bring this program in immediately#ableg",
  'topic': 'TRECIS-CTIT-H-029'}}

In [13]:
# build a list containing dictionnaries with the user_node_id and every tweet made by this user
users_posts = []

for u in users_sample:
    user_node_id = [key for key, _ in u.items()][0]
    tweets_by_user = []
    
    for t in tweets:
        tweet_node_id = [key for key, _ in t.items()][0]
        text_tweet = [value for _, value in t.items()][0]['text']

        if tweet_node_id in g[user_node_id]:
            tweets_by_user.append(text_tweet)

    users_posts.append({'user':user_node_id,
                        'tweets':tweets_by_user})
    
# exemple of users posts :
users_posts[:2]

[{'user': 'n86625',
  'tweets': ['Wildfire Alert Updated May30 1141AM Take necessary precautions. Lesser Slave River... https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
   'Wildfire Alert Ended May21 942AM Yellowhead County https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
   'Wildfire Alert Updated May29 1256PM Take necessary precautions. Mackenzie #23 https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
   'Wildfire Alert May30 1219PM Take necessary precautions. Lesser Slave River #124 https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
   'Wildfire Alert Updated May31 814AM Take necessary precautions. Peerless Lake First... https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
   'Wildfire Alert Updated May02 337AM Take necessary precautions. Regional Municipality of... https://t.co/B6uO3E9qa7 #ABfire #ABemerg',
   'Wildfire Alert Updated May03 301PM Take necessary precautions. Regional Municipality of... https://t.co/B6uO3E9qa7 #ABfire #ABemerg',
   'Wildfire Alert Updated May03 234PM Take necessary precautions. Regional Mu

In [14]:
# let us build the empbeddings of our tweets, using a pretrained Word2Vec model on twitter data, using embeddings of length 50 to match the graph structure embeddings
w2v = gensim.downloader.load('glove-twitter-50')

In [15]:
# we define a function that allows us to tokenize a tweet/sentence, and we take the average of each embedding
def sentence_embedding(sentence):
    tokens = sentence.split()
    embeddings = [w2v[token] for token in tokens if token in w2v]
    
    if embeddings:
        avg_embedding = np.mean(embeddings, axis=0)
    else:
        avg_embedding = np.zeros(w2v.vector_size)

    return avg_embedding

In [19]:
# we then compute and add the embedded tweets to the users_posts list, by adding a key to each dictionnary in the list
for i in users_posts:
    tweets = i['tweets']
    embedded_tweets = []
    for j in i['tweets']:
        emb_tweet = sentence_embedding(j)
        embedded_tweets.append(emb_tweet)
    avg_emb_tweet = np.mean(embedded_tweets, axis=0)
    i['embedded_tweets'] = avg_emb_tweet

In [20]:
# exemple of new user post :
users_posts[0]

{'user': 'n86625',
 'tweets': ['Wildfire Alert Updated May30 1141AM Take necessary precautions. Lesser Slave River... https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert Ended May21 942AM Yellowhead County https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert Updated May29 1256PM Take necessary precautions. Mackenzie #23 https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert May30 1219PM Take necessary precautions. Lesser Slave River #124 https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert Updated May31 814AM Take necessary precautions. Peerless Lake First... https://t.co/L8ht6OCtPQ #ABfire #ABemerg',
  'Wildfire Alert Updated May02 337AM Take necessary precautions. Regional Municipality of... https://t.co/B6uO3E9qa7 #ABfire #ABemerg',
  'Wildfire Alert Updated May03 301PM Take necessary precautions. Regional Municipality of... https://t.co/B6uO3E9qa7 #ABfire #ABemerg',
  'Wildfire Alert Updated May03 234PM Take necessary precautions. Regional Municipalit

In [21]:
# we can now compute the cosine similarity matrix like with our graph structure embeddings
cos_sim_w2v = [[0 for _ in range(100)] for _ in range(100)]

for i in range(len(users_posts)):
    for j in range(len(users_posts)):
        if i == j:
            cos_sim_w2v[i][j] = 1
        else:
            emb_i = users_posts[i]['embedded_tweets'].reshape(1, -1)
            emb_j = users_posts[j]['embedded_tweets'].reshape(1, -1)
            cos_sim_w2v[i][j] = cosine_similarity(emb_i, emb_j)[0][0]

In [25]:
# finally, we can print the most similar users based on this matrix
arr_cos_sim_w2v = np.array(cos_sim_w2v)
np.fill_diagonal(arr_cos_sim_w2v, -np.inf) # replace the diagonal full of 1s by -inf 

v = []

nb_users_to_print = 10
for _ in range(nb_users_to_print):
    max_index = np.argmax(arr_cos_sim_w2v)
    max_sim = np.max(arr_cos_sim_w2v)

    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim_w2v).shape)

    arr_cos_sim_w2v[max_row_index][max_col_index] = -np.inf
    arr_cos_sim_w2v[max_col_index][max_row_index] = -np.inf
    
    v.append((max_row_index, max_col_index, max_sim))

print(f"The {nb_users_to_print} more similar pairs of users are :")
v

The 10 more similar pairs of users are :


[(4, 5, 0.9977604065347774),
 (3, 8, 0.9971410632133484),
 (2, 7, 0.9968248605728149),
 (3, 9, 0.9965514943425788),
 (5, 9, 0.9964904856647377),
 (5, 8, 0.9963920036732998),
 (3, 5, 0.995724766739027),
 (2, 4, 0.9955658316612244),
 (2, 5, 0.9952108696080185),
 (4, 9, 0.9949992779441142)]

In [26]:
# let us look at users 71 and 92 posts (post 57 is very long so bit hard to analyze)
print(users_posts[4]['tweets'])
print()
print(users_posts[5]['tweets'])




For the wildifre subgraph for example, it seems that both tweets are talking mainly about air quality specifically !

## 2.iii - Trends in Correlations