# Embeddings and User - User Similarities

### Import Packages 📦

In [7]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import networkx as nx
import random

%pip install node2vec
from node2vec import Node2Vec

%pip install gensim
import gensim.downloader
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Select the event to be analyzed, and import the graph 

In [3]:
chosen_event = 'wildfire'

path = f'subgraphs_data/{chosen_event}_subgraph.graphml'
g = nx.read_graphml(path)

### 2.1 - Sampling N users

In [44]:
# first, we need access to the list of users in the graph 
users = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':User':
                users.append({node : data})

# then, we select a random sample of 100 users
random.seed(55)
N = 100
users_sample = random.sample(users, N)

In [53]:
# example of user
users_sample[0]

{'n87752': {'labels': ':User',
  'listed_count': 1,
  'statuses_count': 4943,
  'favourites_count': 23676,
  'isVerified': False,
  'screen_name': 'gillian__paige',
  'friends_count': 211,
  'followers_count': 132,
  'name': 'gillian hebert',
  'tweets_count': 1,
  'id': '1873350745'}}

## 2.ii.a - Embeddings on Graph Structure

In [46]:
# first, we need the associated nodes ids to the sample we definded : 
node_sample_ids = []
for i in range(len(users_sample)):
    for key, _ in users_sample[i].items():
        node_sample_ids.append(key)

# define the subgraph with the sample of 100 nodes
sample_graph = g.subgraph(node_sample_ids)

In [47]:
# we can then build the embeddings of the sampled graph with Node2Vec
node2vec = Node2Vec(sample_graph, dimensions=50)
fitted_model_n2v = node2vec.fit(window=10, min_count=1) 

Computing transition probabilities: 100%|██████████| 100/100 [00:00<00:00, 35496.82it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 1456.81it/s]


We can have access to the 10 most similar nodes of a node, based on cosine similarity

In [48]:
fitted_model_n2v.wv.most_similar('n98232')

[('n85976', 0.44765573740005493),
 ('n99683', 0.373142808675766),
 ('n85300', 0.3534637689590454),
 ('n98362', 0.27789026498794556),
 ('n87000', 0.2662959694862366),
 ('n85755', 0.26048803329467773),
 ('n99430', 0.2583834230899811),
 ('n87335', 0.2576662600040436),
 ('n87158', 0.24259963631629944),
 ('n87394', 0.23346398770809174)]

Or we can build a cosine similarity matrix :

In [49]:
# get the embeddings for every node
node_embeddings_n2v = {node: fitted_model_n2v.wv[node] for node in sample_graph.nodes()}

# list element is easier to handle
list_of_embeddings_n2v = []
for key, value in node_embeddings_n2v.items():
    list_of_embeddings_n2v.append({key : value})

In [50]:
# build the cosine similarity matrix
cos_sim_n2v = [[0 for _ in range(100)] for _ in range(100)]

for i, emb_i_dict in enumerate(list_of_embeddings_n2v):
    for j, emb_j_dict in enumerate(list_of_embeddings_n2v):
        emb_i = next(iter(emb_i_dict.values())) 
        emb_j = next(iter(emb_j_dict.values()))

        cosine_sim = cosine_similarity([emb_i], [emb_j])[0][0]
        cos_sim_n2v[i][j] = cosine_sim

In [51]:
# from this, we can get the more similar users in our sample graph :
arr_cos_sim_n2v = np.array(cos_sim_n2v)
np.fill_diagonal(arr_cos_sim_n2v, -np.inf) # we replace the 1 elements of the diagonal by -inf

# 10 most similar nodes
v = []

nb_users_to_print = 10
for _ in range(nb_users_to_print):
    max_index = np.argmax(arr_cos_sim_n2v)
    max_sim = np.max(arr_cos_sim_n2v)

    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim_n2v).shape)

    arr_cos_sim_n2v[max_row_index][max_col_index] = -np.inf
    arr_cos_sim_n2v[max_col_index][max_row_index] = -np.inf

    v.append((max_row_index, max_col_index, max_sim))

print(f"The {nb_users_to_print} more similar pairs of users are :")
v

The 10 more similar pairs of users are :


[(3, 92, 0.5292955),
 (6, 62, 0.48984912),
 (92, 96, 0.4476558),
 (5, 44, 0.43664718),
 (64, 92, 0.4299842),
 (22, 44, 0.42970902),
 (7, 70, 0.40657914),
 (88, 92, 0.39449665),
 (71, 95, 0.3878101),
 (13, 18, 0.38690877)]

## 2.ii.b - Embeddings on Post Content

In [57]:
# we already have our sampled graph and the users, we now need the tweets
tweets = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':Tweet':
                tweets.append({node : data})

# exemple of tweet
tweets[0]

{'n37162': {'labels': ':Tweet',
  'is_quote_status': False,
  'possibly_sensitive': True,
  'retweet_count': 37,
  'favorite_count': 214,
  'id_str': '1131945414289006592',
  'isTruncated': False,
  'annotation_postPriority': 'High',
  'created_at': '2019-05-24T00:00Z',
  'id': '1131945414289006592',
  'annotation_annotated': True,
  'annotation_num_judgements': 3,
  'text': "I support High Level Mayor Crystal McAteer and her call for emergency relief funding in the form of debit cards for residents displaced by the fire. I'll be reaching out to Minister Madu today and call on him to bring this program in immediately#ableg",
  'topic': 'TRECIS-CTIT-H-029'}}

In [60]:
# build a list containing dictionnaries with the user_node_id and every tweet made by this user
users_posts = []

for u in users_sample:
    user_node_id = [key for key, _ in u.items()][0]
    tweets_by_user = []
    
    for t in tweets:
        tweet_node_id = [key for key, _ in t.items()][0]
        text_tweet = [value for _, value in t.items()][0]['text']

        if tweet_node_id in g[user_node_id]:
            tweets_by_user.append(text_tweet)

    users_posts.append({'user':user_node_id,
                        'tweets':tweets_by_user})
    
# exemple of users posts :
users_posts[:2]

[{'user': 'n87752',
  'tweets': ['literally Jason Kenney and the UCP right now #ABwildfire #AbLeg https://t.co/g3mNEpci2H']},
 {'user': 'n98232',
  'tweets': ['View of Springwood fire looking toward Winmalee from Warrimoo Oval #nswfires #nswrfs http://t.co/ffIvaobke2']}]

In [61]:
# let us build the empbeddings of our tweets, using a pretrained Word2Vec model on twitter data, using embeddings of length 50 to match the graph structure embeddings
w2v = gensim.downloader.load('glove-twitter-50')



In [62]:
# we define a function that allows us to tokenize a tweet/sentence, and we take the average of each embedding
def sentence_embedding(sentence):
    tokens = sentence.split()
    embeddings = [w2v[token] for token in tokens if token in w2v]
    
    if embeddings:
        avg_embedding = np.mean(embeddings, axis=0)
    else:
        avg_embedding = np.zeros(w2v.vector_size)

    return avg_embedding

In [63]:
# we then compute and add the embedded tweets to the users_posts list, by adding a key to each dictionnary in the list
for i in users_posts:
    tweets = i['tweets']
    embedded_tweets = []
    for j in i['tweets']:
        emb_tweet = sentence_embedding(j)
        embedded_tweets.append(emb_tweet)
    avg_emb_tweet = np.mean(embedded_tweets, axis=0)
    i['embedded_tweets'] = avg_emb_tweet

In [64]:
# exemple of new user post :
users_posts[0]

{'user': 'n87752',
 'tweets': ['literally Jason Kenney and the UCP right now #ABwildfire #AbLeg https://t.co/g3mNEpci2H'],
 'embedded_tweets': array([ 0.092144  , -0.0097482 ,  0.2523414 , -0.412006  , -0.110684  ,
         0.381162  ,  1.1097358 ,  0.194222  , -0.10341058, -0.159414  ,
         0.2009306 , -0.3376276 , -5.7268    , -0.26993603,  0.011716  ,
        -0.16570339,  0.17271021, -0.2074794 , -0.384662  , -0.1488848 ,
        -0.12701042, -0.09818981,  0.057602  ,  0.19408801,  0.08912279,
         0.520362  , -0.15948038,  0.01510841,  0.363132  , -0.02014689,
        -0.396355  ,  0.170776  , -0.2349926 , -0.04268146,  0.20499201,
         0.07206976,  0.064438  , -0.0277014 ,  0.27153403,  0.20733199,
        -0.672186  ,  0.031954  , -0.018946  ,  0.108678  ,  0.19266601,
        -0.117673  ,  0.20545301, -0.060766  , -0.1335644 ,  0.04661779],
       dtype=float32)}

In [65]:
# we can now compute the cosine similarity matrix like with our graph structure embeddings
cos_sim_w2v = [[0 for _ in range(100)] for _ in range(100)]

for i in range(len(users_posts)):
    for j in range(len(users_posts)):
        if i == j:
            cos_sim_w2v[i][j] = 1
        else:
            emb_i = users_posts[i]['embedded_tweets'].reshape(1, -1)
            emb_j = users_posts[j]['embedded_tweets'].reshape(1, -1)
            cos_sim_w2v[i][j] = cosine_similarity(emb_i, emb_j)[0][0]

In [68]:
# finally, we can print the most similar users based on this matrix
arr_cos_sim_w2v = np.array(cos_sim_w2v)
np.fill_diagonal(arr_cos_sim_w2v, -np.inf) # replace the diagonal full of 1s by -inf 

v = []

nb_users_to_print = 10
for _ in range(nb_users_to_print):
    max_index = np.argmax(arr_cos_sim_w2v)
    max_sim = np.max(arr_cos_sim_w2v)

    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim_w2v).shape)

    arr_cos_sim_w2v[max_row_index][max_col_index] = -np.inf
    arr_cos_sim_w2v[max_col_index][max_row_index] = -np.inf
    
    v.append((max_row_index, max_col_index, max_sim))

print(f"The {nb_users_to_print} more similar pairs of users are :")
v

The 10 more similar pairs of users are :


[(57, 71, 0.9946486949920654),
 (71, 92, 0.9932003021240234),
 (92, 99, 0.9930979609489441),
 (20, 75, 0.9925435185432434),
 (71, 99, 0.9906183481216431),
 (20, 38, 0.9901840090751648),
 (33, 57, 0.9893929958343506),
 (22, 77, 0.9890386462211609),
 (72, 99, 0.9882469773292542),
 (31, 92, 0.988242506980896)]

In [72]:
# let us look at users 71 and 92 posts (post 57 is very long so bit hard to analyze)
print(users_posts[71]['tweets'])
print()
print(users_posts[92]['tweets'])

['Worst #AirQuality I?ve ever experienced.2.5 weeks on the road in the US talking and learning about #HealthAndWellbeing and return to campfire air from #ABFire. Stay inside, run your #Purifiers and/or go to a place with filtered air. Stop #running outside, please! #ClimateChange', "#AQHI in #YEG #Edmonton is 10+. That. is. bad! Tomorrow it's going down to 5 and the forecast looks promising for this area but continues to be horrible for the North. #ABFire #ThisCantBeTheNewNormal #ClimateChange"]

['@EdLatimore Yeah man! It?s wild\\n\\nI worked a couple days in -40 in northern Alberta and that?s what I found\\n\\nWhen the air is still, it feels like nothing\\n\\nAs soon as you start to walk, your face is on fire']


For the wildifre subgraph for example, it seems that both tweets are talking mainly about air quality specifically !

## 2.iii - Trends in Correlations