In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import random

%pip install gensim
import gensim.downloader
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [2]:
chosen_event = 'wildfire'

In [3]:
path = f'subgraphs_data/{chosen_event}_subgraph.graphml'
g = nx.read_graphml(path)

The first step is to try to obtain a dictionnary containing the users id and all of their posts in the network as a dictionnary

In [103]:
# let us first get access to the users of the chosen event
users = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':User':
                users.append({node : data})

# now, let us get the tweets related to the chosen event
tweets = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':Tweet':
                tweets.append({node : data})

In [104]:
# select first a sample of 100 users
users_sample = random.sample(users, 100)

In [105]:
# build a list containing dictionnaries with the user_node_id and every tweet made by this user
users_posts = []

for u in users_sample:
    user_node_id = [key for key, _ in u.items()][0]
    tweets_by_user = []
    
    for t in tweets:
        tweet_node_id = [key for key, _ in t.items()][0]
        text_tweet = [value for _, value in t.items()][0]['text']

        if tweet_node_id in g[user_node_id]:
            tweets_by_user.append(text_tweet)

    users_posts.append({'user':user_node_id,
                        'tweets':tweets_by_user})

In [67]:
# we then import a pretrained Word2Vec model, fitted for twitter data 
w2v = gensim.downloader.load('glove-twitter-25')

In [106]:
# we define a function that allows us to tokenize a tweet/sentance, and we take the average of each embedding
def sentence_embedding(sentence):
    tokens = sentence.split()
    embeddings = [w2v[token] for token in tokens if token in w2v]
    #embeddings = [emb for emb in embeddings if emb is not None]
    
    if embeddings:
        avg_embedding = np.mean(embeddings, axis=0)
    else:
        avg_embedding = np.zeros(w2v.vector_size)

    #avg_embedding = np.mean(embeddings, axis=0)
    return avg_embedding

In [107]:
# we then add the embedded tweets to the users_posts list, by adding a key to each dictionnary in the list
for i in users_posts:
    tweets = i['tweets']
    embedded_tweets = []
    for j in i['tweets']:
        emb_tweet = sentence_embedding(j)
        embedded_tweets.append(emb_tweet)
    avg_emb_tweet = np.mean(embedded_tweets, axis=0)
    i['embedded_tweets'] = avg_emb_tweet

In [110]:
users_posts

[{'user': 'n87821',
  'tweets': ['Alberta: Local emergency, evacuations as wildfires grow. Smoke from these wildfires are causing poor air quality and reduced visibilities, prompting Environment Canada to issue an air quality statement for the High Level area. https://t.co/csrP0ESdxd'],
  'embedded_tweets': array([-0.07259167,  0.15267378, -0.39197123,  0.09393782, -0.16129772,
          0.1721165 ,  0.7235223 , -0.72840047,  0.3575331 ,  0.06393523,
          0.2583738 ,  0.18432489, -3.6803808 ,  0.34233382,  0.34026188,
         -0.12261296,  0.15565377, -0.27480686,  0.3105687 , -0.4527753 ,
         -0.39631817, -0.0576015 , -0.09125669, -0.437162  , -0.5156964 ],
        dtype=float32)},
 {'user': 'n86204',
  'tweets': ['The news and pics coming out of Fort McMurray are unbelievable!?\\n#StaySafe #ymmfire https://t.co/P2yvPlwpPo'],
  'embedded_tweets': array([-0.07493   ,  0.17050958,  0.30005315, -0.4073712 , -0.02270657,
         -0.23016927,  1.3924571 , -0.3015329 , -0.286634

In [125]:
# now, let us compute the user-user similarities with a cosine similarity measure
cos_sim = [[0 for _ in range(100)] for _ in range(100)]

for i in range(len(users_posts)):
    for j in range(len(users_posts)):
        if i == j:
            cos_sim[i][j] = 1
        else:
            emb_i = users_posts[i]['embedded_tweets'].reshape(1, -1)
            emb_j = users_posts[j]['embedded_tweets'].reshape(1, -1)
            cos_sim[i][j] = cosine_similarity(emb_i, emb_j)[0][0]

In [129]:
arr_cos_sim = np.array(cos_sim)
np.fill_diagonal(arr_cos_sim, -np.inf)

v = []

for _ in range(10):
    max_index = np.argmax(arr_cos_sim)
    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim).shape)
    arr_cos_sim[max_row_index][max_col_index] = -np.inf
    arr_cos_sim[max_col_index][max_row_index] = -np.inf
    v.append((max_row_index, max_col_index))

In [134]:
v

[(36, 55),
 (11, 20),
 (55, 72),
 (56, 61),
 (28, 35),
 (76, 85),
 (36, 72),
 (27, 56),
 (32, 55),
 (27, 61)]

In [139]:
# let us try to plot tweets that were considered similar together
for i in range(len(v)):
    first_tweet = users_posts[v[i][0]]['tweets']
    sec_tweet = users_posts[v[i][1]]['tweets']
    print(f'Cosine User Similarity between users {v[i][0]} and {v[i][1]} : {cos_sim[v[i][0]][v[i][1]]}')
    print(first_tweet)
    print(sec_tweet)
    print()

Cosine User Similarity between users 36 and 55 : 0.9954274296760559
["I've seen this tweeted already, but there's only one road in and out of Fort McMurray. #ymmfire"]
["This is bad. 63's the only way out south. #ymmfire https://t.co/mRJoNIHSkx"]

Cosine User Similarity between users 11 and 20 : 0.9953211545944214
['RT @WeatherInsiderr: RED ALERT: What looks to be a FIRE TORNADO just south of BEACON HILL DRIVE.. FRIGHTENING! #ymm #ABfire #ymmfire https:?', "@guacamayan there are people losing everything right now. They'll need help, not lectures from those safely far removed. #ymm #ymmfire"]
['@gage_kristi Kristi they have some burning there too but I?m not exactly sure of where they are all located.  For us ours are up North in the High Level area.   But the smoke has filtered its way here.  Calling for storms tomorrow so we will see.   ?']

Cosine User Similarity between users 55 and 72 : 0.9946712851524353
["This is bad. 63's the only way out south. #ymmfire https://t.co/mRJoNIHSkx"

**Note that this interpretation is related to wildfire events only**

Some results are quite interesting to look at here :

- First, it is rather obvious that many users will have a high similarity based on their posts, as they all mention wildfire events, so they are likely to use an identical vocabulary.

- For the two most similar users, we see that their tweets mention the fact that there is only one way out to escape the wilfire, but differently, so that it was understood by the model that the rough same idea was transmitted in the tweet. 

- Also , on the 6th example given above, where both tweets mention people saving lives, but in a slightly different manner, suggesting that the Word2Vec model understood their similarity, which is very interesting.