In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import random

%pip install gensim
import gensim.downloader
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [2]:
chosen_event = 'wildfire'

In [3]:
path = f'subgraphs_data/{chosen_event}_subgraph.graphml'
g = nx.read_graphml(path)

The first step is to try to obtain a dictionnary containing the users id and all of their posts in the network as a dictionnary

In [149]:
# let us first get access to the users of the chosen event
users = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':User':
                users.append({node : data})

# now, let us get the tweets related to the chosen event
tweets = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':Tweet':
                tweets.append({node : data})

In [150]:
# select first a sample of 100 users
random.seed(55)
users_sample = random.sample(users, 100)

In [151]:
# build a list containing dictionnaries with the user_node_id and every tweet made by this user
users_posts = []

for u in users_sample:
    user_node_id = [key for key, _ in u.items()][0]
    tweets_by_user = []
    
    for t in tweets:
        tweet_node_id = [key for key, _ in t.items()][0]
        text_tweet = [value for _, value in t.items()][0]['text']

        if tweet_node_id in g[user_node_id]:
            tweets_by_user.append(text_tweet)

    users_posts.append({'user':user_node_id,
                        'tweets':tweets_by_user})

In [152]:
# we then import a pretrained Word2Vec model, fitted for twitter data 
w2v = gensim.downloader.load('glove-twitter-25')

In [153]:
# we define a function that allows us to tokenize a tweet/sentance, and we take the average of each embedding
def sentence_embedding(sentence):
    tokens = sentence.split()
    embeddings = [w2v[token] for token in tokens if token in w2v]
    #embeddings = [emb for emb in embeddings if emb is not None]
    
    if embeddings:
        avg_embedding = np.mean(embeddings, axis=0)
    else:
        avg_embedding = np.zeros(w2v.vector_size)

    #avg_embedding = np.mean(embeddings, axis=0)
    return avg_embedding

In [154]:
# we then add the embedded tweets to the users_posts list, by adding a key to each dictionnary in the list
for i in users_posts:
    tweets = i['tweets']
    embedded_tweets = []
    for j in i['tweets']:
        emb_tweet = sentence_embedding(j)
        embedded_tweets.append(emb_tweet)
    avg_emb_tweet = np.mean(embedded_tweets, axis=0)
    i['embedded_tweets'] = avg_emb_tweet

In [155]:
# now, let us compute the user-user similarities with a cosine similarity measure
cos_sim = [[0 for _ in range(100)] for _ in range(100)]

for i in range(len(users_posts)):
    for j in range(len(users_posts)):
        if i == j:
            cos_sim[i][j] = 1
        else:
            emb_i = users_posts[i]['embedded_tweets'].reshape(1, -1)
            emb_j = users_posts[j]['embedded_tweets'].reshape(1, -1)
            cos_sim[i][j] = cosine_similarity(emb_i, emb_j)[0][0]

In [156]:
arr_cos_sim = np.array(cos_sim)
np.fill_diagonal(arr_cos_sim, -np.inf)

v = []

for _ in range(10):
    max_index = np.argmax(arr_cos_sim)
    max_row_index, max_col_index = np.unravel_index(max_index, np.array(arr_cos_sim).shape)
    arr_cos_sim[max_row_index][max_col_index] = -np.inf
    arr_cos_sim[max_col_index][max_row_index] = -np.inf
    v.append((max_row_index, max_col_index))

In [157]:
v

[(57, 71),
 (20, 75),
 (92, 99),
 (71, 92),
 (22, 77),
 (33, 57),
 (31, 92),
 (20, 38),
 (71, 99),
 (41, 50)]

In [158]:
# let us try to plot tweets that were considered similar together
for i in range(len(v)):
    first_tweet = users_posts[v[i][0]]['tweets']
    sec_tweet = users_posts[v[i][1]]['tweets']
    print(f'Cosine User Similarity between users {v[i][0]} and {v[i][1]} : {cos_sim[v[i][0]][v[i][1]]}')
    print(first_tweet)
    print(sec_tweet)
    print()

Cosine User Similarity between users 57 and 71 : 0.9958186149597168
['Worst #AirQuality I?ve ever experienced.2.5 weeks on the road in the US talking and learning about #HealthAndWellbeing and return to campfire air from #ABFire. Stay inside, run your #Purifiers and/or go to a place with filtered air. Stop #running outside, please! #ClimateChange', "#AQHI in #YEG #Edmonton is 10+. That. is. bad! Tomorrow it's going down to 5 and the forecast looks promising for this area but continues to be horrible for the North. #ABFire #ThisCantBeTheNewNormal #ClimateChange"]

Cosine User Similarity between users 20 and 75 : 0.9952951669692993
['One member from the Town of Innisfail along with three others from Red Deer County have been deployed to help fight the Chuckegg wildfire. The crew will attend the fire for 7 days, but the Town is prepared to provide additional support if requested.\\n#InnisfailAB #abfire https://t.co/6O9b0KBs33']
['Our Emergency Disaster Services(EDS) team continues to prov

**Note that this interpretation is related to wildfire events only**

Some results are quite interesting to look at here :

- First, it is rather obvious that many users will have a high similarity based on their posts, as they all mention wildfire events, so they are likely to use an identical vocabulary.

- Interesting results can be seen in the example above, as there are similar users for which we can observe some identical words in their post. It is the case for the last example, where we see that both users use the expression : My/our "thoughts".