# Information Retrieval System

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import networkx as nx

%pip install gensim
import gensim.downloader
from gensim.models import Word2Vec

%pip install nltk
import nltk
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# import the graph 
path = 'database_formated_for_NetworkX.graphml'
g = nx.read_graphml(path)

In [3]:
# retrieve the users of our graph
users = []
nodes_users = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':User':
                nodes_users.append(node)
                users.append({node : data})

# exemple of user
print(users[0])

# retrieve the tweets of our graph
tweets = []
for node, data in g.nodes(data=True):
    for key, value in data.items():
        if key == 'labels':
            if value == ':Tweet':
                tweets.append({node : data['text']})

# exemple of tweet
print(tweets[1])

{'n56568': {'labels': ':User', 'statuses_count': 13211, 'favourites_count': 196, 'isVerified': False, 'screen_name': 'jeffseroka', 'followers_count': 537, 'listed_count': 3, 'name': 'Jeff', 'tweets_count': 1, 'id': '54549327', 'friends_count': 400}}
{'n583': 'RT @northfortynews: Tanker helicopter heads up to Paradise Park to drop water on #HighParkFire. http://t.co/7atRS5cy'}


First, let us build a list containing the Poster of a tweet, his tweet, and the degree centrality of the poster

In [4]:
tweets_and_poster = []
list_centralities = nx.degree_centrality(g)

for t in tweets:
    for key, value in t.items():
        tweet_id = key
    neighb = list(g.neighbors(tweet_id))
    for n in neighb:
        if g.nodes[n]['labels'] == ':User':
            tweets_and_poster.append({'poster' : n,
                                      'tweet' : value,
                                      'deg_centrality' : list_centralities[n]})

In [5]:
# import english stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# function to remove the stopwords from a tweet
def remove_stopwords(tweet):
    words = tweet.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# remove stopwords from all the tweets that we consider
for t in tweets_and_poster:
    t['tweet'] = remove_stopwords(t['tweet'])

# exemple of tweets without stopwords
tweets_and_poster[0]

{'poster': 'n97153',
 'tweet': 'RT @northfortynews: Tanker helicopter heads Paradise Park drop water #HighParkFire. http://t.co/7atRS5cy',
 'deg_centrality': 3.6487694525021436e-05}

Now, let us build a function to get the embeddings of each tweet, weighted by the poster degree centrality 

In [7]:
# Load a pretrained Word2Vec model
w2v = gensim.downloader.load('glove-twitter-50')

In [8]:
def weighted_embeddings(tweet_and_poster):
    tweet = tweet_and_poster['tweet']
    if len(tweet)>0:
        tokens = tweet.split()
        embeddings = [w2v[token] for token in tokens if token in w2v]
        
        if len(tokens) > 0 and embeddings:
                avg_embedding = np.mean(embeddings, axis=0)
        else:
            avg_embedding = np.zeros(w2v.vector_size)
    else:
         avg_embedding = np.zeros(w2v.vector_size)

    # muliply the embedding vector by the poster degree centrality
    avg_embedding *= tweet_and_poster['deg_centrality']
    return avg_embedding

In [9]:
# add the weighted embedding key to the tweet and poster list of dictionnaries
for t in tweets_and_poster:
    t['weighted_embedding'] = weighted_embeddings(t)

# exemple of embedding
tweets_and_poster[0]

{'poster': 'n97153',
 'tweet': 'RT @northfortynews: Tanker helicopter heads Paradise Park drop water #HighParkFire. http://t.co/7atRS5cy',
 'deg_centrality': 3.6487694525021436e-05,
 'weighted_embedding': array([-1.5352563e-05,  7.3326587e-06,  8.6894534e-06, -1.1866437e-05,
        -3.6320762e-06,  7.6861324e-06,  1.2904969e-05, -1.3077737e-05,
         1.7441947e-05, -1.0066772e-05,  9.8510382e-06,  1.2501506e-05,
        -1.3010234e-04, -5.6111962e-06,  1.3717749e-05, -7.6460438e-08,
        -4.5607148e-06, -8.2355555e-06, -6.5395091e-07, -1.9738109e-05,
        -2.4089268e-05, -6.0406201e-06,  1.3080290e-05, -1.9026488e-06,
         9.9687668e-06,  2.1328060e-05, -2.2642258e-05,  1.1495822e-05,
        -1.3339501e-05,  1.0603259e-05, -5.0288395e-06,  1.5014541e-05,
         1.4268059e-06,  1.1140606e-06,  2.3357870e-05, -2.6931195e-06,
        -3.5430830e-06,  1.3394040e-05,  1.6297501e-05,  6.8883296e-06,
        -5.2485721e-06, -1.2152229e-06, -2.3539216e-05, -4.2540187e-06,
    

Now, let us build the top-k tweets retrieval of a query

- Since the query is supposed to be built of keywords, it should not contain stopwords

- If we compute the cosine similarity of the embedded query and all the embeddings, we can retrieve the top-k more relevant tweets

In [10]:
def top_k_retrieval(keywords, k):
    print(keywords)
    print() 

    embedding = [w2v[token] for token in keywords if token in w2v]
    embedding = np.mean(embedding, axis=0).reshape(1, -1)

    similarity_list = [cosine_similarity(embedding, i['weighted_embedding'].reshape(1, -1)) for i in tweets_and_poster]
    
    related_tweets = [i['tweet'] for i in tweets_and_poster]

    sorted_similarity_list = sorted(zip(similarity_list, related_tweets), reverse = True)
    sorted_sims, sorted_tweets = zip(*sorted_similarity_list)

    for i in range(len(sorted_sims[:k])):
        print(f'Cosine Sim : {sorted_sims[i][0][0]} \n Tweet : {sorted_tweets[i]} \n')

In [11]:
query = ['air', 'injury', 'help', 'fire', 'congratulations']
k = 10

top_k_retrieval(query, k)

['air', 'injury', 'help', 'fire', 'congratulations']



Cosine Sim : 0.9344030618667603 
 Tweet : UPDATE: #CycloneKenneth made landfall #Mozambique. 700,000 people live cyclone?s path least three lost lives. @WFP ground preparing emergency response. See help here: https://t.co/NUSL0GBG8w https://t.co/fZWm0it37c 

Cosine Sim : 0.9302331805229187 
 Tweet : staff live #Tacloban 7pm @BBCNews city braces battering #TyphoonHagupit - one year devastating #Haiyan 

Cosine Sim : 0.9292325973510742 
 Tweet : Premier @RachelNotley thanks fire crews working fight fire help residents affected #ymmfire 

Cosine Sim : 0.9281186461448669 
 Tweet : RT @emitoms: "There reports Boston Marathon runners crossing finish line continuing run Hospital give blood v ... 

Cosine Sim : 0.9281186461448669 
 Tweet : RT @emitoms: "There reports Boston Marathon runners crossing finish line continuing run Hospital give blood v ... 

Cosine Sim : 0.927777886390686 
 Tweet : @anupdgn : Government trying best speed rescue operation Bhaktapur 

Cosine Sim : 0.9265118837356567 