# Information Retrieval and Web Analytics
# PROJECT PART 3: Ranking

Gerard Morales - NIA: 242781

Patricia Garay - NIA: 229260

Maren Clapers - NIA: 243397


In [1]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Python packages

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
!pip install demoji

Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [4]:
import collections
from collections import Counter, defaultdict
import json
import math
import matplotlib.pyplot as plt
import numpy as np
from numpy import linalg as la
import pandas as pd
import re
import string
import time

from array import array
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.manifold import TSNE
from wordcloud import WordCloud

import demoji
import spacy

## Documents and Tweets mapping

In [5]:
docs_path = '/content/drive/Shared drives/RIAW/Project/Part1/IRWA_data_2023/Rus_Ukr_war_data.json'

# Open and read the json file
with open(docs_path, 'r') as file:
    data = file.read()

# Split the content into individual JSON objectsÇ
# Each JSON object must be on a separate line
lines = data.strip().split('\n')

# Tweets dictionary identified by ID
tweets_dict = {}

for line in lines:
    tweet = json.loads(line)
    tweet_id = tweet['id']

    # For each tweet: Tweet | Date | Hashtags | Likes | Retweets | Url
    tweet_info = {
        "tweet_id": tweet_id,
        "tweet": tweet.get("full_text", ""),
        "date": tweet.get("created_at", ""),
        "hashtags": [tag["text"] for tag in tweet["entities"]["hashtags"]],
        "likes": tweet.get("favorite_count", 0), # 0 is the default value in case it does not exist
        "retweets": tweet.get("retweet_count", 0),
        "url": tweet['entities']['media'][0]['expanded_url'] if 'media' in tweet['entities'] else ""
    }

    tweets_dict[tweet_id] = tweet_info

In [6]:
# Convert data into a DataFrame with Tweet | Date | Hashtags | Likes | Retweets | Url
tweets_list = list(tweets_dict.values())
tweets_df = pd.DataFrame(tweets_list)

In [7]:
original_df = tweets_df.copy()
original_df.head(5)

Unnamed: 0,tweet_id,tweet,date,hashtags,likes,retweets,url
0,1575918221013979136,@MelSimmonsFCDO Wrong. Dictator Putin's Fascis...,Fri Sep 30 18:39:17 +0000 2022,"[RussiainvadesUkraine, UkraineRussiaWar]",0,0,
1,1575918081461080065,🇺🇦❤️ The Armed Forces liberated the village of...,Fri Sep 30 18:38:44 +0000 2022,"[Drobysheve, Lymansk, Donetsk, UkraineRussiaWa...",0,0,https://twitter.com/Feher_Junior/status/157591...
2,1575917992390823936,ALERT 🚨Poland preps anti-radiation tablets ove...,Fri Sep 30 18:38:23 +0000 2022,"[NATO, Putin, Russia, RussiaInvadedUkraine, Uk...",0,0,
3,1575917907774967809,I’m still waiting for my google map 🗺️ to upda...,Fri Sep 30 18:38:03 +0000 2022,"[Putin, UkraineRussiaWar]",0,0,
4,1575917878410301441,@EmmanuelMacron probably you're right or you h...,Fri Sep 30 18:37:56 +0000 2022,"[European, UkraineRussiaWar]",0,0,


In [8]:
def build_terms(line):
    """
    Preprocess the tweets by removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.

    Argument:
    line -- string (text) to be preprocessed

    Returns:
    tokens - a list of tokens corresponding to the input text after the preprocessing
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    # Remove URLs using a regular expression
    line = re.sub(r'http\S+|www\S+|https\S+', '', line)

    # Replace special characters
    line = line.replace('’', ' ').replace('“', ' ').replace('”', ' ').replace('‘', '')

    line = line.lower()  # Transform in lowercase
    line = demoji.replace(line, '')  # Remove emojis

    # Tokenize the text while preserving hashtags
    tokens = re.findall(r'\w+|#\w+', line)

    # Remove punctuation from words (excluding hashtags)
    tokens = [re.sub(r'[{}]'.format(string.punctuation), '', token) if not token.startswith('#') else token for token in tokens]

    tokens = [w for w in tokens if w not in stop_words]  # Eliminate stopwords
    tokens = [stemmer.stem(w) for w in tokens]  # Perform stemming
    return tokens

In [9]:
## Apply the build_terms() function to the tweets of the dataFrame
tweets_df['tweet'] = tweets_df['tweet'].apply(build_terms)
## Convert 'Date' to a pandas datetime object
tweets_df['date'] = pd.to_datetime(tweets_df['date'])

In [10]:
docs_id_path = '/content/drive/Shared drives/RIAW/Project/Part1/IRWA_data_2023/Rus_Ukr_war_data_ids.csv'
tweet_document_ids = pd.read_csv(docs_id_path, sep='\t', header=None, names=['document_id', 'tweet_id'])
tweet_document_ids_df = pd.DataFrame(tweet_document_ids)
tweet_document_ids_df

Unnamed: 0,document_id,tweet_id
0,doc_1,1575918221013979136
1,doc_2,1575918081461080065
2,doc_3,1575917992390823936
3,doc_4,1575917907774967809
4,doc_5,1575917878410301441
...,...,...
3995,doc_3996,1575154617620504576
3996,doc_3997,1575154444165156864
3997,doc_3998,1575154440012812288
3998,doc_3999,1575154351273873410


In [11]:
## Joining the dataframes 'tweets_df' and 'tweet_document_ids_df'
tweets_df = pd.merge(tweets_df, tweet_document_ids_df, how='inner', on='tweet_id')
tweets_df.head()

Unnamed: 0,tweet_id,tweet,date,hashtags,likes,retweets,url,document_id
0,1575918221013979136,"[melsimmonsfcdo, wrong, dictat, putin, fascist...",2022-09-30 18:39:17+00:00,"[RussiainvadesUkraine, UkraineRussiaWar]",0,0,,doc_1
1,1575918081461080065,"[arm, forc, liber, villag, #drobyshev, #lymans...",2022-09-30 18:38:44+00:00,"[Drobysheve, Lymansk, Donetsk, UkraineRussiaWa...",0,0,https://twitter.com/Feher_Junior/status/157591...,doc_2
2,1575917992390823936,"[alert, poland, prep, anti, radiat, tablet, nu...",2022-09-30 18:38:23+00:00,"[NATO, Putin, Russia, RussiaInvadedUkraine, Uk...",0,0,,doc_3
3,1575917907774967809,"[still, wait, googl, map, updat, russia, new, ...",2022-09-30 18:38:03+00:00,"[Putin, UkraineRussiaWar]",0,0,,doc_4
4,1575917878410301441,"[emmanuelmacron, probabl, right, say, anyway, ...",2022-09-30 18:37:56+00:00,"[European, UkraineRussiaWar]",0,0,,doc_5


# PART 2: Indexing and Evaluation

These are our 5 queries defined in the PART 2 of this project:

In [12]:
query1 = 'Presidents visiting Kyiv'
query2 = 'Countries supporting Ukraine'
query3 = 'Humanitarian aid in Ukraine'
query4 = 'Citizens fleeing Ukraine'
query5 = 'Putin and Zelensky peace talks'

# PART 3: Ranking

## 1. Two ways of ranking

### 1.1. TF-IDF + cosine similarity

Function to implement the inverted index and compute tf, df and idf:

In [13]:
def create_index_tfidf(lines, num_tweets):

    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    tweet_id_index = dict(zip(lines['tweet_id'], lines['tweet']))
    idf = defaultdict(float)

    for id, row in lines.iterrows():

        tweet_id = row['tweet_id']
        tweet_text = row['tweet']
        terms = tweet_text

        current_tweet_index = {}

        for position, term in enumerate(terms): # terms contains tweet_text. Loop over all terms
            try:
                current_tweet_index[term][1].append(position)
            except:
                current_tweet_index[term]=[tweet_id, array('I',[position])]

        #normalize term frequencies
        norm = 0
        for term, posting in current_tweet_index.items():
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        # calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term_tweet, posting_tweet in current_tweet_index.items():
            tf[term_tweet].append(np.round(len(current_tweet_index[term_tweet])/norm,4))
            df[term_tweet] += 1

        #merge the current page index with the main index
        for term_tweet, posting_tweet in current_tweet_index.items():
            index[term_tweet].append(posting_tweet)

        # Compute IDF
        for term in df:
            idf[term] = np.round(np.log(float(num_tweets/df[term])), 4)

    return index, tf, df, idf, tweet_id_index

Index creation using tf-idf:

In [14]:
lines = tweets_df
num_tweets = len(tweets_df)

start_time = time.time()
index, tf, df, idf, tweet_id_index = create_index_tfidf(lines, num_tweets)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 250.52 seconds


Function to perform the ranking of the results of a search based on the **TF-IDF** weights using **cosine similarity**:

In [15]:
def rank_documents(terms, tweets, index, idf, tf):

    # We are interested only on the element of the docVector corresponding to the query terms
    tweet_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query.

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex]=query_terms_count[term]/query_norm*idf[term]


        # Generate tweet_vectors for matching tweets
        for tweet_index, (tweet, postings) in enumerate(index[term]):
            if tweet in tweets:
              tweet_vectors[tweet][termIndex] = tf[term][tweet_index] * idf[term]

    # Calculate the score of each doc
    # compute the cosine similarity between queyVector and each docVector:
    tweet_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in tweet_vectors.items() ]
    tweet_scores.sort(reverse=True)
    result_tweets = [x[1] for x in tweet_scores]
    tweet_scores= [x[0] for x in tweet_scores]

    if len(result_tweets) == 0:
        print("No results found, try again")
        query = input()
        tweets = search_tf_idf(query, index)

    return result_tweets, tweet_scores

Function to get the list of documents that contain any of the query terms:

In [16]:
def search_tf_idf(query, index):
    query = build_terms(query)
    tweets = set()
    for term in query:
        try:
            # store in term_tweets the ids of the tweets that contain "term"
            term_tweets=[posting[0] for posting in index[term]]

            # tweets = tweets Union term_tweets
            tweets = tweets.union(term_tweets)
        except:
            #term is not in index
            pass
    tweets = list(tweets)
    ranked_tweets, tweet_scores = rank_documents(query, tweets, index, idf, tf)
    return ranked_tweets, tweet_scores

### Ranking TF-IDF

#### Query 1:


In [17]:
ranked_tweets, tweet_scores = search_tf_idf(query1, index)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 1:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1


Top 5 results out of 125 for the query 1:
[0m
[1mTweet 1:[0m
Tweet_id = 1575768808337412098
Tweet text: #UkraineRussiaWar #UK #UnitedKingdom #GreatBritain #Kyiv #SupportUkraine #StandWithUkraine British defense minister paid secret visit to Kyiv this week https://t.co/AyeLy3StYb

[1mTweet 2:[0m
Tweet_id = 1575827125159940096
Tweet text: Russian President Putin calls on Kyiv to 'immediately' cease military action 
https://t.co/duGzYIEnuR 
#Putin #Russian #UkraineRussiaWar https://t.co/xMaaRgA5Hw

[1mTweet 3:[0m
Tweet_id = 1575827101030064128
Tweet text: #UkraineRussiaWar | Russian President Vladimir Putin calls on Kyiv to 'immediately' cease military action | reported by news agency AFP

[1mTweet 4:[0m
Tweet_id = 1575511622302695425
Tweet text: Clown Steven Seagal visits injured Russian soldiers in hospital.
#Russian #Russia #Ukraine #Ukrainian #UkraineWar #UkraineRussiaWar #RussiaIsATerroristState https://t.co/lN4ZWr7QAI

[1mTweet 5:[0m
Tweet_id = 1575905170952982529
Tweet 

#### Query 2:

In [18]:
ranked_tweets, tweet_scores = search_tf_idf(query2, index)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 2:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1



Top 5 results out of 1034 for the query 2:
[0m
[1mTweet 1:[0m
Tweet_id = 1575159106356203523
Tweet text: What about our ability to defense our country??? #UkraineRussiaWar 
https://t.co/zCnIPPU3sU

[1mTweet 2:[0m
Tweet_id = 1575669419551862784
Tweet text: No fly zones, loss of overseas Russian assets with financial compensation to Ukraine, with increased support from countries on the side lines atm.
#Ukraine #UkraineWar #UkrainianArmy #UkraineRussiaWar #UkraineWillWin #Russia #Russian #RussianArmy https://t.co/YIr358oh2z

[1mTweet 3:[0m
Tweet_id = 1575784366957301760
Tweet text: #DOPPELGANGER: How Russia-based actors cloned legitimate media outlets from multiple countries (🇩🇪🇬🇧🇫🇷🇮🇹🇱🇻🇺🇦) to spread #disinformation designed to undermine the support for #Ukraine. #UkraineRussiaWar 

@DisinfoEU last investigation: https://t.co/lydkTWOgW6

[1mTweet 4:[0m
Tweet_id = 1575206589287452672
Tweet text: @Google why #ukraine language is not supported? #UkraineRussiaWar https://t.co/dEvBOpa

#### Query 3:

In [19]:
ranked_tweets, tweet_scores = search_tf_idf(query3, index)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 3:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1



Top 5 results out of 937 for the query 3:
[0m
[1mTweet 1:[0m
Tweet_id = 1575764098260029440
Tweet text: The people in the convoy had planned to travel to the Russian-occupied part of #Zaporizhzhia to pick up their relatives and deliver humanitarian aid.
#RussianUkrainianWar #Ukraine #StandWithUkraine #civiliancasualties #UkraineRussiaWar #UkraineCrisis #PutinWarCriminal https://t.co/IKbZRk6dTH

[1mTweet 2:[0m
Tweet_id = 1575722456823738370
Tweet text: ❗Russian missile attack on the humanitarian convoy from Zaporizhzhia to occupied territories. People were heading to rescue their relatives left in russia-controlled areas and provide humanitarian aid.

#UkraineRussiaWar https://t.co/ZlYntJRliA

[1mTweet 3:[0m
Tweet_id = 1575757996663377920
Tweet text: 30.09.22.
Zaporizhzhia, Ukraine. 

Those people are volunteers who wanted to give some humanitarian aid for people there and leave town with their relatives. But russia had another plan. 

#russiaisateroriststate #RussiaIsANaziState

#### Query 4:

In [20]:
ranked_tweets, tweet_scores = search_tf_idf(query4, index)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 4:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1



Top 5 results out of 974 for the query 4:
[0m
[1mTweet 1:[0m
Tweet_id = 1575677477460275200
Tweet text: I have been hearing that thousands of Russian men fleeing to Mongolia 🇲🇳. #UkraineRussiaWar https://t.co/Dz9P48CuIV

[1mTweet 2:[0m
Tweet_id = 1575688594530115586
Tweet text: Russia bombing innocent citizens.
This must stop.
#UkraineRussiaWar https://t.co/esEBivfx2A

[1mTweet 3:[0m
Tweet_id = 1575180780568924160
Tweet text: Lukashenko is advising Putin not to worry about those fleeing mobilization: 
#Russia #UkraineRussiaWar #Belarus https://t.co/Lu55nrfphL

[1mTweet 4:[0m
Tweet_id = 1575749242412761089
Tweet text: #UkraineRussiaWar #Ukraine #Russia

🌐 Social media
Russians forces reportedly flees from #Lyman https://t.co/q59JOGhM9E

[1mTweet 5:[0m
Tweet_id = 1575639975567073280
Tweet text: Opinion | Russians flee as Putin announces Ukraine annexation - The Washington Post https://t.co/zFZnMnLdlV #Ukraine  #UkraineRussiaWar #UkraineUnderAttack #UkraineWarNews



#### Query 5:

In [21]:
ranked_tweets, tweet_scores = search_tf_idf(query5, index)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 5:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1



Top 5 results out of 591 for the query 5:
[0m
[1mTweet 1:[0m
Tweet_id = 1575916840286507009
Tweet text: I don’t recall all of this peace/truce talk when people thought Russia was winning. #UkraineRussiaWar

[1mTweet 2:[0m
Tweet_id = 1575906959450337281
Tweet text: ⚡️Zelensky rules out peace talks following Moscow’s annexation of huge swathes of four Ukrainian regions.
There's no way to peace talking with a terrorist State
https://t.co/aRQwsKpZlL

#WARINUKRAINE #UKRAINEWAR #UKRAINERUSSIAWAR #UKRAINE https://t.co/ci9f159F7D

[1mTweet 3:[0m
Tweet_id = 1575824812835950592
Tweet text: BREAKING: Putin says "We are ready for talks" 

#UkraineRussiaWar

[1mTweet 4:[0m
Tweet_id = 1575169966126800896
Tweet text: As you can see, Azerbaijan's peace talks with Armenia are not yielding any results.- @ChairOlek

@ElnurMammadli1 
#UkraineRussiaWar #RussianArmy #PutinWarCriminal #UkraineInvasion https://t.co/sgSkbszv8g

[1mTweet 5:[0m
Tweet_id = 1575908651658641408
Tweet text: ⚡️"#Ukraine i

### 1.2. Our-score + cosine similarity

Function to implement the inverted index like before but computing the average tweet length used in **BM25 (Best Matching 25)**:

In [22]:
def create_index_bm25(lines, num_tweets):
    """
    Implement the inverted index and compute tf, df, idf, and average tweet length.

    Argument:
    lines -- collection of tweets
    num_tweets -- total number of tweets

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of tweets these keys appear in (and the positions) as values.
    tf - normalized term frequency for each term in each tweet
    df - number of tweets each term appears in
    idf - inverse tweet frequency of each term
    avg_tweet_length - average length of tweets in the corpus
    """

    index = defaultdict(list)
    tf = defaultdict(list)  # term frequencies of terms in tweets
    df = defaultdict(int)  # tweet frequencies of terms in the corpus
    tweet_id_index = dict(zip(lines['tweet_id'], lines['tweet']))  # dictionary with tweet ids and the tweets' text
    idf = defaultdict(float)
    total_tweet_length = 0

    for id, row in lines.iterrows():  # iterates through tweets dataframe

        tweet_id = row['tweet_id']
        tweet_text = row['tweet']
        terms = tweet_text

        current_tweet_index = {}

        for position, term in enumerate(terms):  # terms contain tweet_text. Loops over all terms:
            try:
                # if the term is already in the dict append the position to the corresponding list
                current_tweet_index[term][1].append(position)
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                current_tweet_index[term] = [tweet_id, array('I', [position])]

        # normalize term frequencies
        norm = 0
        for term, posting in current_tweet_index.items():
            # posting will contain the list of positions for the current term in the current tweet.
            norm += len(posting[1]) ** 2
        norm = math.sqrt(norm)

        # calculate the tf (dividing the term frequency by the above computed norm) and df weights
        for term_tweet, posting_tweet in current_tweet_index.items():
            # append the tf for the current term (tf = term frequency in the current tweet/norm)
            tf[term_tweet].append(np.round(len(current_tweet_index[term_tweet]) / norm, 4))
            # increment the tweet frequency of the current term (number of tweets containing the current term)
            df[term_tweet] += 1

        # merge the current page index with the main index
        for term_tweet, posting_tweet in current_tweet_index.items():
            index[term_tweet].append(posting_tweet)

        total_tweet_length += len(terms)

    # Compute IDF
    for term in df:
        idf[term] = np.round(np.log(float(num_tweets / df[term])), 4)

    # Calculate the average tweet length
    avg_tweet_length = total_tweet_length / num_tweets

    return index, tf, df, idf, tweet_id_index, avg_tweet_length


Index creation using bm25:

In [23]:
start_time = time.time()
index, tf, df, idf, tweet_id_index, avg_doc_length = create_index_bm25(lines, num_tweets)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 1.2 seconds


Function to perform the ranking of the results of a search based on the BM25 weights and cosine similarity

In [24]:
def rank_documents_bm25(terms, tweets, index, idf, tf, avg_doc_length):

    tweet_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        # Compute BM25 weights for the query terms
        k1 = 1.5
        b = 0.75
        tf_query = query_terms_count[term]
        idf_term = idf[term]

        query_vector[termIndex] = ((k1 + 1) * tf_query) / (k1 * ((1 - b) + b * (avg_doc_length / avg_doc_length)) + tf_query) * idf_term

        # Generate tweet_vectors for matching tweets
        for tweet_index, (tweet, postings) in enumerate(index[term]):
            if tweet in tweets:
                # BM25 Calculus
                tf_tweet = tf[term][tweet_index]
                # doc_length = len(postings[1])  # No se necesita si postings es simplemente la posición del término
                BM25_score = ((tf_tweet * (2.0 + 1.0)) / (tf_tweet + 2.0 * (1.0 - b + b * (avg_doc_length / avg_doc_length))))
                tweet_vectors[tweet][termIndex] = BM25_score

    # Calculate the score of each doc
    tweet_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in tweet_vectors.items()]
    tweet_scores.sort(reverse=True)
    result_tweets = [x[1] for x in tweet_scores]
    tweet_scores = [x[0] for x in tweet_scores]

    if len(result_tweets) == 0:
        print("No results found, try again")
        query = input()
        tweets = search_bm25(query, index, idf, tf, avg_doc_length)

    return result_tweets, tweet_scores


Function to perform the ranking of the results of a search based on BM25 algorithm:

In [25]:
def search_bm25(query, index, avg_doc_length):
    query = build_terms(query)
    tweets = set()
    for term in query:
        try:
            term_tweets = [posting[0] for posting in index[term]]
            tweets = tweets.union(term_tweets)
        except:
            pass
    tweets = list(tweets)
    ranked_tweets, tweet_scores = rank_documents_bm25(query, tweets, index, idf, tf, avg_doc_length)
    return ranked_tweets, tweet_scores


### 1.2.1 Ranking Our-score

#### Query 1:


In [26]:
ranked_tweets, tweet_scores = search_bm25(query1, index, avg_doc_length)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 1:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1


Top 5 results out of 125 for the query 1:
[0m
[1mTweet 1:[0m
Tweet_id = 1575768808337412098
Tweet text: #UkraineRussiaWar #UK #UnitedKingdom #GreatBritain #Kyiv #SupportUkraine #StandWithUkraine British defense minister paid secret visit to Kyiv this week https://t.co/AyeLy3StYb

[1mTweet 2:[0m
Tweet_id = 1575827125159940096
Tweet text: Russian President Putin calls on Kyiv to 'immediately' cease military action 
https://t.co/duGzYIEnuR 
#Putin #Russian #UkraineRussiaWar https://t.co/xMaaRgA5Hw

[1mTweet 3:[0m
Tweet_id = 1575827101030064128
Tweet text: #UkraineRussiaWar | Russian President Vladimir Putin calls on Kyiv to 'immediately' cease military action | reported by news agency AFP

[1mTweet 4:[0m
Tweet_id = 1575905170952982529
Tweet text: ⚡️BREAKING: After Putin's unilateral declaration of the annexation of occupied Ukraine, President Volodymyr Zelensky has announced that Kyiv has formally requested to join NATO.
https://t.co/aRQwsKpZlL

#WARINUKRAINE #UKRAINEWAR #UKRAIN

#### Query 2:

In [27]:
ranked_tweets, tweet_scores = search_bm25(query2, index, avg_doc_length)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 2:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1



Top 5 results out of 1034 for the query 2:
[0m
[1mTweet 1:[0m
Tweet_id = 1575669419551862784
Tweet text: No fly zones, loss of overseas Russian assets with financial compensation to Ukraine, with increased support from countries on the side lines atm.
#Ukraine #UkraineWar #UkrainianArmy #UkraineRussiaWar #UkraineWillWin #Russia #Russian #RussianArmy https://t.co/YIr358oh2z

[1mTweet 2:[0m
Tweet_id = 1575784366957301760
Tweet text: #DOPPELGANGER: How Russia-based actors cloned legitimate media outlets from multiple countries (🇩🇪🇬🇧🇫🇷🇮🇹🇱🇻🇺🇦) to spread #disinformation designed to undermine the support for #Ukraine. #UkraineRussiaWar 

@DisinfoEU last investigation: https://t.co/lydkTWOgW6

[1mTweet 3:[0m
Tweet_id = 1575902847216095232
Tweet text: Is our financial support going to the right cause to help Ukraine people? Or going to the pockets of Zelensky?  We saw his Caribean tax heaven secret accounts been exposed by Pandora paper, &amp;NATO sent weapons been sold to 3rd countries

#### Query 3:

In [28]:
ranked_tweets, tweet_scores = search_bm25(query3, index, avg_doc_length)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 3:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1



Top 5 results out of 937 for the query 3:
[0m
[1mTweet 1:[0m
Tweet_id = 1575757996663377920
Tweet text: 30.09.22.
Zaporizhzhia, Ukraine. 

Those people are volunteers who wanted to give some humanitarian aid for people there and leave town with their relatives. But russia had another plan. 

#russiaisateroriststate #RussiaIsANaziState #Zaporizhzhia #Ukraine #UkraineRussiaWar #war https://t.co/KAphafHmGC

[1mTweet 2:[0m
Tweet_id = 1575764098260029440
Tweet text: The people in the convoy had planned to travel to the Russian-occupied part of #Zaporizhzhia to pick up their relatives and deliver humanitarian aid.
#RussianUkrainianWar #Ukraine #StandWithUkraine #civiliancasualties #UkraineRussiaWar #UkraineCrisis #PutinWarCriminal https://t.co/IKbZRk6dTH

[1mTweet 3:[0m
Tweet_id = 1575722456823738370
Tweet text: ❗Russian missile attack on the humanitarian convoy from Zaporizhzhia to occupied territories. People were heading to rescue their relatives left in russia-controlled areas an

#### Query 4:

In [29]:
ranked_tweets, tweet_scores = search_bm25(query4, index, avg_doc_length)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 4:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1



Top 5 results out of 974 for the query 4:
[0m
[1mTweet 1:[0m
Tweet_id = 1575639975567073280
Tweet text: Opinion | Russians flee as Putin announces Ukraine annexation - The Washington Post https://t.co/zFZnMnLdlV #Ukraine  #UkraineRussiaWar #UkraineUnderAttack #UkraineWarNews

[1mTweet 2:[0m
Tweet_id = 1575517390724538372
Tweet text: Draft Dodgers Slam Vladimir Putin’s War in Ukraine After Fleeing Russia https://t.co/dZY108A52g #Ukraine  #UkraineRussiaWar #UkraineUnderAttack #UkraineWarNews

[1mTweet 3:[0m
Tweet_id = 1575244207463145474
Tweet text: Russians flee to Kazakhstan to avoid call-up for war in Ukraine.

#Russians #RussianMobilization #RussiaUkraineWar #UkraineRussiaWar #Ukraina #Kazakhstan https://t.co/Zy67cjlB6a

[1mTweet 4:[0m
Tweet_id = 1575289432869163008
Tweet text: Reportedly, more people flee Ukraine in one week than all the Russians that have left their country because they don’t want to fight in the war. #ukrainerussia #UkraineRussiaWar

[1mTweet 5:[0m
Twe

#### Query 5:

In [30]:
ranked_tweets, tweet_scores = search_bm25(query5, index, avg_doc_length)
top = 5

print("\n\033[1m======================\nTop {} results out of {} for the query 5:\n\033[0m".format(top, len(ranked_tweets)))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1



Top 5 results out of 591 for the query 5:
[0m
[1mTweet 1:[0m
Tweet_id = 1575906959450337281
Tweet text: ⚡️Zelensky rules out peace talks following Moscow’s annexation of huge swathes of four Ukrainian regions.
There's no way to peace talking with a terrorist State
https://t.co/aRQwsKpZlL

#WARINUKRAINE #UKRAINEWAR #UKRAINERUSSIAWAR #UKRAINE https://t.co/ci9f159F7D

[1mTweet 2:[0m
Tweet_id = 1575916840286507009
Tweet text: I don’t recall all of this peace/truce talk when people thought Russia was winning. #UkraineRussiaWar

[1mTweet 3:[0m
Tweet_id = 1575908651658641408
Tweet text: ⚡️"#Ukraine is ready to talk with Russia, but only with a different Russian president."
Zelensky words after Putin’s speech formalizing the annexation, in which he asked Kyiv for negotiations to end the war
https://t.co/aRQwsKH2nL

#WARINUKRAINE #UKRAINEWAR #UKRAINERUSSIAWAR https://t.co/DLwKgztFqG

[1mTweet 4:[0m
Tweet_id = 1575824812835950592
Tweet text: BREAKING: Putin says "We are ready for talks

## 2. Word2vec + cosine similarity

Traing the model **Word2Vec** with our tweets:

In [31]:
tweets = tweets_df['tweet'].tolist()
model = Word2Vec(sentences=tweets, vector_size=100, window=5, min_count=1, workers=4)

Function to implement the inverted index and compute word vectors using Word2Vec

In [32]:
def create_index_word2vec(lines, model):

    index = defaultdict(list)
    word2vector = defaultdict(list)

    for id, row in lines.iterrows():
        tweet_id = row['tweet_id']
        terms = row['tweet']

        word2vector[tweet_id] = np.mean([model.wv[word] for word in terms if word in model.wv], axis=0)

        for term in terms:
            index[term].append(tweet_id)

    return index, word2vector

In [33]:
start_time = time.time()
index, word2vector = create_index_word2vec(lines, model)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2)))

Total time to create the index: 0.59 seconds


Function to perform the ranking of the results of a search based on Word2Vec word vectors

In [34]:
def rank_documents_word2vec(query_terms, tweets, word2vector, model):
    """
    Perform the ranking of the results of a search based on Word2Vec word vectors

    Arguments:
    query_terms -- list of query terms
    tweets -- list of tweets, to rank, matching the query
    word2vector -- word vectors for each term in each tweet

    Returns:
    Print the list of ranked documents
    """

    # Compute the query_vector
    query_vector = np.array([model.wv[term] for term in query_terms if term in model.wv.key_to_index])
    query_vector = np.mean(query_vector, axis=0)

    # Calculate the score of each doc
    tweet_scores = [[np.dot(tweet_vector, query_vector), tweet] for tweet, tweet_vector in word2vector.items() if tweet in tweets] #if tweet in tweets
    tweet_scores.sort(reverse=True)

    result_tweets = [x[1] for x in tweet_scores]
    tweet_scores = [x[0] for x in tweet_scores]

    if len(tweet_scores) == 0:
      print("No results found, try again")

    return result_tweets, tweet_scores


Function to perform the ranking of the results of a search based on Word2Vec:

In [35]:
def search_word2vec(query, index, word2vector, model):
    """
    Output is the list of tweets that contain any of the query terms.
    So, we will get the list of tweets for each query term, and take the union of them.
    """
    query = build_terms(query)
    tweets = set()
    for term in query:
        try:
            # Store in term_tweets the ids of the tweets that contain "term"
            term_tweets = [tweet for tweet in index[term]]
            # Tweets = tweets Union term_tweets
            tweets = tweets.union(term_tweets)
        except:
            # Term is not in index
            pass

    tweets = list(tweets)
    ranked_tweets, tweet_scores = rank_documents_word2vec(query, tweets, word2vector, model)
    return ranked_tweets, tweet_scores

### 2.1. TOP-20 ranking Word2Vec

#### Query 1:


In [36]:
ranked_tweets, tweet_scores = search_word2vec(query1, index, word2vector, model)
top = 20

print("\n\033[1m======================\nTop {} results out of {} for the query 1: {}\n\033[0m".format(top, len(ranked_tweets), query1))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print('\033[1mTweet Score: {}\033[0m'.format(tweet_scores[count]))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1


Top 20 results out of 125 for the query 1: Presidents visiting Kyiv
[0m
[1mTweet 1:[0m
[1mTweet Score: 10.832234382629395[0m
Tweet_id = 1575860740556460032
Tweet text: 🇺🇦🇷🇺Ukraine will not hold any negotiations with Russia while Putin is president, Zelensky stated (September 30, 2022).

#Ukraine #Ukrainewar #UkraineRussiaWar #Zelensky

[1mTweet 2:[0m
[1mTweet Score: 10.792501449584961[0m
Tweet_id = 1575861873060499457
Tweet text: NOW - Ukraine's president Zelensky signs #NATO application.
#Russia #RussiaInvadedUkraine #Ukraine #UkraineRussiaWar
https://t.co/hgqk2iH3kb

[1mTweet 3:[0m
[1mTweet Score: 10.756027221679688[0m
Tweet_id = 1575835227561553920
Tweet text: Russian President Vladimir Putin Opens Signing Event at Kremlin To Annex Parts of Ukraine

#Russia #Ukraine #UkraineRussiaWar #Putin 

https://t.co/ZPAOWy7FXL

[1mTweet 4:[0m
[1mTweet Score: 10.404382705688477[0m
Tweet_id = 1575836501539454976
Tweet text: The 🇷🇺president #Putin has just signed the decree to a

#### Query 2

In [37]:
ranked_tweets, tweet_scores = search_word2vec(query2, index, word2vector, model)
top = 20

print("\n\033[1m======================\nTop {} results out of {} for the query 2: {}\n\033[0m".format(top, len(ranked_tweets), query2))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print('\033[1mTweet Score: {}\033[0m'.format(tweet_scores[count]))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1


Top 20 results out of 1034 for the query 2: Countries supporting Ukraine
[0m
[1mTweet 1:[0m
[1mTweet Score: 29.720504760742188[0m
Tweet_id = 1575858789647286272
Tweet text: Artillery For Ukraine 💪
#Russians #Russia #Ukraine #Ukrainian #UkraineWar #UkraineRussiaWar #RussiaIsATerroristState https://t.co/OyVPWJCXzs

[1mTweet 2:[0m
[1mTweet Score: 29.49874496459961[0m
Tweet_id = 1575326053219946497
Tweet text: Russia Ukraine updates 
#Russia #RussiaInvadedUkraine #Ukraine #UkraineRussiaWar https://t.co/3GsEzHPKGO

[1mTweet 3:[0m
[1mTweet Score: 29.49874496459961[0m
Tweet_id = 1575887848477032461
Tweet text: #Russia #USA 
War in #Ukraine | Russian-Ukraine #NATO Video Archive
2022 Russian invasion of Ukraine

#Putin #Russian #RussianArmy #RussiaUkraineWar  #UkraineRussiaWar #Russians #Kharkiv #Zelensky #UkraineWar #Kherson #Biden
https://t.co/ksX8tsX39O

[1mTweet 4:[0m
[1mTweet Score: 29.49874496459961[0m
Tweet_id = 1575821996465586177
Tweet text: #Russia #USA 
War in #Ukra

#### Query 3

In [38]:
ranked_tweets, tweet_scores = search_word2vec(query3, index, word2vector, model)
top = 20

print("\n\033[1m======================\nTop {} results out of {} for the query 3: {}\n\033[0m".format(top, len(ranked_tweets), query3))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print('\033[1mTweet Score: {}\033[0m'.format(tweet_scores[count]))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1


Top 20 results out of 937 for the query 3: Humanitarian aid in Ukraine
[0m
[1mTweet 1:[0m
[1mTweet Score: 19.77367401123047[0m
Tweet_id = 1575858789647286272
Tweet text: Artillery For Ukraine 💪
#Russians #Russia #Ukraine #Ukrainian #UkraineWar #UkraineRussiaWar #RussiaIsATerroristState https://t.co/OyVPWJCXzs

[1mTweet 2:[0m
[1mTweet Score: 19.72146224975586[0m
Tweet_id = 1575326053219946497
Tweet text: Russia Ukraine updates 
#Russia #RussiaInvadedUkraine #Ukraine #UkraineRussiaWar https://t.co/3GsEzHPKGO

[1mTweet 3:[0m
[1mTweet Score: 19.72146224975586[0m
Tweet_id = 1575887848477032461
Tweet text: #Russia #USA 
War in #Ukraine | Russian-Ukraine #NATO Video Archive
2022 Russian invasion of Ukraine

#Putin #Russian #RussianArmy #RussiaUkraineWar  #UkraineRussiaWar #Russians #Kharkiv #Zelensky #UkraineWar #Kherson #Biden
https://t.co/ksX8tsX39O

[1mTweet 4:[0m
[1mTweet Score: 19.72146224975586[0m
Tweet_id = 1575821996465586177
Tweet text: #Russia #USA 
War in #Ukraine

#### Query 4

In [39]:
ranked_tweets, tweet_scores = search_word2vec(query4, index, word2vector, model)
top = 20

print("\n\033[1m======================\nTop {} results out of {} for the query 4: {}\n\033[0m".format(top, len(ranked_tweets), query4))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print('\033[1mTweet Score: {}\033[0m'.format(tweet_scores[count]))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1


Top 20 results out of 974 for the query 4: Citizens fleeing Ukraine
[0m
[1mTweet 1:[0m
[1mTweet Score: 21.733232498168945[0m
Tweet_id = 1575858789647286272
Tweet text: Artillery For Ukraine 💪
#Russians #Russia #Ukraine #Ukrainian #UkraineWar #UkraineRussiaWar #RussiaIsATerroristState https://t.co/OyVPWJCXzs

[1mTweet 2:[0m
[1mTweet Score: 21.696048736572266[0m
Tweet_id = 1575326053219946497
Tweet text: Russia Ukraine updates 
#Russia #RussiaInvadedUkraine #Ukraine #UkraineRussiaWar https://t.co/3GsEzHPKGO

[1mTweet 3:[0m
[1mTweet Score: 21.696048736572266[0m
Tweet_id = 1575887848477032461
Tweet text: #Russia #USA 
War in #Ukraine | Russian-Ukraine #NATO Video Archive
2022 Russian invasion of Ukraine

#Putin #Russian #RussianArmy #RussiaUkraineWar  #UkraineRussiaWar #Russians #Kharkiv #Zelensky #UkraineWar #Kherson #Biden
https://t.co/ksX8tsX39O

[1mTweet 4:[0m
[1mTweet Score: 21.696048736572266[0m
Tweet_id = 1575821996465586177
Tweet text: #Russia #USA 
War in #Ukrain

#### Query 5

In [40]:
ranked_tweets, tweet_scores = search_word2vec(query5, index, word2vector, model)
top = 20

print("\n\033[1m======================\nTop {} results out of {} for the query 5: {}\n\033[0m".format(top, len(ranked_tweets), query5))
count = 1
for t_id in ranked_tweets[:top]:
    original_tweet = original_df[original_df['tweet_id'] == t_id]
    print('\033[1mTweet {}:\033[0m'.format(count))
    print('\033[1mTweet Score: {}\033[0m'.format(tweet_scores[count]))
    print("Tweet_id = {}\nTweet text: {}\n".format(t_id, original_tweet['tweet'].values[0]))

    count += 1


Top 20 results out of 591 for the query 5: Putin and Zelensky peace talks
[0m
[1mTweet 1:[0m
[1mTweet Score: 20.311811447143555[0m
Tweet_id = 1575836603863699458
Tweet text: #UkraineRussiaWar | Putin Says "4 New Regions" As #Russia Annexes #Ukraine Territory https://t.co/65v5pd18Ma https://t.co/DQ5bROSQpY

[1mTweet 2:[0m
[1mTweet Score: 19.56490707397461[0m
Tweet_id = 1575823885098156032
Tweet text: PUTIN: THERE ARE FOUR NEW REGIONS OF RUSSIA

#Russia #RussiaInvadedUkraine #Ukraine #UkraineRussiaWar

[1mTweet 3:[0m
[1mTweet Score: 19.334819793701172[0m
Tweet_id = 1575828107721834498
Tweet text: BREAKING | Putin announces annexation of Ukrainian territory
https://t.co/5sheapVnjT
#Putin #Ukraine #UkraineRussiaWar #Russia #RussiaUkraineWar

[1mTweet 4:[0m
[1mTweet Score: 19.32992935180664[0m
Tweet_id = 1575204715473739776
Tweet text: "Putin Supporters Back “referendums” to Annex East Ukraine
Russian-Ukraine #NATO Video Archive
#Russia #Ukraine #Putin #Russian #RussianMob