# **IRWA_LAB_PART_3**

**Ranking score:**
Given a query, we want to get the top-20 documents related to the query.

**GOAL:**
Find all the documents that contain all the words in the query and sort them by
their relevance with regard to the query.

# **0. LOADINGS**

In [None]:
# Mount Google Drive so the data files under /content/drive can be opened below.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import nltk
nltk.download('stopwords')

from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import string
import re
import pandas as pd
from collections import Counter
import math


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the tweet collection: the file holds one JSON object per line.
docs_path = '/content/drive/MyDrive/RIAW/IRWA_data_2024/data/farmers-protest-tweets.json'
with open(docs_path, encoding='utf-8') as fp:
    # Blank lines are skipped because json.loads rejects an empty string.
    lines = [line.strip() for line in fp if line.strip()]

# The raw lines are parsed untouched. A literal str.replace(' +', ' ') is not a
# regex: it would delete every "+" that follows a space inside the tweet text.
tweets = [json.loads(line) for line in lines]

In [None]:
# Report how many tweets were parsed from the input file.
num_loaded_tweets = len(tweets)
print("Total number of tweets : {}".format(num_loaded_tweets))

Total number of tweets : 117407


In [None]:
# Relevance judgements used for evaluation; the CSV is semicolon-separated.
evaluation_file_path = '/content/drive/MyDrive/RIAW/IRWA_data_2024/data/evaluation.csv'

df_evaluation = pd.read_csv(evaluation_file_path, sep=";")
df_evaluation.head()  # preview the first rows of the judgements table

Unnamed: 0,docId,query_id,label
0,doc_156,1.0,0.0
1,doc_1039,1.0,0.0
2,doc_1047,1.0,1.0
3,doc_1685,1.0,0.0
4,doc_2100,1.0,1.0


## **Bidirectional mapping between tweet IDs and document IDs (used for evaluation)**

In [None]:
# CSV that pairs each document id (column 'docId') with its tweet id (column 'id').
docs_path_map = '/content/drive/MyDrive/RIAW/IRWA_data_2024/data/tweet_document_ids_map.csv'

# NOTE(review): the name `df` is rebound later in this notebook when the
# inverted index is built (it then holds document frequencies).
df = pd.read_csv(docs_path_map)


doc_to_tweet_map = dict(zip(df['docId'], df['id']))  #map doc to tweet id

tweet_to_doc_map = dict(zip(df['id'], df['docId']))  #map tweet_id to doc


# Sanity check: look up one id in each direction.
doc_id = 'doc_0'
tweet_id = doc_to_tweet_map.get(doc_id)
print(f"doc_id {doc_id} have id {tweet_id}")


doc_id_10 = tweet_to_doc_map.get(1364505314586951680)
print(f"doc_id {doc_id_10} have id {1364505314586951680}")

doc_id doc_0 have id 1364506249291784198
doc_id doc_10 have id 1364505314586951680


# **1. SCORINGS**

## **TF-IDF + Cosine Similarity**

### **Propose Test Queries**

In [None]:
# Free-text queries used to exercise the TF-IDF, popularity and BM25 rankers below.
test_queries = ["Who supports indian law",
                "Impact of government policies on farmers"]

### **TF-IDF**

In [None]:
# Resources shared by every build_terms call; filled on first use.
_BUILD_TERMS_CACHE = {}


def _term_resources():
    """Return (stemmer, stop-word set, punctuation table), creating them once."""
    if not _BUILD_TERMS_CACHE:
        # All three values are evaluated before update() runs, so a failure
        # (e.g. missing stop-word corpus) never leaves a half-filled cache.
        _BUILD_TERMS_CACHE.update(
            stemmer=PorterStemmer(),
            stop_words=frozenset(stopwords.words("english")),
            punct_table=str.maketrans('', '', string.punctuation),
        )
    return (
        _BUILD_TERMS_CACHE["stemmer"],
        _BUILD_TERMS_CACHE["stop_words"],
        _BUILD_TERMS_CACHE["punct_table"],
    )


def build_terms(line):
    """
    Turn a raw text (tweet or query) into a list of index terms.

    Steps: remove URLs, lowercase, split on whitespace, then for each token
    either keep it untouched (tokens starting with '#') or strip punctuation,
    drop English stop words and apply Porter stemming.

    Arguments:
    line - the text to process

    Returns:
    list of processed terms, in their original order
    """
    # The stemmer, stop-word set and translation table do not depend on the
    # input, so they are built once instead of on every call / every token.
    stemmer, stop_words, punct_table = _term_resources()

    line = re.sub(r"http\S+|www\S+|https\S+", '', line, flags=re.MULTILINE)  #remove URLs

    processed_tokens = []
    for token in line.lower().split():  #lowercase + tokenize
        if token.startswith('#'):
            processed_tokens.append(token)  #keep hashtags exactly as they are
            continue

        token = token.translate(punct_table)  #remove punctuation

        # The stop-word test runs on the punctuation-free token.
        if token and token not in stop_words:
            processed_tokens.append(stemmer.stem(token))  #stemming

    return processed_tokens

In [None]:
def create_inverted_index_tfidf(tweets, tweet_to_doc_map):
    """
    Implement the inverted index and compute tf, df, and idf.

    Only tweets whose id appears in tweet_to_doc_map are indexed, and each
    document id is indexed at most once (later tweets that map to an already
    indexed document are skipped).

    Arguments:
    tweets - iterable of tweet dicts; 'id' and 'content' are read
    tweet_to_doc_map - dict mapping tweet id -> document id

    Returns:
    index - the inverted index containing terms as keys and, as values, a list of
            (doc_id, positions) pairs for the documents the term appears in
    tf - for each term, the L2-normalized term frequency per document, aligned
         position by position with index[term]
    df - number of documents each term appears in
    idf - inverse document frequency of each term, log(N / df) rounded to 4 decimals
    """

    index = defaultdict(list)
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)

    num_documents = len(set(tweet_to_doc_map.values()))

    # Documents already added to the index. A set makes the duplicate check
    # constant-time; testing `doc_id in index[term]` never matches because the
    # posting lists hold (doc_id, positions) tuples.
    indexed_docs = set()

    for tweet in tweets:
        doc_id = tweet_to_doc_map.get(tweet['id'])
        if doc_id is None or doc_id in indexed_docs:
            continue
        indexed_docs.add(doc_id)

        # Positions of every term inside the current document
        positions_by_term = defaultdict(list)
        for position, term in enumerate(build_terms(tweet['content'])):
            positions_by_term[term].append(position)

        # Euclidean norm of the raw term counts, used to normalize tf.
        # (When the document has no terms the loop below does not run, so the
        # zero norm is never used as a divisor.)
        norm = math.sqrt(sum(len(positions) ** 2 for positions in positions_by_term.values()))

        for term, positions in positions_by_term.items():
            index[term].append((doc_id, positions))
            tf[term].append(np.round(len(positions) / norm, 4))
            df[term] += 1

    # Compute IDF
    for term, doc_freq in df.items():
        idf[term] = np.round(np.log(float(num_documents) / doc_freq), 4)

    return index, tf, df, idf

In [None]:
from collections import defaultdict, Counter
import numpy.linalg as la

def rank_documents(query, docs, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights.

    Every document in `docs` that contains at least one query term is scored.

    Arguments:
    query - the query text (processed with build_terms)
    docs - collection of document ids eligible for ranking
    index - inverted index: term -> list of (doc_id, positions)
    idf - inverse document frequency per term
    tf - normalized term frequencies per term, aligned with index[term]

    Returns:
    (ranked_docs, doc_scores) where ranked_docs is the list of document ids in
    descending score order and doc_scores is the matching list of
    [score, doc_id] pairs in the same order.

    NOTE(review): the score is dot(doc_vec, query_vec) / |doc_vec|, with the
    document vector restricted to the query terms. It is not divided by the
    query vector norm, so values may exceed 1.
    """

    # Process query into terms and initialize vectors
    query_terms = build_terms(query)
    query_vector = [0] * len(query_terms)
    doc_vectors = defaultdict(lambda: [0] * len(query_terms))

    # Membership is tested once per posting; a set avoids scanning a list each time.
    candidate_docs = docs if isinstance(docs, (set, frozenset, dict)) else set(docs)

    # Count term frequency in query and normalize
    query_terms_count = Counter(query_terms)
    query_norm = la.norm([query_terms_count[term] for term in query_terms])

    # Build the query vector and each document vector
    for term_idx, term in enumerate(query_terms):
        if term not in index:
            continue

        # Normalized query term frequency weighted by idf
        query_vector[term_idx] = (query_terms_count[term] / query_norm) * idf.get(term, 0)

        # tf[term] is aligned with index[term], so the posting position selects the tf value
        for doc_idx, (doc, postings) in enumerate(index[term]):
            if doc in candidate_docs:
                tf_norm = tf[term][doc_idx]  # Already normalized in tf calculation
                doc_vectors[doc][term_idx] = tf_norm * idf[term]

    # Calculate normalized scores for each document
    doc_scores = []
    for doc, doc_vec in doc_vectors.items():
        doc_norm = la.norm(doc_vec)
        if doc_norm != 0:
            normalized_score = np.dot(doc_vec, query_vector) / doc_norm  # Normalize with document norm
            doc_scores.append([normalized_score, doc])

    # Sort documents by their relevance scores in descending order
    doc_scores.sort(reverse=True, key=lambda x: x[0])

    # Return ordered documents and their scores
    return [doc for _, doc in doc_scores], doc_scores




In [None]:
# Build the inverted index plus the tf / df / idf tables for the mapped tweets.
# NOTE(review): this rebinds `df`, which until now held the id-mapping DataFrame.
inv_index_tfidf, tf, df, idf = create_inverted_index_tfidf(tweets, tweet_to_doc_map)


In [None]:
# Every distinct document id that appears in at least one posting list of the index.
docs = list({doc for postings in inv_index_tfidf.values() for doc, _ in postings})

### **Ranking with Tf-idf + Cosine Similarity**

In [None]:
# Iterate over each test query
# Map tweet id -> content once so each printed row is a dictionary lookup instead
# of a scan over every tweet. reversed() keeps the first tweet when an id repeats.
tweet_content_by_id = {tweet['id']: tweet['content'] for tweet in reversed(tweets)}

# Iterate over each test query
for query in test_queries:
    # Call the rank_documents function to get the ranked documents and their scores
    results, scores = rank_documents(query, docs, inv_index_tfidf, idf, tf)

    # Print the full rankings
    print(f"Ranking for query '{query}':\n")

    # `scores` holds [score, doc_id] pairs in the same order as `results`, so the
    # top 20 rows can be read from it directly without searching for each score.
    for score, result in scores[:20]:
        tweet_id = doc_to_tweet_map.get(result)
        tweet_content = tweet_content_by_id.get(tweet_id, "No content available")

        print(f"Doc ID: {result}, Tweet ID: {tweet_id}, Score: {score}")
        print(f"Content: {tweet_content}\n")

    print("\n")


Ranking for query 'Who supports indian law':

Doc ID: doc_9681, Tweet ID: 1363444730474885128, Score: 2.7320568625122
Content: @MEAIndia @DrSJaishankar Will you also be shooting off an angry letter to the 87 America farmers union supporting Indian's farmers in their struggle against the corporate-benefiting black laws?

https://t.co/atc5QCaUNH

#FarmersProtest #MSPLawForAllCrops

Doc ID: doc_10497, Tweet ID: 1363348220445925378, Score: 2.7320568625122
Content: 87 farmer organisations and food justice groups in the United States delivered a solidarity statement in support of Indian farmers' protests against the new farm laws.

#FarmersProtest https://t.co/PYklCc7xUI

Doc ID: doc_3390, Tweet ID: 1364130568808595456, Score: 2.7320568625121995
Content: We can be proud of Ajit Singh Ji and the #FarmersProtest in 1907 whilst also supporting a peaceful and pragmatic end to the current protest. 

Remember, the Indian Supreme Court has already intervened:
- Laws on hold
- Offer to mediate 

#Pa

## **Ranking with Our Score + Cosine Similarity**

In [None]:
#tweets[0]

In [None]:
def _is_verified(tweet):
    """Return True when the tweet, or its author, carries a truthy 'verified' flag."""
    flags = [tweet.get('verified')]
    user = tweet.get('user')
    # NOTE(review): assumes the account flag may live under tweet['user']['verified'];
    # confirm against the dataset schema.
    if isinstance(user, dict):
        flags.append(user.get('verified'))
    # JSON booleans load as True/False; the string 'True' is still accepted.
    return any(flag is True or flag == 'True' for flag in flags)


def calculate_normalized_popularity(tweets, weight_likes=1.0, weight_retweets=1.5, weight_comments=0.5, weight_quotes=0.5, weight_followers_count=0.0001, verified_bonus=10000, scale=4):
    """
    Compute a popularity score per tweet and rescale it to the range [0, scale].

    The raw score is a weighted sum of likes, retweets, replies, quotes and the
    author's follower count, plus `verified_bonus` for verified tweets/authors.

    Arguments:
    tweets - iterable of tweet dicts; reads 'id', 'likeCount', 'retweetCount',
             'replyCount', 'quoteCount' and tweet['user']['followersCount']
    weight_* - weight of each engagement signal in the raw score
    verified_bonus - amount added to the raw score of verified tweets
    scale - value assigned to the most popular tweet after normalization

    Returns:
    dict mapping tweet id -> normalized popularity (0 for every tweet when the
    maximum raw score is not positive)
    """
    raw_popularity_scores = {}
    max_popularity = 0

    for tweet in tweets:
        popularity_score = (
            weight_likes * tweet['likeCount'] +
            weight_retweets * tweet['retweetCount'] +
            weight_comments * tweet['replyCount'] +
            weight_quotes * tweet['quoteCount'] +
            weight_followers_count * tweet['user']['followersCount']
        )

        if _is_verified(tweet):
            popularity_score += verified_bonus

        raw_popularity_scores[tweet['id']] = popularity_score
        max_popularity = max(max_popularity, popularity_score)  # largest raw score, used to normalize

    # Normalize by the maximum; avoid dividing by zero when nothing has any popularity
    if max_popularity <= 0:
        return {tweet_id: 0 for tweet_id in raw_popularity_scores}

    return {
        tweet_id: (raw_score / max_popularity) * scale
        for tweet_id, raw_score in raw_popularity_scores.items()
    }



In [None]:
# Step 2: Update calculate_combined_scores to use precomputed popularity scores
def calculate_combined_scores(query, tweets, inv_index_tfidf, idf, tf, docs, normalized_popularity_scores, popularity_weight=0.4, similarity_weight=0.6):
    """
    Rank tweets by a weighted mix of precomputed popularity and tf-idf relevance.

    Arguments:
    query - the query text
    tweets - kept for interface compatibility; not read by this function
    inv_index_tfidf, idf, tf, docs - forwarded unchanged to rank_documents
    normalized_popularity_scores - dict tweet id -> normalized popularity
    popularity_weight - weight of the popularity score in the combination
    similarity_weight - weight of the tf-idf score in the combination

    Returns:
    list of (tweet_id, combined_score, tfidf_score) tuples sorted by
    combined_score in descending order

    Document ids are translated to tweet ids through the module-level
    doc_to_tweet_map; documents without a mapping are skipped.
    """
    # rank_documents returns [score, doc_id] pairs, one per scored document
    _, scores = rank_documents(query, docs, inv_index_tfidf, idf, tf)

    combined_scores = []
    for cosine_similarity_score, doc in scores:
        tweet_id = doc_to_tweet_map.get(doc)
        if tweet_id is None:
            continue

        # Retrieve normalized popularity score (0 when the tweet has none)
        normalized_popularity = normalized_popularity_scores.get(tweet_id, 0)

        # Weighted combination: 0.4 popularity / 0.6 relevance by default
        combined_score = popularity_weight * normalized_popularity + similarity_weight * cosine_similarity_score
        combined_scores.append((tweet_id, combined_score, cosine_similarity_score))

    # Sort tweets by combined score in descending order
    combined_scores.sort(reverse=True, key=lambda x: x[1])

    return combined_scores


In [None]:
#compute normalized popularity scores for all tweets
# Popularity depends only on the tweets, not on the query, so it is computed once
# here and reused for every query.
normalized_popularity_scores = calculate_normalized_popularity(tweets)

In [None]:
#normalized_popularity_scores

In [None]:
# Index tweets by id once so each printed row is a dictionary lookup instead of a
# scan over every tweet. reversed() keeps the first tweet when an id repeats.
tweet_by_id = {tweet['id']: tweet for tweet in reversed(tweets)}

for query in test_queries:

    ranked_tweets = calculate_combined_scores(query, tweets, inv_index_tfidf, idf, tf, docs, normalized_popularity_scores)

    # Display the top 20 ranked tweets for each query
    print(f"Top tweets for query '{query}':\n")
    for i, (tweet_id, score, cosine_score) in enumerate(ranked_tweets[:20], start=1):

        print("\n")
        print(i)
        tweet = tweet_by_id[tweet_id]

        print(f"Tweet ID: {tweet_id}, Combined Score: {score}, Popularity Score: {normalized_popularity_scores[tweet_id]}, TF-IDF Cosine Score: {cosine_score}")
        print(f"Content: {tweet['content']}")
        print(f"Likes: {tweet['likeCount']}, Retweets: {tweet['retweetCount']}, Replies: {tweet['replyCount']}, Quotes: {tweet['quoteCount']}, Followers: {tweet['user']['followersCount']}")

        # Tweet-level flag first; otherwise fall back to the author's flag.
        # NOTE(review): assumes the flag may live under tweet['user']['verified'] — confirm.
        verified = tweet.get('verified', tweet.get('user', {}).get('verified', False))
        print(f"Verified: {verified}\n")


Top tweets for query 'Who supports indian law':



1
Tweet ID: 1361724205709066247, Combined Score: 1.6411360453961839, Popularity Score: 0.004754819722160552, TF-IDF Cosine Score: 2.7320568625121995
Content: Indian farmers are protesting new laws that will allow corporations to drive down the prices at which farmers sell produce. In return, they’re facing severe govt repression
@mujerxsrising just put out a video in support of the #FarmersProtest—please watch:
https://t.co/PWrkVKY4Nc
Likes: 19, Retweets: 15, Replies: 2, Quotes: 1, Followers: 20157
Verified: False



2
Tweet ID: 1364110026361745411, Combined Score: 1.6397202666299686, Popularity Score: 0.0012153728066223155, TF-IDF Cosine Score: 2.7320568625121995
Content: Just curious... how much money does the Indian govt have to keep paying people to show up in support of BJP? How many "farmers" have you already paid to say farm laws are good? Didnt know India had a budget for this🤡🤡 #FarmersProtest #Pagdi_Sambhal_Jatta #FarmersProt

## **Ranking with BM25**

In [None]:
# BM25 free parameters: k1 controls term-frequency saturation, b the strength of
# document-length normalization.
k1 = 1.2
b = 0.75


def _document_lengths(index):
    """Return {doc_id: token count}, summing the position lists of the inverted index."""
    lengths = defaultdict(int)
    for postings in index.values():
        for doc, positions in postings:
            lengths[doc] += len(positions)
    return lengths


def calculate_avg_doc_length(tf, index=None):
    """
    Compute the average document length.

    Parameters:
    - tf: dictionary term -> list of normalized term frequencies.
    - index: optional inverted index term -> list of (doc_id, positions). When
      given, the result is the mean number of indexed tokens per document,
      which is the quantity BM25 expects.

    Returns:
    - The average length as a float (0.0 when there is nothing to average).

    NOTE: without `index` the previous computation is kept for backward
    compatibility. It sums normalized frequencies grouped by their position in
    each posting list, which is not a per-document token count; pass `index`
    to obtain a real average length.
    """
    if index is not None:
        lengths = _document_lengths(index)
        return sum(lengths.values()) / len(lengths) if lengths else 0.0

    doc_lengths = defaultdict(int)
    for postings in tf.values():
        for i, freq in enumerate(postings):
            doc_lengths[i] += freq
    if not doc_lengths:
        return 0.0
    return sum(doc_lengths.values()) / len(doc_lengths)


def bm25_score(query, docs, index, idf, tf, avg_doc_length, doc_lengths=None):
    """
    Compute the BM25 ranking for a given query.

    Parameters:
    - query: The query string.
    - docs: Collection of document ids to consider.
    - index: Inverted index mapping terms to (doc_id, positions) postings.
    - idf: Dictionary with the IDF values.
    - tf: Dictionary with normalized term frequencies; kept for interface
      compatibility, the raw counts are read from the index positions instead.
    - avg_doc_length: Average document length of the collection, in tokens
      (e.g. calculate_avg_doc_length(tf, index)).
    - doc_lengths: Optional precomputed {doc_id: token count}; derived from
      `index` when omitted.

    Returns:
    - List of (doc_id, score) pairs sorted by BM25 score, highest first.
    """
    query_terms = build_terms(query)
    doc_scores = defaultdict(float)

    # Membership is tested once per posting; a set avoids scanning a list each time.
    candidate_docs = docs if isinstance(docs, (set, frozenset, dict)) else set(docs)

    if doc_lengths is None:
        doc_lengths = _document_lengths(index)

    for term in query_terms:
        if term not in index:
            continue
        idf_term = idf.get(term, 0)
        for doc, positions in index[term]:
            if doc not in candidate_docs:
                continue
            # Raw term frequency = number of positions of the term in the document
            f_q_d = len(positions)
            # Document length relative to the collection average (neutral when unknown)
            length_ratio = doc_lengths.get(doc, 0) / avg_doc_length if avg_doc_length else 1.0
            denominator = f_q_d + k1 * (1 - b + b * length_ratio)
            doc_scores[doc] += idf_term * (f_q_d * (k1 + 1)) / denominator

    # Sort the documents by score in descending order
    return sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Calculate the average document length
# Calculate the average document length in tokens. Each posting stores the
# positions of a term inside a document, so summing the position counts per
# document id gives that document's length.
doc_token_counts = defaultdict(int)
for postings in inv_index_tfidf.values():
    for doc, positions in postings:
        doc_token_counts[doc] += len(positions)
avg_doc_length = sum(doc_token_counts.values()) / len(doc_token_counts) if doc_token_counts else 0.0

# Map tweet id -> content once so each printed row is a dictionary lookup instead
# of a scan over every tweet. reversed() keeps the first tweet when an id repeats.
tweet_content_by_id = {tweet['id']: tweet['content'] for tweet in reversed(tweets)}

# Iterate over each test query for BM25
for query in test_queries:
    bm25_results = bm25_score(query, docs, inv_index_tfidf, idf, tf, avg_doc_length)

    print(f"Ranking for query '{query}':\n")

    for doc, score in bm25_results[:20]:  # Iterate over the top 20 results
        tweet_id = doc_to_tweet_map.get(doc, "Unknown")  # Get the tweet_id corresponding to the doc_id
        tweet_content = tweet_content_by_id.get(tweet_id, "No content available")

        print(f"Doc ID: {doc}, Tweet ID: {tweet_id}, Score: {score}")
        print(f"Content: {tweet_content}\n")

    print("\n")


Ranking for query 'Who supports indian law':

Doc ID: doc_6913, Tweet ID: 1363727092332957697, Score: 5.825954848828388
Content: San Francisco supporting the Indian Farmers Protest, Indian govt should take back the 3 laws @amarg408 @ANI #ModiIgnoringFarmersDeath #farmersprotest https://t.co/vr88iZ8qSQ

Doc ID: doc_13857, Tweet ID: 1362992216130670593, Score: 5.722439716321816
Content: #DPstopIntimidatingFarmers

Intimidation is the Indian government unwritten #toolkit circulated to DP against the law of this land and #StandWithDishaRavi who is another victim of supporting #FarmersProtest

Doc ID: doc_26755, Tweet ID: 1361832047342419970, Score: 5.71876224203535
Content: Support indian #FarmersProtest https://t.co/lPIFKieDGd

Doc ID: doc_1862, Tweet ID: 1364288034888437761, Score: 5.318049805675303
Content: We are in support of Indian Farmers #FarmersProtest https://t.co/EL68HPR8wa

Doc ID: doc_2022, Tweet ID: 1364269631544963074, Score: 5.318049805675303
Content: We support Indian farm

## **Similarity between rankings of TF-IDF and BM25**

In [None]:
# Compare the top 20 results of each query for TF-IDF and BM25 and identify common documents
# For every query, report which documents appear in the top 20 of both rankers.
for query in test_queries:
    # Rank with both models
    tfidf_results, _ = rank_documents(query, docs, inv_index_tfidf, idf, tf)
    bm25_results = bm25_score(query, docs, inv_index_tfidf, idf, tf, avg_doc_length)

    # Top-20 document ids of each model, as sets
    tfidf_docs = {ranked_doc for ranked_doc in tfidf_results[:20]}
    bm25_docs = {ranked_doc for ranked_doc, _ in bm25_results[:20]}

    # Documents shared by both top-20 lists
    common_docs = tfidf_docs & bm25_docs

    # Print comparison results
    print(f"Query '{query}':")
    print(f"Number of common documents: {len(common_docs)}")
    if not common_docs:
        print("No common documents found.")
    else:
        print("Common document IDs:")
        for doc_id in common_docs:
            print(f"Doc ID: {doc_id}, Tweet Content: {next((tweet['content'] for tweet in tweets if tweet['id'] == doc_to_tweet_map.get(doc_id)), 'No content available')}")
    print("\n")


Query 'Who supports indian law':
Number of common documents: 4
Common document IDs:
Doc ID: doc_42274, Tweet Content: In support of ongoing farmers protest against black laws imposed by indian government, we are meeting at Sandringham park to share our thoughts and there will be aerial display in the Auckland sky with slogan 
support indian farmer #FarmersProtest 
from 2.30pm to 4.30pm
Doc ID: doc_10497, Tweet Content: 87 farmer organisations and food justice groups in the United States delivered a solidarity statement in support of Indian farmers' protests against the new farm laws.

#FarmersProtest https://t.co/PYklCc7xUI
Doc ID: doc_9681, Tweet Content: @MEAIndia @DrSJaishankar Will you also be shooting off an angry letter to the 87 America farmers union supporting Indian's farmers in their struggle against the corporate-benefiting black laws?

https://t.co/atc5QCaUNH

#FarmersProtest #MSPLawForAllCrops
Doc ID: doc_13857, Tweet Content: #DPstopIntimidatingFarmers

Intimidation is th

# **2. WORD-2-VEC + COSINE SIMILARITY**

In [None]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
#now we are using 5 queries
# The Word2Vec experiments use five test queries: the two used above plus three new ones.
test_queries = ["Who supports indian law",
                "Impact of government policies on farmers",
                "Who is breaking in farmers' protest? ",
                "what is the real issue? ",
                "who is against indian farmers"]

## **Word2Vec model**

In [None]:
# Tokenize each tweet's content
# Tokenize each tweet's content with the same preprocessing used for indexing
tweet_tokens = [build_terms(tweet['content']) for tweet in tweets]

# We create the Word2Vec model on tokenized tweets
# (300-dimensional vectors, context window of 5, every token kept via min_count=1)
model = Word2Vec(sentences=tweet_tokens, vector_size=300, window=5, min_count=1, workers=4)

## **Tweet2Vec**

In [None]:
#Function to create the vector representation of a tweet
def tweet2vec(tweet, model):
    """
    Represent a text as the average of the Word2Vec vectors of its terms.

    The text is processed with build_terms; terms missing from the model
    vocabulary are ignored. When no term is known, a zero vector of size
    model.vector_size is returned.
    """
    embeddings = model.wv
    known_vectors = []
    for term in build_terms(tweet):
        if term in embeddings:
            known_vectors.append(embeddings[term])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
# Precompute one averaged Word2Vec vector per tweet, keyed by tweet id.
tweetVec = {tweet['id']: tweet2vec(tweet['content'], model) for tweet in tweets}

## **Ranking with Cosine Similarity**

In [None]:
def rank_documents_word2vec(query, tweetVec, model, top_k=20):
    """
    Rank tweets by cosine similarity between their vector and the query vector.

    Arguments:
    query - the query text
    tweetVec - dict mapping tweet id -> tweet vector
    model - Word2Vec model used to embed the query
    top_k - number of results to return

    Returns:
    list of (score, tweet_id) pairs, highest score first, at most top_k long
    """
    if not tweetVec:
        return []

    query_vector = tweet2vec(query, model)  # To represent the query as a vector

    # Stack all tweet vectors and compute every similarity in a single call,
    # instead of invoking cosine_similarity once per tweet.
    tweet_ids = list(tweetVec)
    doc_matrix = np.asarray([tweetVec[tweet_id] for tweet_id in tweet_ids], dtype=query_vector.dtype)
    scores = cosine_similarity([query_vector], doc_matrix)[0]

    # Sort by similarity score in descending order and return the top_k results
    similarities = list(zip(scores, tweet_ids))
    similarities.sort(reverse=True, key=lambda x: x[0])
    return similarities[:top_k]

In [None]:
# Iterate over each query and print the top 20 ranked documents with scores, document IDs, and content using Word2Vec
# Map tweet id -> content once so each printed row is a dictionary lookup instead
# of a scan over every tweet. reversed() keeps the first tweet when an id repeats.
tweet_content_by_id = {tweet['id']: tweet['content'] for tweet in reversed(tweets)}

# Iterate over each query and print the top 20 ranked documents with scores, document IDs, and content using Word2Vec
for query in test_queries:
    ranked_tweets = rank_documents_word2vec(query, tweetVec, model, top_k=20)

    print(f"Top 20 documents for query '{query}':\n")
    for rank, (score, tweet_id) in enumerate(ranked_tweets, start=1):
        # tweet_to_doc_map already maps tweet id -> doc id, so no reverse search is needed
        doc_id = tweet_to_doc_map.get(tweet_id, "Unknown")
        tweet_content = tweet_content_by_id.get(tweet_id, "No content available")

        print(f"Rank: {rank}")
        print(f"Doc ID: {doc_id}, Tweet ID: {tweet_id}, Score: {score}")
        print(f"Content: {tweet_content}\n")


Top 20 documents for query 'Who supports indian law':

Rank: 1
Doc ID: doc_6913, Tweet ID: 1363727092332957697, Score: 0.9671066999435425
Content: San Francisco supporting the Indian Farmers Protest, Indian govt should take back the 3 laws @amarg408 @ANI #ModiIgnoringFarmersDeath #farmersprotest https://t.co/vr88iZ8qSQ

Rank: 2
Doc ID: doc_42274, Tweet ID: 1360532532446392320, Score: 0.9629135131835938
Content: In support of ongoing farmers protest against black laws imposed by indian government, we are meeting at Sandringham park to share our thoughts and there will be aerial display in the Auckland sky with slogan 
support indian farmer #FarmersProtest 
from 2.30pm to 4.30pm

Rank: 3
Doc ID: doc_47424, Tweet ID: 1360077710517694465, Score: 0.9624799489974976
Content: The Indian government must immediately stop its escalating crackdown on protesters, farming leaders and journalists, amid ongoing nationwide demonstrations against three recently introduced farm laws..

#MahapanchayatRev