# CSI 4107: Assignment 1

#### Phung, Quoc Dat (300164087)
#### Slimane-Kadi, Rami
#### Barry, Ousmane

In [1]:
# for loading data
import json
import pandas as pd

# Natural Language Toolkit for text processing
# Source: https://www.nltk.org/
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Step 1

<b>Preprocessing</b>:  Implement preprocessing functions for tokenization and stopword removal. The index terms will be all the words left after filtering out markup that is not part of the text, punctuation tokens, numbers, stopwords, etc. Optionally, you can use the Porter stemmer to stem the index words. </br>

•       Input: Documents that are read one by one from the collection</br>
•       Output: Tokens to be added to the index (vocabulary)</br>

In [2]:
# for removing common words
nltk.download('stopwords')

# for splitting text into words/sentences
nltk.download('punkt')
nltk.download('punkt_tab')

# after running this the output should be "true" (this means run successful)

[nltk_data] Downloading package stopwords to C:\Users\Ousmane
[nltk_data]     Barry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Ousmane
[nltk_data]     Barry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Ousmane
[nltk_data]     Barry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# loads a predefined list of common stopwords (like "the", "is", "and", "in")
# step 1 instruction says we can use Porter Stemmer (reduces words to their root form, e.g jumping -> jump)
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
print(stop_words)
print(len(stop_words))

{'now', 'which', "mustn't", 'his', 'you', 'theirs', 'through', 'd', 'there', 'shan', 'won', 'has', 'not', 'myself', 'when', "don't", 'after', 'any', 'm', "you'll", 'mightn', 'the', "won't", 'between', 'needn', 'an', 'until', 'than', 'same', 'having', "wasn't", 't', 'isn', 'more', 'their', 'whom', 'it', "haven't", 'should', 'himself', 'aren', 'doesn', 'and', "wouldn't", 'further', 'she', "didn't", 'we', 'too', 'themselves', 'they', 'o', 'some', 'been', 'him', "mightn't", "it's", 'yourselves', 'this', 'do', 'down', 'ma', 'for', 'being', "hadn't", 'am', 'doing', 'during', "you're", 'ourselves', 'here', 'few', 'nor', 'while', 'because', 'hasn', 'wouldn', 'your', 'be', 'a', 'itself', 'did', 'about', 'those', 'on', 'hers', 'but', "should've", 'that', 'shouldn', 'mustn', 'with', 'above', 'them', 'under', 'y', 'by', 'does', "hasn't", "isn't", "shouldn't", 'only', 'in', 'these', 'if', 'of', 'out', 'off', 'haven', 'again', 'or', 'ain', "couldn't", 'each', 'so', 'own', 'at', 'against', 'he', 'our

In [4]:
# professor said to also add the stopwords from https://www.site.uottawa.ca/~diana/csi4107/StopWords
# add it for a comprehensive list
# I downloaded the file into "stopwords.txt"
def load_stopword_file(filepath):
    words = set()
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            word = line.strip().lower()
            # Ignore empty lines
            if word:  
                words.add(word)
    return stop_words

In [5]:
stop_words_from_file = load_stopword_file("from_professor/stopwords.txt")
stop_words.update(stop_words_from_file)
print(stop_words)
print(len(stop_words))

{'now', 'which', "mustn't", 'his', 'you', 'theirs', 'through', 'd', 'there', 'shan', 'won', 'has', 'not', 'myself', 'when', "don't", 'after', 'any', 'm', "you'll", 'mightn', 'the', "won't", 'between', 'needn', 'an', 'until', 'than', 'same', 'having', "wasn't", 't', 'isn', 'more', 'their', 'whom', 'it', "haven't", 'should', 'himself', 'aren', 'doesn', 'and', "wouldn't", 'further', 'she', "didn't", 'we', 'too', 'themselves', 'they', 'o', 'some', 'been', 'him', "mightn't", "it's", 'yourselves', 'this', 'do', 'down', 'ma', 'for', 'being', "hadn't", 'am', 'doing', 'during', "you're", 'ourselves', 'here', 'few', 'nor', 'while', 'because', 'hasn', 'wouldn', 'your', 'be', 'a', 'itself', 'did', 'about', 'those', 'on', 'hers', 'but', "should've", 'that', 'shouldn', 'mustn', 'with', 'above', 'them', 'under', 'y', 'by', 'does', "hasn't", "isn't", "shouldn't", 'only', 'in', 'these', 'if', 'of', 'out', 'off', 'haven', 'again', 'or', 'ain', "couldn't", 'each', 'so', 'own', 'at', 'against', 'he', 'our

In [6]:
# we need this function because the instruction asks us to remove numbers in preprocessing
def is_number(s):
    try:
        # try converting to float
        float(s)  
        return True
    except ValueError:
        return False

In [7]:
def preprocessing(text):
    # 1. Tokenization: Split text into words
    # e.g input "I love reading books in 2025!" -> output ["I", "love", "reading", "books", "in" "2025", "!"]
    tokens = word_tokenize(text)  

    # 2. Lowercase
    # ["I", "love", "reading", "books", "!"] ->  ["i", "love", "reading", "books", "in", "2025", "!"]
    for i in range(len(tokens)): 
        tokens[i] = tokens[i].strip()
        tokens[i] = tokens[i].lower()
    
    # 3. Remove punctuation (and empty strings):
    # e.g ["i", "love", "reading", "books", "!"] -> ["i", "love", "reading", "books", "in", "2025"]
    no_punctuation_tokens = []
    for word in tokens:
        if len(word) > 0 and word.isalpha():
            no_punctuation_tokens.append(word)

    # 4. Remove stopwords
    # ["i", "love", "reading", "books"] -> ["love", "reading", "books", "2025"]
    no_stopwords_tokens = []
    for word in no_punctuation_tokens:
        if word not in stop_words:
            no_stopwords_tokens.append(word)

    # 5. instruction says to remove numbers as well
    # ["love", "reading", "books", "2025"] -> ["love", "reading", "books"]
    text_tokens = []
    for word in no_stopwords_tokens:
        if not is_number(word):
            text_tokens.append(word)

    # 6. Stemming
    # e.g ["love", "reading", "books"] -> ["love", "read", "book"]
    stemmed_tokens = []
    for word in text_tokens:
        stemmed_word = stemmer.stem(word)
        stemmed_tokens.append(stemmed_word)

    return stemmed_tokens

In [8]:
# Test case:
print(preprocessing("I love reading books in 2025! ''   "))

['love', 'read', 'book']


### Loading Files

In [9]:
# File paths
corpus_file = "scifact/corpus.jsonl"
queries_file = "scifact/queries.jsonl"
test_file = "scifact/qrels/test.tsv"

# columns from each file:
# corpus.jsonl: _id, title, text, metadata
corpus_columns = ["_id", "title", "text", "metadata"]

# queries.jsonl: _id, text, metadata
queries_columns = ["_id", "text", "metadata"]

# test.tsv: query-id, corpus-id, score
test_columns = ["query-id", "corpus-id", "score"]

In [10]:
def load_jsonl_file(file_path, columns):
    "load jsonl file then return dataframe"
    data = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            data.append(json.loads(line))
    return pd.DataFrame(data, columns=columns)

In [11]:
df_corpus = load_jsonl_file(corpus_file, corpus_columns)
df_corpus.head()

Unnamed: 0,_id,title,text,metadata
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,{}
1,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,{}
2,7912,"BC1 RNA, the transcript from a master gene for...",ID elements are short interspersed elements (S...,{}
3,18670,The DNA Methylome of Human Peripheral Blood Mo...,DNA methylation plays an important role in bio...,{}
4,19238,The human myelin basic protein gene is include...,Two human Golli (for gene expressed in the oli...,{}


In [12]:
df_queries = load_jsonl_file(queries_file, queries_columns)
df_queries.head()

Unnamed: 0,_id,text,metadata
0,0,0-dimensional biomaterials lack inductive prop...,{}
1,2,1 in 5 million in UK have abnormal PrP positiv...,"{'13734012': [{'sentences': [4], 'label': 'CON..."
2,4,1-1% of colorectal cancer patients are diagnos...,{}
3,6,10% of sudden infant death syndrome (SIDS) dea...,{}
4,9,32% of liver transplantation programs required...,"{'44265107': [{'sentences': [15], 'label': 'SU..."


In [13]:
def load_tsv_file(file_path, columns):
    "Load TSV file then return df"
    return pd.read_csv(file_path, sep="\t", names=columns)

In [14]:
df_test = load_tsv_file(test_file, test_columns)
df_test.head()

Unnamed: 0,query-id,corpus-id,score
0,query-id,corpus-id,score
1,1,31715818,1
2,3,14717500,1
3,5,13734012,1
4,13,1606628,1


### Preprocessing the Corpus

In [15]:
processed_texts = []
for i in range(len(df_corpus)):
    # title is important for query searching which is why we combine both the title and the text
    combined_text = df_corpus["title"][i] + " " + df_corpus["text"][i]
    processed_texts.append(preprocessing(combined_text))

# add this as a new column
df_corpus["processed_text"] = processed_texts

In [16]:
df_corpus.head()

Unnamed: 0,_id,title,text,metadata,processed_text
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,{},"[microstructur, develop, human, newborn, cereb..."
1,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,{},"[induct, myelodysplasia, suppressor, cell, mye..."
2,7912,"BC1 RNA, the transcript from a master gene for...",ID elements are short interspersed elements (S...,{},"[rna, transcript, master, gene, id, element, a..."
3,18670,The DNA Methylome of Human Peripheral Blood Mo...,DNA methylation plays an important role in bio...,{},"[dna, methylom, human, peripher, blood, mononu..."
4,19238,The human myelin basic protein gene is include...,Two human Golli (for gene expressed in the oli...,{},"[human, myelin, basic, protein, gene, includ, ..."


# Step 2

[10 points] Indexing: Build an inverted index, with an entry for each word in the vocabulary. </br>
You can use any appropriate data structure (hash table, linked lists, Access database, etc.). </br>
An example of possible index is presented below. Note: if you use an existing IR system, use its indexing mechanism. </br>
•       Input: Tokens obtained from the preprocessing module </br>
•       Output: An inverted index for fast access </br>

Example of inverted index   </br>
Suppose after preprocessing, </br>
doc 1: ["love", "read", "book"] </br>
doc 2: ["book", "write", "author"] </br>

Inverted index: </br>
{ </br>
    "love": {1: 1},          # "love" appears in Doc 1 (count = 1) </br>
    "read": {1: 1},          # "read" appears in Doc 1 (count = 1) </br>
    "book": {1: 1, 2: 1},    # "book" appears in Doc 1 and Doc 2 (count = 1 each) </br>
    "write": {2: 1},         # "write" appears in Doc 2 (count = 1) </br>
    "author": {2: 1}         # "author" appears in Doc 2 (count = 1) </br>
} </br>

In [17]:
import copy
import math

def build_inverted_index_from_corpus(df_corpus):
    # note that df_corpus must be processed already
    inverted_index = {} 
    
    for i in range(len(df_corpus)):
        doc_id = df_corpus["_id"][i]  
        words = df_corpus["processed_text"][i]  
        
        # loop through words
        for word in words:
            
            # if the word is new, then create an entry doc_id with count 1
            if word not in inverted_index:
                inverted_index[word] = {doc_id: 1}  
                
            # if word isn't new
            else:
                # if the word exists but doc_id is new
                if doc_id not in inverted_index[word]:
                    inverted_index[word][doc_id] = 1  
                    
                # if both word and doc_id exist, simply increment count
                else:
                    inverted_index[word][doc_id] += 1

    
    #Will contain ALL documents in corpus as keys and point to a dictionary of ALL terms in the document with their respective weights 
    #EX: {doc1: {'love':2.131232, 'read':1.35443, 'wolf':4.2745}, doc2:{'love':2.131232, 'hate':7.255}} (almost like the opposite of weighted inverted index) 
    document_vectors={}

    #Copying inverted index to weighted_inverted_index
    weighted_inverted_index = copy.deepcopy(inverted_index)

    #Will contain the highest number/count/frequency for all terms found per document
    max_frequencies = {}

    for _, doc_dict in weighted_inverted_index.items():
        for doc, count in doc_dict.items():
            if doc not in max_frequencies:
                max_frequencies[doc] = count
            else:
                max_frequencies[doc] = max(max_frequencies[doc], count)
    

    
    #N in for formula of log(N/df) (N represents total number of documents in corpus)
    sizeOfCorpus=len(df_corpus)
            
    #this part transforms the inverted_index with raw counts to an inverted index with weights
    for token in weighted_inverted_index:

        #length of the tokens value which is a dictionnary (represents number of documents token is found in)
        numOfDocsTokensAppearIn = len(weighted_inverted_index[token])
        
        if numOfDocsTokensAppearIn == 0:
            continue 
        
        #looping over the documents the token is found in and replacing raw counts to weight
        for doc in weighted_inverted_index[token]:

            tf = weighted_inverted_index[token][doc]/max_frequencies[doc]
            idf = math.log((sizeOfCorpus/numOfDocsTokensAppearIn),2)
            weight=tf*idf
            
            weighted_inverted_index[token][doc]=weight

            if doc not in document_vectors:  
                document_vectors[doc]={token:weight}

            else:
                document_vectors[doc].update({token: weight})

    #returns the weighted inverted index AND the document vectors as a tuple
    return (weighted_inverted_index, document_vectors)

In [18]:
weighted_inverted_index, document_vectors = build_inverted_index_from_corpus(df_corpus)

In [19]:
# check inverted_index
import pprint
pprint.pprint(dict(list(weighted_inverted_index.items())[:2])) # display 2 items

{'develop': {'10009203': 0.8622938718443394,
             '10068634': 0.18291082130031444,
             '10165258': 0.25150237928793234,
             '10165723': 0.2874312906147798,
             '10190778': 1.2072114205820752,
             '10273147': 0.2874312906147798,
             '1031534': 0.3353365057172431,
             '10359591': 0.3353365057172431,
             '10365749': 0.2874312906147798,
             '10374686': 0.3658216426006289,
             '10485142': 0.37725356893189854,
             '10486817': 0.44711534095632416,
             '1049501': 0.3353365057172431,
             '10504681': 1.2072114205820752,
             '10509344': 0.7545071378637971,
             '10562341': 0.16766825285862155,
             '10574949': 1.0060095171517294,
             '10608397': 0.25150237928793234,
             '10641715': 0.5030047585758647,
             '10666475': 0.3353365057172431,
             '10670430': 0.6706730114344862,
             '10692412': 0.16766825285862155,
     

In [21]:
# display the keys
print(list(weighted_inverted_index.keys()))

['microstructur', 'develop', 'human', 'newborn', 'cerebr', 'white', 'matter', 'assess', 'vivo', 'diffus', 'tensor', 'magnet', 'reson', 'imag', 'alter', 'architectur', 'brain', 'affect', 'cortic', 'result', 'function', 'disabl', 'line', 'scan', 'mri', 'sequenc', 'analysi', 'appli', 'measur', 'appar', 'coeffici', 'calcul', 'rel', 'anisotropi', 'delin', 'fiber', 'preterm', 'n', 'infant', 'effect', 'prematur', 'earli', 'gestat', 'studi', 'second', 'time', 'term', 'central', 'mean', 'wk', 'high', 'decreas', 'toward', 'posterior', 'limb', 'intern', 'capsul', 'similar', 'versu', 'higher', 'closer', 'birth', 'greater', 'absolut', 'valu', 'show', 'p', 'lower', 'area', 'compar', 'nonmyelin', 'corpu', 'callosum', 'visibl', 'mark', 'differ', 'organ', 'data', 'indic', 'quantit', 'water', 'provid', 'insight', 'live', 'induct', 'myelodysplasia', 'suppressor', 'cell', 'myelodysplast', 'syndrom', 'md', 'stem', 'malign', 'share', 'biolog', 'featur', 'activ', 'adapt', 'immun', 'respons', 'ineffect', 'hem

# Step 3

[10 points] Retrieval and Ranking:  Use the inverted index (from step 2) to find the limited set of documents that contain at least one of the query words. </br>
Compute the cosine similarity scores between a query and each document.</br> 
•       Input: One query and the Inverted Index (from Step2)</br>
•       Output: Similarity values between the query and each of the documents. Rank the documents in decreasing order of similarity scores.</br>


Run your system on the set of test queries.  Include the output in your submission as a file named Results.  [10 points]
The file should have the following format, for the top-100 results for each query/topic (the queries should be ordered in ascending order):
query_id Q0 doc_id rank score tag
where: query_id is the topic/query number, Q0 is an unused field (the literal 'Q0'), docno is  the document id taken from the doc_id field of the segment, rank is the rank assigned by your system to the segment (1 is the highest rank), score is  the computed degree of match between the segment and the topic, and tag is a unique identifier you chose for this run (same for every topic and segment). 

Example:

1 Q0 doc_id1 1 0.8032 run_name

1 Q0 doc_id2 0.7586 run_name

1 Q0 doc_id3 3 0.6517 run_name

…

 
The relevance feedback file (expected solution) contains one or more relevant cdocuments for each query (any other documents are considered non-relevant).</br>
Example:   query-id          doc-id              score
1                      31715818        1
3                      14717500        1
5                      13734012        1
…

In [22]:
import math

def query_vector_maker_and_retrieval(query, weighted_inverted_index, df_corpus):
    
    #N in for formula of log(N/df) (N represents total number of documents in corpus)
    sizeOfCorpus=len(df_corpus)
    
    #Take the query row and extract the "text" portion of it. We then pass it to the preprocessing function defined above to get the tokens
    query_tokens = preprocessing(query['text'])

    #Constructing the query vector by assigning weights to every Term in the query 
    #tf_query = count_of_term/max frequency in query
    #idf_query= log(size of corpus / number of documents the query's term appears in)
    #weight= tf_query * idf_query
    
    #Find token with highest frequency in query to use in tf formula
    max_query_count = max([query_tokens.count(token) for token in query_tokens])

    #Will be the query vector (query tokens and their weights)
    query_vector={}

    for token in query_tokens:
        tf_query = query_tokens.count(token)/max_query_count

        if token in weighted_inverted_index:
            numOfDocsQueryTokensAppearIn = len(weighted_inverted_index[token])
            idf_query = math.log(sizeOfCorpus / numOfDocsQueryTokensAppearIn, 2)
            query_vector[token] = tf_query * idf_query

        else:
            query_vector[token]=0


    #Creating the retrieved documents dictionnary which will contain ALL documents (that contain 1 or more tokens from the query), the tokens, and the weights  
    #EX/FORMAT:query={A,B,C,D} => { doc1:{A:1.1, B:2.23}, doc2:{B:2.34, C:3.02}, doc3:{A:3.53, B:1.134, C:2.243} }
    retrieved_docs={}
    
    for term in query_tokens:
        if term in weighted_inverted_index:
            for doc_id, weight in weighted_inverted_index[term].items():
                if doc_id not in retrieved_docs:
                    retrieved_docs[doc_id] = {}
                
                retrieved_docs[doc_id][term] = weight

    #here for debugging purposes
    # print(query_tokens)
    # pprint.pprint(dict(list(query_vector.items())[:50])) # display 2 items
    # print(len(retrieved_docs))

    #returns the retrieved_docs dictionary AND the query vector as a tuple
    return (retrieved_docs, query_vector)

In [23]:
retrieved_docs, query_vector=query_vector_maker_and_retrieval(df_queries.iloc[100], weighted_inverted_index, df_corpus)

In [24]:
# check inverted_index
import pprint
pprint.pprint(dict(list(retrieved_docs.items())[:50])) # display 2 items

{'10463997': {'bariatr': 0.6342957185533858,
              'reduc': 0.19139827042533727,
              'surgeri': 0.8423754546479958},
 '10577574': {'surgeri': 1.1793256365071942},
 '10582939': {'reduc': 0.1786383857303148, 'surgeri': 0.39310854550239804},
 '11117498': {'surgeri': 0.3685392614084982},
 '1156322': {'reduc': 0.24359779872315657, 'surgeri': 0.5360571075032701},
 '11718220': {'reduc': 0.8038727357864166, 'surgeri': 0.5896628182535971},
 '11933721': {'surgeri': 1.4741570456339927},
 '13514898': {'surgeri': 4.717302546028777},
 '13625993': {'surgeri': 1.0721142150065401},
 '14118484': {'cancer': 1.3523803141064907, 'surgeri': 2.9483140912679855},
 '1428840': {'cancer': 0.8452376963165567,
             'reduc': 0.16747348662217013,
             'surgeri': 0.3685392614084982},
 '15559582': {'cancer': 1.159183126376992, 'surgeri': 0.8423754546479958},
 '16390264': {'cancer': 2.1251690650244854,
              'colorect': 1.3836275633620407,
              'surgeri': 0.84237545464

In [25]:
import math

def ranking(retrieved_docs, query_vector, document_vectors):
    
    #will contain the results of cosine similarity calculations 
    results_dict={}
    
    #will hold length of query and document vectors (euclidean norm)
    query_length=0
    doc_length=0

    #can calculate query vector length here since we only have once query and multiple documents
    query_length = math.sqrt(sum(val**2 for val in query_vector.values()))

    for docs in retrieved_docs:

        #perforing dot product here (Ex: query_vector={x1,x2,x3} and doc_vector= {y1,y2,y3}. query_vector*doc_vector= x1*y1+x2*y2+x3*y3)
        cos_sim_numer = sum(query_vector.get(token, 0) * retrieved_docs[docs].get(token, 0) for token in query_vector)

        #calculating document vector length here
        doc_length = math.sqrt(sum(value**2 for value in document_vectors[docs].values()))
        
        cos_sim_denom = query_length*doc_length

        if cos_sim_denom!=0:
            results_dict[docs] = (cos_sim_numer/cos_sim_denom)
        else:
            results_dict[docs] = 0
        
    #result_dict has to now be ranked in descending order (highest score to lowest) and we need to display top 100 results (I think)
    return results_dict

        

In [26]:
x=ranking(retrieved_docs, query_vector, document_vectors)

In [27]:
print(x)

{'79447': 0.09052084488578858, '5824985': 0.612901904704833, '7627167': 0.3065403680732203, '10463997': 0.0882874110265812, '18872233': 0.46144083636051253, '19071857': 0.2615209831447685, '29022271': 0.5752712918059889, '39187170': 0.06199044487832655, '40949706': 0.34908090559479077, '41790911': 0.07630252760745644, '43220289': 0.5005637212564299, '654735': 0.02714927955085694, '750781': 0.016475975104028127, '1156322': 0.016282135864845528, '1428840': 0.0399325631514954, '1905095': 0.018280667784643552, '2015126': 0.1452457188457295, '2391552': 0.03842589860828622, '2828460': 0.025384267676776026, '3690068': 0.014801566466319438, '4200695': 0.050299340064341815, '4687948': 0.02143459289179157, '5839365': 0.029872551832201187, '5884524': 0.02227219847145964, '6477536': 0.053074913269465326, '6580081': 0.08878889696165014, '6751418': 0.019256061681052386, '6767133': 0.06903992427920823, '7583104': 0.018709809103619718, '7655029': 0.014036542740539972, '7813993': 0.02446467744221123, '

In [36]:
# We run all queries, rank the documents for each query and write the results to a file
def run_queries_and_write_to_file(document_vectors, weighted_inverted_index, df_queries, df_corpus):
  res = []

  # We retrieve the documents and the query vector for each query
  for _, row in df_queries.iterrows():
    retrieved_docs, query_vector=query_vector_maker_and_retrieval(row, weighted_inverted_index, df_corpus)
    results=ranking(retrieved_docs, query_vector, document_vectors)
    sorted_results = dict(sorted(results.items(), key=lambda item: item[1], reverse=True))
    res.append([row["_id"], sorted_results])

  print(res[0])
  # We write the results to a file but for each query only the first 100 results
  with open("results.txt", "w") as file:
      file.write("query_id\tQ0\tdoc_id\trank\tscore\ttag\n")
      for _, r in enumerate(res):
          for j, (doc_id, score) in enumerate(r[1].items()):
              if j >= 100:
                  break
              file.write(f"{r[0]} Q0 {doc_id} {j} {score:.5f} tag_{r[0]}_{doc_id}\n")


In [37]:
run_queries_and_write_to_file(document_vectors, weighted_inverted_index, df_queries, df_corpus)

['0', {'26731863': 0.1159189866670982, '13231899': 0.09912187582562139, '10906636': 0.08642546776619163, '26071782': 0.08137343773253128, '994800': 0.08071440063188834, '42421723': 0.08027751494541922, '35008773': 0.07532247890790099, '12156187': 0.06769484668936716, '21439640': 0.0669552245289331, '42731834': 0.06396165009538923, '7581911': 0.060602944537893125, '1855679': 0.05872322548255615, '825728': 0.05815180093723733, '10786948': 0.05745592925185387, '18953920': 0.05581818270959044, '23244529': 0.05565313430189793, '14827874': 0.05488489280258661, '2566674': 0.053855710487085325, '6227220': 0.053685586796759575, '8185080': 0.05336486921945724, '21257564': 0.05244647579215285, '8417211': 0.052050933194069296, '3203590': 0.051851331412389325, '515489': 0.05092889549497683, '37949139': 0.05087616821097931, '35760786': 0.05087541675117924, '36480032': 0.05058009378272123, '31882215': 0.0503478788389302, '7399084': 0.05008036739622793, '28138927': 0.049626539361070206, '12824568': 0.