# CSI 4107: Assignment 1

#### Phung, Quoc Dat (300164087)
#### Slimane-Kadi, Rami
#### Barry, Ousmane

In [360]:
# for loading data
import json
import pandas as pd

# Natural Language Toolkit for text processing
# Source: https://www.nltk.org/
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Step 1

<b>Preprocessing</b>:  Implement preprocessing functions for tokenization and stopword removal. The index terms will be all the words left after filtering out markup that is not part of the text, punctuation tokens, numbers, stopwords, etc. Optionally, you can use the Porter stemmer to stem the index words. </br>

•       Input: Documents that are read one by one from the collection</br>
•       Output: Tokens to be added to the index (vocabulary)</br>

In [363]:
# for removing common words
nltk.download('stopwords')

# for splitting text into words/sentences
nltk.download('punkt')
nltk.download('punkt_tab')

# after running this the output should be "true" (this means run successful)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qdatp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qdatp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\qdatp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [364]:
# loads a predefined list of common stopwords (like "the", "is", "and", "in")
# step 1 instruction says we can use Porter Stemmer (reduces words to their root form, e.g jumping -> jump)
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
print(stop_words)
print(len(stop_words))

{'and', 'aren', 'he', "mightn't", 'below', "you're", 'myself', 'any', 'not', 'mightn', 'was', "aren't", 'isn', 'be', 'on', 'during', 'my', 'by', 'yourself', 'from', 'whom', 'does', 'but', 'me', 'them', 'do', "it's", 'more', 've', "hasn't", 'each', 'doesn', 'these', 'yourselves', 'will', "you've", 'once', 'few', 'through', 'shouldn', 'don', 'where', 'his', 'which', 'while', 'didn', 'are', 'been', 'other', 'hasn', 'its', 'so', 'this', 'what', 'your', 'a', 'about', 'both', 'up', 'than', 'ma', 'they', 'mustn', 'or', 'those', 'll', 'being', 'with', "don't", 'ourselves', 'were', 'our', 'wasn', 'off', 'under', "wouldn't", 'very', 'as', 'at', 'itself', 's', 'am', 'why', 'here', 'did', 'just', 're', 'above', 'wouldn', 'in', 'is', "weren't", "she's", 'if', "hadn't", "you'd", 'too', "shouldn't", 'an', "needn't", "won't", 'further', 'their', 'into', "should've", 'the', 'her', 'it', 'now', 'to', 'same', 'shan', 'weren', 'all', 'that', 'hadn', 'theirs', 'doing', 'haven', 'only', 'before', 'd', 'have

In [365]:
# professor said to also add the stopwords from https://www.site.uottawa.ca/~diana/csi4107/StopWords
# add it for a comprehensive list
# I downloaded the file into "stopwords.txt"
def load_stopword_file(filepath):
    words = set()
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            word = line.strip().lower()
            # Ignore empty lines
            if word:  
                words.add(word)
    return stop_words

In [366]:
stop_words_from_file = load_stopword_file("from_professor/stopwords.txt")
stop_words.update(stop_words_from_file)
print(stop_words)
print(len(stop_words))

{'and', 'aren', 'he', "mightn't", 'below', "you're", 'myself', 'any', 'not', 'mightn', 'was', "aren't", 'isn', 'be', 'on', 'during', 'my', 'by', 'yourself', 'from', 'whom', 'does', 'but', 'me', 'them', 'do', "it's", 'more', 've', "hasn't", 'each', 'doesn', 'these', 'yourselves', 'will', "you've", 'once', 'few', 'through', 'shouldn', 'don', 'where', 'his', 'which', 'while', 'didn', 'are', 'been', 'other', 'hasn', 'its', 'so', 'this', 'what', 'your', 'a', 'about', 'both', 'up', 'than', 'ma', 'they', 'mustn', 'or', 'those', 'll', 'being', 'with', "don't", 'ourselves', 'were', 'our', 'wasn', 'off', 'under', "wouldn't", 'very', 'as', 'at', 'itself', 's', 'am', 'why', 'here', 'did', 'just', 're', 'above', 'wouldn', 'in', 'is', "weren't", "she's", 'if', "hadn't", "you'd", 'too', "shouldn't", 'an', "needn't", "won't", 'further', 'their', 'into', "should've", 'the', 'her', 'it', 'now', 'to', 'same', 'shan', 'weren', 'all', 'that', 'hadn', 'theirs', 'doing', 'haven', 'only', 'before', 'd', 'have

In [367]:
# we need this function because the instruction asks us to remove numbers in preprocessing
def is_number(s):
    try:
        # try converting to float
        float(s)  
        return True
    except ValueError:
        return False

In [368]:
def preprocessing(text):
    # 1. Tokenization: Split text into words
    # e.g input "I love reading books in 2025!" -> output ["I", "love", "reading", "books", "in" "2025", "!"]
    tokens = word_tokenize(text)  

    # 2. Lowercase
    # ["I", "love", "reading", "books", "!"] ->  ["i", "love", "reading", "books", "in", "2025", "!"]
    for i in range(len(tokens)): 
        tokens[i] = tokens[i].strip()
        tokens[i] = tokens[i].lower()
    
    # 3. REmove punctuation (and empty strings):
    # e.g ["i", "love", "reading", "books", "!"] -> ["i", "love", "reading", "books", "in", "2025"]
    no_punctuation_tokens = []
    for word in tokens:
        if len(word) > 0 and word.isalpha():
            no_punctuation_tokens.append(word)

    # 4. Remove stopwords
    # ["i", "love", "reading", "books"] -> ["love", "reading", "books", "2025"]
    no_stopwords_tokens = []
    for word in no_punctuation_tokens:
        if word not in stop_words:
            no_stopwords_tokens.append(word)

    # 5. instruction says to remove numbers as well
    # ["love", "reading", "books", "2025"] -> ["love", "reading", "books"]
    text_tokens = []
    for word in no_stopwords_tokens:
        if not is_number(word):
            text_tokens.append(word)

    # 6. Stemming
    # e.g ["love", "reading", "books"] -> ["love", "read", "book"]
    stemmed_tokens = []
    for word in text_tokens:
        stemmed_word = stemmer.stem(word)
        stemmed_tokens.append(stemmed_word)

    return stemmed_tokens

In [369]:
# Test case:
print(preprocessing("I love reading books in 2025! ''   "))

['love', 'read', 'book']


### Loading Files

In [371]:
# File paths
corpus_file = "scifact/corpus.jsonl"
queries_file = "scifact/queries.jsonl"
test_file = "scifact/qrels/test.tsv"

# columns from each file:
# corpus.jsonl: _id, title, text, metadata
corpus_columns = ["_id", "title", "text", "metadata"]

# queries.jsonl: _id, text, metadata
queries_columns = ["_id", "text", "metadata"]

# test.tsv: query-id, corpus-id, score
test_columns = ["query-id", "corpus-id", "score"]

In [372]:
def load_jsonl_file(file_path, columns):
    "load jsonl file then return dataframe"
    data = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            data.append(json.loads(line))
    return pd.DataFrame(data, columns=columns)

In [373]:
df_corpus = load_jsonl_file(corpus_file, corpus_columns)
df_corpus.head()

Unnamed: 0,_id,title,text,metadata
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,{}
1,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,{}
2,7912,"BC1 RNA, the transcript from a master gene for...",ID elements are short interspersed elements (S...,{}
3,18670,The DNA Methylome of Human Peripheral Blood Mo...,DNA methylation plays an important role in bio...,{}
4,19238,The human myelin basic protein gene is include...,Two human Golli (for gene expressed in the oli...,{}


In [374]:
df_queries = load_jsonl_file(queries_file, queries_columns)
df_queries.head()

Unnamed: 0,_id,text,metadata
0,0,0-dimensional biomaterials lack inductive prop...,{}
1,2,1 in 5 million in UK have abnormal PrP positiv...,"{'13734012': [{'sentences': [4], 'label': 'CON..."
2,4,1-1% of colorectal cancer patients are diagnos...,{}
3,6,10% of sudden infant death syndrome (SIDS) dea...,{}
4,9,32% of liver transplantation programs required...,"{'44265107': [{'sentences': [15], 'label': 'SU..."


In [375]:
def load_tsv_file(file_path, columns):
    "Load TSV file then return df"
    return pd.read_csv(file_path, sep="\t", names=columns)

In [376]:
df_test = load_tsv_file(test_file, test_columns)
df_test.head()

Unnamed: 0,query-id,corpus-id,score
0,query-id,corpus-id,score
1,1,31715818,1
2,3,14717500,1
3,5,13734012,1
4,13,1606628,1


### Preprocessing the Corpus

In [378]:
processed_texts = []
for i in range(len(df_corpus)):
    # title is important for query searching which is why we combine both the title and the text
    combined_text = df_corpus["title"][i] + " " + df_corpus["text"][i]
    processed_texts.append(preprocessing(combined_text))

# add this as a new column
df_corpus["processed_text"] = processed_texts

In [379]:
df_corpus.head()

Unnamed: 0,_id,title,text,metadata,processed_text
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,{},"[microstructur, develop, human, newborn, cereb..."
1,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,{},"[induct, myelodysplasia, suppressor, cell, mye..."
2,7912,"BC1 RNA, the transcript from a master gene for...",ID elements are short interspersed elements (S...,{},"[rna, transcript, master, gene, id, element, a..."
3,18670,The DNA Methylome of Human Peripheral Blood Mo...,DNA methylation plays an important role in bio...,{},"[dna, methylom, human, peripher, blood, mononu..."
4,19238,The human myelin basic protein gene is include...,Two human Golli (for gene expressed in the oli...,{},"[human, myelin, basic, protein, gene, includ, ..."


# Step 2

[10 points] Indexing: Build an inverted index, with an entry for each word in the vocabulary. </br>
You can use any appropriate data structure (hash table, linked lists, Access database, etc.). </br>
An example of possible index is presented below. Note: if you use an existing IR system, use its indexing mechanism. </br>
•       Input: Tokens obtained from the preprocessing module </br>
•       Output: An inverted index for fast access </br>

Example of inverted index   </br>
Suppose after preprocessing, </br>
doc 1: ["love", "read", "book"] </br>
doc 2: ["book", "write", "author"] </br>

Inverted index: </br>
{ </br>
    "love": {1: 1},          # "love" appears in Doc 1 (count = 1) </br>
    "read": {1: 1},          # "read" appears in Doc 1 (count = 1) </br>
    "book": {1: 1, 2: 1},    # "book" appears in Doc 1 and Doc 2 (count = 1 each) </br>
    "write": {2: 1},         # "write" appears in Doc 2 (count = 1) </br>
    "author": {2: 1}         # "author" appears in Doc 2 (count = 1) </br>
} </br>

In [383]:
def build_inverted_index_from_corpus(df_corpus):
    # note that df_corpus must be processed already
    inverted_index = {} 
    
    for i in range(len(df_corpus)):
        doc_id = df_corpus["_id"][i]  
        words = df_corpus["processed_text"][i]  
        
        # loop through words
        for word in words:
            
            # if the word is new, then create an entry doc_id with count 1
            if word not in inverted_index:
                inverted_index[word] = {doc_id: 1}  
                
            # if word isn't new
            else:
                # if the word exists but doc_id is new
                if doc_id not in inverted_index[word]:
                    inverted_index[word][doc_id] = 1  
                    
                # if both word and doc_id exist, simply increment count
                else:
                    inverted_index[word][doc_id] += 1
    
    return inverted_index

In [384]:
inverted_index = build_inverted_index_from_corpus(df_corpus)

In [385]:
# check inverted_index
import pprint
pprint.pprint(dict(list(inverted_index.items())[:2])) # display 2 items

{'develop': {'10009203': 3,
             '10068634': 1,
             '10165258': 1,
             '10165723': 1,
             '10190778': 3,
             '10273147': 1,
             '1031534': 1,
             '10359591': 1,
             '10365749': 1,
             '10374686': 6,
             '10485142': 3,
             '10486817': 2,
             '1049501': 1,
             '10504681': 3,
             '10509344': 3,
             '10562341': 1,
             '10574949': 3,
             '10608397': 1,
             '10641715': 1,
             '10666475': 1,
             '10670430': 1,
             '10692412': 1,
             '10692948': 5,
             '1070920': 1,
             '1071991': 1,
             '10749308': 1,
             '10761515': 1,
             '10827901': 1,
             '10852047': 1,
             '10874408': 1,
             '108886332': 1,
             '10889845': 3,
             '10931595': 1,
             '10937190': 2,
             '10976596': 2,
             '10982689'

In [386]:
# display the keys
print(list(inverted_index.keys()))

['microstructur', 'develop', 'human', 'newborn', 'cerebr', 'white', 'matter', 'assess', 'vivo', 'diffus', 'tensor', 'magnet', 'reson', 'imag', 'alter', 'architectur', 'brain', 'affect', 'cortic', 'result', 'function', 'disabl', 'line', 'scan', 'mri', 'sequenc', 'analysi', 'appli', 'measur', 'appar', 'coeffici', 'calcul', 'rel', 'anisotropi', 'delin', 'fiber', 'preterm', 'n', 'infant', 'effect', 'prematur', 'earli', 'gestat', 'studi', 'second', 'time', 'term', 'central', 'mean', 'wk', 'high', 'decreas', 'toward', 'posterior', 'limb', 'intern', 'capsul', 'similar', 'versu', 'higher', 'closer', 'birth', 'greater', 'absolut', 'valu', 'show', 'p', 'lower', 'area', 'compar', 'nonmyelin', 'corpu', 'callosum', 'visibl', 'mark', 'differ', 'organ', 'data', 'indic', 'quantit', 'water', 'provid', 'insight', 'live', 'induct', 'myelodysplasia', 'suppressor', 'cell', 'myelodysplast', 'syndrom', 'md', 'stem', 'malign', 'share', 'biolog', 'featur', 'activ', 'adapt', 'immun', 'respons', 'ineffect', 'hem