In [76]:
import pandas as pd
import numpy as np
import json
import math
import re

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

from IPython.display import display, HTML

from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
file_path = "arxiv_subset.json"

TOP_N = 100 # top n docs retrieved by BM25
TOP_K = 50 # top k docs retrieved by SBERT

In [78]:
def clean_text(text):

        if not isinstance(text, str):
           return ""

        lemmatizer = WordNetLemmatizer()

        text = re.sub(r'[^a-zA-Z\s.,;!?]', '', text) # Removes anything, but letters and some punc

        text = re.sub(r'\s+', ' ', text.lower().strip())

        tokens = word_tokenize(text)

        tokens = [lemmatizer.lemmatize(token) for token in tokens if len(token) > 1] #heavy on memory

        return ' '.join(tokens)

In [79]:
def check_doc_content(doc):

        content = ["id", "title", "abstract", "authors", "update_date"]

        for key in content:

            if not doc.get(key, "").strip(): # Empty or Blank spaces

                return True

        return False

In [80]:
def load_text_data(file_path, num_papers = None):

        docs = []

        with open(file_path, "r", encoding = "utf-8") as file:

            for i, line in enumerate(file):

                paper = json.loads(line)  # Convert JSON string to dictionary

                if check_doc_content(paper):
                    continue

                doc = dict(
                    id  =  paper["id"],
                    title =  paper["title"],
                    abstract = paper["abstract"],
                    authors = paper["authors"],
                    publication_date =  paper["update_date"]
                )

                docs.append(doc)

                if num_papers is not None and i == num_papers - 1:
                    break

        return docs

In [81]:
def process_text(documents):

        text_data = []

        for doc in documents:

            text = doc["title"] + " " + doc["abstract"]

            text = clean_text(text)

            text_data.append(text)

        vectorizer = CountVectorizer(stop_words = 'english')

        documents_vectorized = vectorizer.fit_transform(text_data) # Term count across docs
        vocabulary = vectorizer.get_feature_names_out() # Creating the term vocabulary

        # Matrix with doc ids as rows and vocabulary terms as cols
        dataframe = pd.DataFrame(documents_vectorized.toarray(), columns = vocabulary)

        return  text_data, vocabulary, dataframe

In [82]:
def process_query(query, vocabulary):

        query = clean_text(query)

        vectorizer = CountVectorizer(stop_words = 'english', vocabulary = vocabulary)

        q_terms = vectorizer.build_analyzer()(query)

        filtered_tokens = [term for term in q_terms if term in vocabulary]

        return filtered_tokens

In [83]:
def highlight_terms(text, query_terms):
    """Highlight query terms in text using HTML bold tags"""
    tokens = word_tokenize(text)
    highlighted = []

    for token in tokens:

        cleaned_token = clean_text(token)

        if cleaned_token in query_terms:

            highlighted.append(f"<b>{token}</b>")

        else:

            highlighted.append(token)

    return ' '.join(highlighted)


def display_results(doc_indices, scores, df, query_terms):
    """Display results in a formatted table"""
    results = []

    for idx, score in zip(doc_indices, scores):

        paper = df.iloc[idx]
        highlighted_title = highlight_terms(paper['title'], query_terms)
        highlighted_abstract = highlight_terms(paper['abstract'], query_terms)

        results.append({
            'DocID': paper['id'],
            'Title': highlighted_title,
            'Authors': ', '.join(paper['authors']),
            'Year': paper['publication_date'][:4],  # Extract year from date
            'Score': f'{score:.4f}',
            'Abstract': highlighted_abstract[:150] + '...'
        })

    return pd.DataFrame(results)

In [84]:
class BM25_Model:

    def __init__(self, documents, text_data, vocabulary, dataframe):

        self.documents = documents
        self.text_data = text_data
        self.vocabulary = vocabulary
        self.dataframe = dataframe

        self.k_1 =  1.2
        self.b = 0.75

        self.bm25_dataframe = self.calculate_scores()

# Checked!
    def calculate_scores(self):

        tfs = self.dataframe.div(self.dataframe.sum(axis = 1), axis = 0)
        dfs = (self.dataframe > 0).sum(axis = 0).to_numpy()
        idfs = np.log10(len(self.text_data) / dfs)
        #tf_idf = tfs * idfs

        dls = self.dataframe.sum(axis = 1).to_numpy()  # array of size N where N is the number of documents
        avgdl = np.mean(dls)  # single value

        numerator = np.array((self.k_1 + 1) * tfs)
        denominator = np.array(self.k_1 *((1 - self.b) + self.b * (dls / avgdl))).reshape(-1, 1) + np.array(tfs)

        BM25_tf = numerator / denominator

        BM25_score = idfs * BM25_tf

        bm25_dataframe = pd.DataFrame(BM25_score, columns = self.vocabulary)

        return bm25_dataframe

# Checked!
    def rank_documents(self, q_terms, top_n):

        q_terms_only_df = self.bm25_dataframe[q_terms]

        score_q_d = q_terms_only_df.sum(axis = 1)

        ranked_docs = sorted(zip(enumerate(self.text_data), score_q_d.values),
                     key = lambda tup:tup[1],
                     reverse = True)

        if top_n >  len(ranked_docs):

            top_n = len(ranked_docs)

        ranked_docs = [doc for doc in ranked_docs if doc[1] > 0]

        ranked_docs = ranked_docs[:top_n]

        '''for doc in ranked_docs:

          print(f'\nScore: {doc[1]:.4f}, Document {doc[0][0]}: "{doc[0][1]}"')'''

        return ranked_docs

In [85]:
class SBERT_Model:

    def __init__(self):

        self.sbert_model = SentenceTransformer('all-mpnet-base-v2')


    def rank_documents(self, ranked_docs, query, top_k):

        if top_k > len(ranked_docs):

            top_k = len(ranked_docs)

        doc_texts = [doc[0][1] for doc in ranked_docs]

        query_embedding = self.sbert_model.encode(query)

        doc_embeddings = self.sbert_model.encode(doc_texts)

        # Calculating Cosine similarity
        dot_product = np.dot(doc_embeddings, query_embedding)

        doc_norms = np.linalg.norm(doc_embeddings, axis = 1)

        query_norm = np.linalg.norm(query_embedding)

        similarities = dot_product / (doc_norms * query_norm)

        sorted_indices = np.argsort(similarities)[::-1]

        reranked_docs = [(ranked_docs[i][0], similarities[i]) for i in sorted_indices[:top_k]]

        for doc in reranked_docs:

            print(f'\nSemantic Score: {doc[1]:.4f}, Document {doc[0][0]}: "{doc[0][1]}"')

        return reranked_docs

In [86]:
documents = load_text_data(file_path)
docs_df = pd.DataFrame(documents)

text_data, vocabulary, dataframe = process_text(documents)

BM25_model = BM25_Model(documents, text_data, vocabulary, dataframe)
SBERT_model = SBERT_Model()

In [87]:
query = "deep learning"
processed_query = process_query(query, vocabulary)
print(processed_query)

ranked_docs = BM25_model.rank_documents(processed_query, TOP_N)
ranked_docs = SBERT_model.rank_documents(ranked_docs, ' '.join(processed_query), TOP_K)

['deep', 'learning']

Semantic Score: 0.3479, Document 236: "neural network approach to ordinal regression ordinal regression is an important type of learning which ha property of both classification and regression here we describe simple and effective approach to adapt traditional neural network to learn ordinal category our approach is generalization of the perceptron method for ordinal regression on several benchmark datasets our method nnrank outperforms neural network classification method compared with the ordinal regression method using gaussian process and support vector machine nnrank achieves comparable performance moreover nnrank ha the advantage of traditional neural network learning in both online and batch mode handling very large training datasets and making rapid prediction these feature make nnrank useful and complementary tool for largescale data processing task such a information retrieval web page ranking collaborative filtering and protein ranking in bioinformatics

In [88]:
ground_truth = [236, 160, 5, 904, 187]
predicted_labels = [doc[0][0] for doc in ranked_docs]
scores = [doc[1] for doc in ranked_docs]

results_df = display_results(predicted_labels, scores, docs_df, processed_query)
display(HTML(results_df.to_html(escape = False)))

Unnamed: 0,DocID,Title,Authors,Year,Score,Abstract
0,704.1028,A neural network approach to ordinal regression,"J, i, a, n, l, i, n, , C, h, e, n, g",2007,0.3479,"Ordinal regression is an important type of learning , which has properties of both classification and regression . Here we describe a simple an..."
1,704.0671,Learning from compressed observations,"M, a, x, i, m, , R, a, g, i, n, s, k, y",2016,0.3289,The problem of statistical learning is to construct a predictor of a random variable $ Y $ as a function of a related random variable $ X $ on ...
2,704.1274,Parametric Learning and Monte Carlo Optimization,"D, a, v, i, d, , H, ., , W, o, l, p, e, r, t, , a, n, d, , D, e, v, , G, ., , R, a, j, n, a, r, a, y, a, n",2011,0.2533,"This paper uncovers and explores the close relationship between Monte Carlo Optimization of a parametrized integral ( MCO ) , Parametric machine-Learn..."
3,704.166,The VVDS type-1 AGN sample : The faint end of the luminosity function,"A, ., , B, o, n, g, i, o, r, n, o, ,, , G, ., , Z, a, m, o, r, a, n, i, ,, , I, ., , G, a, v, i, g, n, a, u, d, ,, , B, ., , M, a, r, a, n, o, ,, , S, ., , P, a, l, t, a, n, i, ,, , G, ., \n, , , M, a, t, h, e, z, ,, , J, ., P, ., , P, i, c, a, t, ,, , M, ., , C, i, r, a, s, u, o, l, o, ,, , F, ., , L, a, m, a, r, e, i, l, l, e, ,, , D, ., , B, o, t, t, i, n, i, ,, , B, ., , G, a, r, i, l, l, i, ,, , V, ., \n, , , L, e, , B, r, u, n, ,, , O, ., , L, e, , F, e, v, r, e, ,, , D, ., , M, a, c, c, a, g, n, i, ,, , R, ., , S, c, a, r, a, m, e, l, l, a, ,, , M, ., , S, c, o, d, e, g, g, i, o, ,, , L, ., , T, r, e, s, s, e, ,, , G, ., \n, , , V, e, t, t, o, l, a, n, i, ,, , A, ., , Z, a, n, i, c, h, e, l, l, i, ,, , C, ., , A, d, a, m, i, ,, , S, ., , A, r, n, o, u, t, s, ,, , S, ., , B, a, r, d, e, l, l, i, ,, , M, ., , B, o, l, z, o, n, e, l, l, a, ,, \n, , , A, ., , C, a, p, p, i, ,, , S, ., , C, h, a, r, l, o, t, ,, , P, ., , C, i, l, i, e, g, i, ,, , T, ., , C, o, n, t, i, n, i, ,, , S, ., , F, o, u, c, a, u, d, ,, , P, ., , F, r, a, n, z, e, t, t, i, ,, , L, ., \n, , , G, u, z, z, o, ,, , O, ., , I, l, b, e, r, t, ,, , A, ., , I, o, v, i, n, o, ,, , H, ., J, ., , M, c, C, r, a, c, k, e, n, ,, , C, ., , M, a, r, i, n, o, n, ,, , A, ., , M, a, z, u, r, e, ,, , B, ., \n, , , M, e, n, e, u, x, ,, , R, ., , M, e, r, i, g, h, i, ,, , R, ., , P, e, l, l, o, ', ,, , A, ., , P, o, l, l, o, ,, , L, ., , P, o, z, z, e, t, t, i, ,, , M, ., , R, a, d, o, v, i, c, h, ,, , E, ., , Z, u, c, c, a, ,, \n, , , E, ., , H, a, t, z, i, m, i, n, a, o, g, l, o, u, ,, , M, ., , P, o, l, l, e, t, t, a, ,, , M, ., , B, o, n, d, i, ,, , J, ., , B, r, i, n, c, h, m, a, n, n, ,, , O, ., , C, u, c, c, i, a, t, i, ,, , S, ., , d, e, \n, , , l, a, , T, o, r, r, e, ,, , L, ., , G, r, e, g, o, r, i, n, i, ,, , Y, ., , M, e, l, l, i, e, r, ,, , P, ., , M, e, r, l, u, z, z, i, ,, , S, ., , T, e, m, p, o, r, i, n, ,, , D, ., , V, e, r, g, a, n, i, ,, \n, , , C, ., J, ., , W, a, l, c, h, e, r",2009,0.1504,"In a previous paper ( Gavignaud et al . 2006 ) , we presented the type-1 Active Galactic Nuclei ( AGN ) sample obtained from the first epoch data of t..."
4,704.1319,Using conceptual metaphor and functional grammar to explore how language used in physics affects student learning,"D, a, v, i, d, , T, ., , B, r, o, o, k, e, s, ,, , E, u, g, e, n, i, a, , E, t, k, i, n, a",2009,0.1463,This paper introduces a theory about the role of language in learning physics . The theory is developed in the context of physics students ' an...
5,704.1744,Blazar surveys with WMAP and Swift,"P, ., , G, i, o, m, m, i, ,, , M, ., , C, a, p, a, l, b, i, ,, , E, ., , C, a, v, a, z, z, u, t, i, ,, , S, ., , C, o, l, a, f, r, a, n, c, e, s, c, o, ,, , S, ., , C, u, t, i, n, i, ,, , D, ., \n, , , G, a, s, p, a, r, r, i, n, i, ,, , E, ., , M, a, s, s, a, r, o, ,, , P, ., , P, a, d, o, v, a, n, i, ,, , M, ., , P, e, r, r, i, ,, , S, ., , P, u, c, c, e, t, t, i",2009,0.1186,We present the preliminary results from two new surveys of blazars that have direct implications on the GLAST detection of extragalactic sources from ...
6,704.0042,"General System theory , Like-Quantum Semantics and Fuzzy Sets","I, g, n, a, z, i, o, , L, i, c, a, t, a",2010,0.1139,It is outlined the possibility to extend the quantum formalism in relation to the requirements of the general systems theory . It can be done by using...
7,704.16,The VIMOS VLT Deep Survey . The Assembly History of the Stellar Mass in Galaxies : from the Young to the Old Universe,"L, ., , P, o, z, z, e, t, t, i, ,, , M, ., , B, o, l, z, o, n, e, l, l, a, ,, , F, ., , L, a, m, a, r, e, i, l, l, e, ,, , G, ., , Z, a, m, o, r, a, n, i, ,, , P, ., , F, r, a, n, z, e, t, t, i, ,, \n, , , O, ., , L, e, , F, \, `, e, v, r, e, ,, , A, ., , I, o, v, i, n, o, ,, , S, ., , T, e, m, p, o, r, i, n, ,, , O, ., , I, l, b, e, r, t, ,, , S, ., , A, r, n, o, u, t, s, ,, , S, ., , C, h, a, r, l, o, t, ,, , J, ., \n, , , B, r, i, n, c, h, m, a, n, n, ,, , E, ., , Z, u, c, c, a, ,, , L, ., , T, r, e, s, s, e, ,, , M, ., , S, c, o, d, e, g, g, i, o, ,, , L, ., , G, u, z, z, o, ,, , D, ., , B, o, t, t, i, n, i, ,, , B, ., \n, , , G, a, r, i, l, l, i, ,, , V, ., , L, e, , B, r, u, n, ,, , D, ., , M, a, c, c, a, g, n, i, ,, , J, ., , P, ., , P, i, c, a, t, ,, , R, ., , S, c, a, r, a, m, e, l, l, a, ,, , G, ., , V, e, t, t, o, l, a, n, i, ,, \n, , , A, ., , Z, a, n, i, c, h, e, l, l, i, ,, , C, ., , A, d, a, m, i, ,, , S, ., , B, a, r, d, e, l, l, i, ,, , A, ., , C, a, p, p, i, ,, , P, ., , C, i, l, i, e, g, i, ,, , T, ., , C, o, n, t, i, n, i, ,, , S, ., \n, , , F, o, u, c, a, u, d, ,, , I, ., , G, a, v, i, g, n, a, u, d, ,, , H, ., , J, ., , M, c, C, r, a, c, k, e, n, ,, , B, ., , M, a, r, a, n, o, ,, , C, ., , M, a, r, i, n, o, n, i, ,, , A, ., , M, a, z, u, r, e, ,, , B, ., \n, , , M, e, n, e, u, x, ,, , R, ., , M, e, r, i, g, h, i, ,, , S, ., , P, a, l, t, a, n, i, ,, , R, ., , P, e, l, l, \, `, o, ,, , A, ., , P, o, l, l, o, ,, , M, ., , R, a, d, o, v, i, c, h, ,, , M, ., , B, o, n, d, i, ,, \n, , , A, ., , B, o, n, g, i, o, r, n, o, ,, , O, ., , C, u, c, c, i, a, t, i, ,, , S, ., , d, e, , l, a, , T, o, r, r, e, ,, , L, ., , G, r, e, g, o, r, i, n, i, ,, , Y, ., , M, e, l, l, i, e, r, ,, , P, ., \n, , , M, e, r, l, u, z, z, i, ,, , D, ., , V, e, r, g, a, n, i, ,, , C, ., , J, ., , W, a, l, c, h, e, r",2009,0.1009,We present a detailed analysis of the Galaxy Stellar Mass Function of galaxies up to z=2.5 as obtained from the VVDS . We estimate the stellar mass fr...
8,704.1182,An Optical Source Catalog of the North Ecliptic Pole Region,"N, a, r, a, e, , H, w, a, n, g, , (, 1, ), ,, , M, y, u, n, g, , G, y, o, o, n, , L, e, e, , (, 1, ), ,, , H, y, u, n, g, , M, o, k, , L, e, e, , (, 1, ), ,, , M, y, u, n, g, s, h, i, n, , I, m, \n, , , (, 1, ), ,, , T, a, e, h, y, u, n, , K, i, m, , (, 1, ), ,, , H, i, d, e, o, , M, a, t, s, u, h, a, r, a, , (, 2, ), ,, , T, a, k, e, h, i, k, o, , W, a, d, a, , (, 2, ), ,, , S, h, i, n, k, i, , O, y, a, b, u, \n, , , (, 2, ), ,, , S, o, o, j, o, n, g, , P, a, k, , (, 3, ), ,, , M, o, o, -, Y, o, u, n, g, , C, h, u, n, , (, 4, ), ,, , H, i, d, e, n, o, r, i, , W, a, t, a, r, a, i, , (, 5, ), ,, , T, a, k, a, o, \n, , , N, a, k, a, g, a, w, a, , (, 4, ), ,, , C, h, r, i, s, , P, e, a, r, s, o, n, , (, 2, ,, 6, ), ,, , T, o, s, h, i, n, o, b, u, , T, a, k, a, g, i, , (, 2, ), ,, , H, i, t, o, s, h, i, , H, a, n, a, m, i, , (, 7, ), ,, \n, , , G, l, e, n, n, , J, ., , W, h, i, t, e, , (, 8, ,, 9, ), , (, (, 1, ), , S, N, U, , K, o, r, e, a, ,, , (, 2, ), , I, S, A, S, , J, A, X, A, , J, a, p, a, n, ,, , (, 3, ), K, H, U, , K, o, r, e, a, ,, \n, , , (, 4, ), K, A, S, I, , K, o, r, e, a, ,, , (, 5, ), , O, S, A, , J, A, X, A, , J, a, p, a, n, ,, , (, 6, ), , E, S, A, , S, p, a, i, n, ,, , (, 7, ), , I, w, a, t, e, , U, n, i, v, ., , J, a, p, a, n, ,, , (, 8, ), \n, , , O, p, e, n, , U, n, i, v, ., , U, K, ,, , (, 9, ), , C, C, L, R, C, , R, A, L, , U, K, )",2009,0.0927,"We present a five ( u * , g ' , r ' , i ' , z ' ) band optical photometry catalog of the sources in the North Ecliptic Pole ( NEP ) region based on"
9,704.034,Phonon-mediated decay of an atom in a surface-induced potential,"F, a, m, , L, e, , K, i, e, n, ,, , S, ., , D, u, t, t, a, , G, u, p, t, a, ,, , a, n, d, , K, ., , H, a, k, u, t, a",2007,0.0874,We study phonon-mediated transitions between translational levels of an atom in a surface-induced potential . We present a general master equation gov...


In [89]:
def compute_metrics(true_labels, pred_labels):

  tp, fp = 0, 0
  fn = 0

  for pred in pred_labels:

    if pred in true_labels:

      tp += 1
    else:
      fp += 1

  for true in true_labels:
    if true not in pred_labels:
      fn += 1

  precision = tp / (tp + fp) if (tp + fp) > 0 else 0
  recall = tp / (tp + fn) if (tp + fn) > 0 else 0
  f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

  return precision, recall, f1_score

In [90]:
def ndcg_at_k(retrieved, relevant, k):
    """Compute nDCG at k. Here binary relevance (1 if relevant, 0 if not)."""
    dcg = 0.0

    for i, doc_id in enumerate(retrieved[:k], start=1):

        rel = 1 if doc_id in relevant else 0

        dcg += (2**rel - 1) / math.log2(i + 1)
    # Compute ideal DCG
    ideal_rel = [1] * min(len(relevant), k)

    idcg = sum((2**rel - 1) / math.log2(i + 1) for i, rel in enumerate(ideal_rel, start=1))

    if idcg == 0:

        return 0.0

    return dcg / idcg

def reciprocal_rank(retrieved, relevant):
    """Compute the reciprocal rank for a single query."""
    for i, doc_id in enumerate(retrieved, start  = 1):

        if doc_id in relevant:

            return 1.0 / i

    return 0.0

In [None]:
precision, recall, f1_score = compute_metrics(ground_truth, predicted_labels)
ndcg = ndcg_at_k(predicted_labels, ground_truth, TOP_K)
reciprocal_rank = reciprocal_rank(predicted_labels, ground_truth)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1_score}')

print(f'nDCG: {ndcg}')
print(f'Reciprocal Rank: {reciprocal_rank}')

Precision: 0.29411764705882354
Recall: 1.0
F1-score: 0.45454545454545453
Ndcg: 0.9818483242455303
Reciprocal Rank: 1.0
