In [1]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

#### Part 2 - Search for ArXiv papers using text vector similarity
Use natural language to search for research papers on ArXiv and<br>
get search results in a format that enables quick review.<br>
by vbookshelf<br>
20 Feb 2024

Part 1 - Build an ArXiv RAG search system w FAISS<br>
https://www.kaggle.com/code/vbookshelf/part-1-build-an-arxiv-rag-search-system-w-faiss

## Introduction

In Part 1 we built a RAG search system that allows us to use natural language to search for ArXiv research papers. In this notebook you can submit search queries and review the results.

We will be running a FAISS exhaustive (brute-force) search. Normally a nearest neigbors search would be used because it's faster. But I found that, even with more that 2.4 million vectors, a FAISS exhaustive search is still very fast.

Here we won't be using OpenAi to generate a natural language output because in this context that feature doesn't add alot of value.

## How to run a search

To run a search you'll need to "Copy and edit" this notebook. Then run each cell.

Please ensure that the GPU (P100) is switched on.

## Install packages

In [2]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting transformers<5.0.0,>=4.32.0
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub>=0.15.1
  Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.4/346.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting fsspec>=2023.5.0
  Downloading fsspec-2024.2.0-py3-none-any.whl (170 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.9/170.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.19,>=0.14
  Downloading tokenizers-

In [3]:
#!pip install faiss-cpu
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
[0m

In [1]:
import pandas as pd
import numpy as np
import os

import json
import re

In [2]:
# The embeddings and the dataframe created and saved in Part 1

PATH_TO_EMBEDS = './all_paper_vector_embeddings/compressed_array.npz'
PATH_TO_DF = './all_paper_vector_embeddings/compressed_dataframe.csv'


In [3]:
os.listdir('../input/')

FileNotFoundError: [Errno 2] No such file or directory: '../input/'

## Helper functions

In [4]:
def run_faiss_search(query_text, top_k):

    # Run FAISS exhaustive search

    query = [query_text]

    # Vectorize the query string
    query_embedding = model.encode(query)

    # Run the query
    # index_vals refers to the chunk_list index values
    scores, index_vals = faiss_index.search(query_embedding, top_k)

    # Get the list of index vals
    index_vals_list = index_vals[0]

    return index_vals_list


def run_rerank(index_vals_list, query_text):

    chunk_list = list(df_data['prepared_text'])

    # Replace the chunk index values with the corresponding strings
    pred_strings_list = [chunk_list[item] for item in index_vals_list]

    # Format the input for the cross encoder
    # The input to the cross_encoder is a list of lists
    # [[query_text, pred_text1], [query_text, pred_text2], ...]

    cross_input_list = []

    for item in pred_strings_list:

        new_list = [query_text, item]

        cross_input_list.append(new_list)


    # Put the pred text into a dataframe
    df = pd.DataFrame(cross_input_list, columns=['query_text', 'pred_text'])

    # Save the orginal index (i.e. df_data index values)
    df['original_index'] = index_vals_list

    # Now, score all retrieved passages using the cross_encoder
    cross_scores = cross_encoder.predict(cross_input_list)

    # Add the scores to the dataframe
    df['cross_scores'] = cross_scores

    # Sort the DataFrame in descending order based on the scores
    df_sorted = df.sort_values(by='cross_scores', ascending=False)

    # Reset the index (*This was missed previously*)
    df_sorted = df_sorted.reset_index(drop=True)

    pred_list = []

    for i in range(0,len(df_sorted)):

        text = df_sorted.loc[i, 'pred_text']

        # Get the arxiv id
        # original_index refers to the index values in df_filtered
        original_index = df_sorted.loc[i, 'original_index']
        arxiv_id = df_data.loc[original_index, 'id']
        cat_text = df_data.loc[original_index, 'cat_text']
        title = df_data.loc[original_index, 'title']

        # Crete the link to the research paper pdf
        link_to_pdf = f'https://arxiv.org/pdf/{arxiv_id}'

        item = {
            'arxiv_id': arxiv_id,
            'link_to_pdf': link_to_pdf,
            'cat_text': cat_text,
            'title': title,
            'abstract': text
        }

        pred_list.append(item)

    return pred_list


def print_search_results(pred_list, num_results_to_print):

    for i in range(0,num_results_to_print):

        pred_dict = pred_list[i]

        link_to_pdf = pred_dict['link_to_pdf']
        abstract = pred_dict['abstract']
        cat_text = pred_dict['cat_text']
        title = pred_dict['title']

        print('Title:',title)
        print('Categories:',cat_text)
        print('Abstract:',abstract)
        print('Link to pdf:',link_to_pdf)
        print()


def run_arxiv_search(query_text, num_results_to_print, top_k=300):

    # Run a faiss greedy search
    pred_index_list = run_faiss_search(query_text, top_k)

    # This returns a list of dicts with length equal to top_k
    pred_list = run_rerank(pred_index_list, query_text)

    # Print the results
    print_search_results(pred_list, num_results_to_print)


## Load the embedding vectors and the dataframe

We will load the embeddings and the dataframe from the ouput of the Part 1 notebook.

In [6]:
# Load the compressed array
embeddings = np.load(PATH_TO_EMBEDS)

# Access the array by the name you specified ('my_array' in this case)
embeddings = embeddings['array_data']

embeddings.shape

(2431235, 384)

In [8]:
# Load the compressed DataFrame

#df_data = pd.read_csv(PATH_TO_DF, compression='gzip')
df_data = pd.read_csv(PATH_TO_DF)

print(df_data.shape)

#df_data.head()

  df_data = pd.read_csv(PATH_TO_DF)


(2431235, 6)


## Initialize the packages

In [9]:
# Initialize FAISS

import faiss

embed_length = embeddings.shape[1]

faiss_index = faiss.IndexFlatL2(embed_length)

# Add the embeddings to the index
faiss_index.add(embeddings)

faiss_index.is_trained

True

In [10]:
# Initialize sentence_transformers

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")



AttributeError: module 'numpy.linalg._umath_linalg' has no attribute '_ilp64'

In [None]:
# Initialize the cross_encoder for reranking

from sentence_transformers import CrossEncoder

# We use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
# ---------------------------------- #
# RUN A SEARCH

## Run a search

You'll notice that the title is still appended to the abstract. I left it in as a visual check to ensure that I haven't made any errors when displaying the results.

1- You might improve the search results by
providing more details in your search query.<br>
2- I suggest that you enter a similar search query on the ArXiv website to compare the search results and the user experience.<br>
https://arxiv.org/search/advanced

In [None]:
# *** PLEASE ENTER YOUR SEARCH QUERY HERE ***

query_text = """

I want to build an invisibility cloak like the one in Harry Potter.

"""


# RUN THE SEARCH
num_results_to_print = 20 # top_k = 300
run_arxiv_search(query_text, num_results_to_print)