### Ingesting embeddings into Faiss index

In [1]:
import faiss
import json
import numpy as np
import os
import csv
from pathlib import Path
from tqdm import tqdm

Parse the JSONL files and extract each record's MedCPT embedding together with its PMID. The embeddings are added to a Faiss index, and a CSV file is written that maps each PMID (and its source filename) to the corresponding Faiss index id.

In [3]:
# Build a flat L2 Faiss index from the embeddings stored in the JSONL files and
# write a CSV that maps every Faiss row id back to its PMID and source file.

# Directory setup
index_directory = "/home/ubuntu/data/faiss_indices/medCPT"
index_file = "medCPT_index.index"
index_path = os.path.join(index_directory, index_file)
# Derived from index_directory so the two outputs cannot drift apart
csv_file = os.path.join(index_directory, "medCPT_pmids.csv")

# Ensure the index directory exists (no-op when it is already there)
os.makedirs(index_directory, exist_ok=True)

# Dimensions of the embeddings
d = 768

# Initialize the Faiss index (Flat L2-Index)
index = faiss.IndexFlatL2(d)

# Rows of [PMID, filename, Faiss row id]; written out once the index is saved
csv_path = csv_file
csv_rows = []

# Directory holding the JSONL files to ingest
source_directory = Path('/home/ubuntu/data/pubmed_medCPT')

# Sort by the integer that follows the last 'n' in the file stem.
# NOTE(review): presumably the files are named like "...n0001.jsonl"; a stem
# that does not end in digits after an 'n' raises ValueError here.
sorted_files = sorted(source_directory.glob('*.jsonl'), key=lambda x: int(x.stem.split('n')[-1]))


def _parse_record(line, dim):
    """Turn one JSONL line into a ``(pmid, vector)`` pair.

    Args:
        line: Raw text of a single JSONL record.
        dim: Required length of the embedding vector.

    Returns:
        ``(pmid, vector)`` where ``vector`` is a float32 array of shape
        ``(dim,)``, or ``None`` when the record has no embedding or no PMID
        (such records are skipped without a message).

    Raises:
        json.JSONDecodeError: The line is not valid JSON.
        TypeError, ValueError: The record is not a JSON object, the PMID is
            not an integer, or the embedding is not a numeric vector of
            length ``dim``.
    """
    record = json.loads(line)
    if not isinstance(record, dict):
        raise ValueError("record is not a JSON object")

    embedding = record.get('embedding')
    raw_pmid = record.get('PMID')
    if not embedding or raw_pmid is None:
        return None

    pmid = int(raw_pmid)
    if not pmid:
        return None

    vector = np.asarray(embedding, dtype='float32')
    if vector.shape != (dim,):
        raise ValueError(f"expected embedding of shape ({dim},), got {vector.shape}")
    return pmid, vector


# Processing sorted files with progress display
for file_name in tqdm(sorted_files, desc="Processing JSONL files"):
    file_pmids = []
    file_vectors = []

    with open(file_name, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            try:
                parsed = _parse_record(line, d)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {file_name}: {e}")
                continue
            except (TypeError, ValueError) as e:
                # A single malformed record must not abort the whole ingest
                print(f"Skipping invalid record in file {file_name}, line {line_no}: {e}")
                continue

            if parsed is None:
                continue
            pmid, vector = parsed
            file_pmids.append(pmid)
            file_vectors.append(vector)

    if not file_vectors:
        continue

    # One add() per file instead of one per vector; row ids are assigned
    # sequentially, so the i-th vector of this batch gets id first_id + i.
    first_id = index.ntotal
    index.add(np.stack(file_vectors))
    csv_rows.extend(
        [pmid, file_name.name, first_id + offset]
        for offset, pmid in enumerate(file_pmids)
    )

# Every indexed vector must have exactly one CSV row
assert index.ntotal == len(csv_rows), "index and PMID mapping are out of sync"

# Write the index to a file
faiss.write_index(index, index_path)

print(f"Index successfully written to: {index_path}")

# Write PMIDs to CSV file
with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['PMID', 'Filename', 'Index'])
    csv_writer.writerows(csv_rows)

print(f"CSV file successfully written to: {csv_path}")

Processing JSONL files: 100%|██████████| 140/140 [41:12<00:00, 17.66s/it]


Index successfully written to: /home/ubuntu/data/faiss_indices/medCPT/medCPT_index.index
CSV file successfully written to: /home/ubuntu/data/faiss_indices/medCPT/medCPT_pmids.csv


In [4]:
# Load a previously saved Faiss index from disk, replacing any `index` in memory.
# NOTE(review): this relative path is resolved against the kernel's working
# directory and is not the index file written by the ingestion cell above —
# confirm that loading a different index here is intended.
saved_index_file = 'faiss_indices/PM_index.index'
index = faiss.read_index(saved_index_file)

In [None]:
# Smoke-test the loaded index with one random query vector.
k = 10  # Number of nearest neighbors

# Faiss takes queries as a 2-D float32 array of shape (n_queries, d); a plain
# Python list is rejected. The width is read from the index so the query
# always matches the dimensionality of whatever index is loaded.
query = np.random.rand(1, index.d).astype('float32')

# Both results have shape (1, k): distances to, and row ids of, the k nearest vectors
distances, indices = index.search(query, k)

In [1]:
import requests
import numpy as np
import json

# Send one random 768-dimensional query to the search endpoint and print the reply.

# URL of the Flask endpoint
url = 'http://localhost:5000/search'

# Generate a random vector of length 768 as a plain list so it is JSON-serializable
random_vector = np.random.rand(768).tolist()

# Data for the POST request; 'queries' is a list of vectors (a list of lists)
data = {
    'queries': [random_vector]
}

# `json=` serializes the payload and sets the Content-Type header for us.
# The timeout keeps the cell from hanging forever if the server is unreachable
# or stalls; requests has no default timeout.
response = requests.post(url, json=data, timeout=60)

# Output the response
print('Status Code:', response.status_code)
print('Response:', response.json())

Status Code: 200
Response: {'distances': [[348.77490234375, 348.9889221191406, 349.5247497558594, 349.7203369140625, 349.90228271484375, 349.9190979003906, 350.23382568359375, 350.36578369140625, 350.47930908203125, 350.5979309082031]], 'indices': [[470115, 1932016, 473742, 469270, 1405245, 670332, 1715754, 2382674, 1707872, 2141577]]}
