In [34]:
from opensearch_utils import *
import os
from datasets import load_dataset
from tqdm import tqdm

HUGGINGFACE_TOKEN = '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'
HUGGINGFACE_USERNAME = 'prio7777777'
HUGGINGFACE_DATASET_NAME = 'prio7777777/pubmed-demo'
vectored_data_path = os.path.join(os.getcwd(),"pubmed_demo_data.pkl")


In [35]:
!pip install langchain sentence-transformers
!pip install --upgrade transformers

from sentence_transformers import SentenceTransformer, util
## load model
# Load model directly

from transformers import AutoTokenizer, AutoModelForMaskedLM
embed_model_id = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")
# model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')



In [36]:
# load dataset
if not os.path.exists(vectored_data_path):
    dataset = load_dataset(HUGGINGFACE_DATASET_NAME,token=HUGGINGFACE_TOKEN)
    ## statistics about the dataset
    print("Average title length in tokens:", sum(len(doc['Title'].split()) for doc in dataset['train']) / len(dataset['train']))
    print("Average abstract length in tokens:", sum(len(doc['Abstract'].split()) for doc in dataset['train']) / len(dataset['train']))


In [37]:
connection_settings = {
    'DB_USERNAME': 'admin',
    'DB_PASSWORD': 'admin',
    'DB_HOSTNAME': 'localhost',
    'DB_PORT': '9200',
}

database_connection = opensearch_connection('pubmed-index',connection_settings=connection_settings)

Trying to connect...
Connected to OpenSearch {'name': 'opensearch-node1', 'cluster_name': 'opensearch-cluster', 'cluster_uuid': 'h2IMsAzEQ3WP1JlAoLdLVA', 'version': {'distribution': 'opensearch', 'number': '2.11.1', 'build_type': 'tar', 'build_hash': '6b1986e964d440be9137eba1413015c31c5a7752', 'build_date': '2023-11-29T21:43:10.135035992Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}
Creating index...
Successfully created index


In [38]:
# import SentenceTransformer and SentenceTransformersTokenTextSplitter classes
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# create a SentenceTransformersTokenTextSplitter object
splitter = SentenceTransformersTokenTextSplitter(
    model_name=embed_model_id,  # specify the model used for tokenization
    chunk_overlap=10,  # set the overlap between consecutive text chunks
)


No sentence-transformers model found with name C:\Users\priot/.cache\torch\sentence_transformers\microsoft_BiomedNLP-BiomedBERT-base-uncased-abstract. Creating a new one with MEAN pooling.


In [39]:
vectored_data = []
if not os.path.exists(vectored_data_path):

    for item in tqdm(dataset['train']):
        title = item['Title']
        abstract = item['Abstract'].replace("\n"," ")

        chunks = splitter.split_text(text=abstract)  # split the text into chunks

        for j, chunk in enumerate(chunks):
            metadata = {
                "title": title,
                "chunk_id": j,
                "chunk_text": chunk,
            }

            embedding = model.encode(chunk).tolist()

            ##TODO: create unique identifier
            
            vectored_data.append((metadata, embedding))        


In [40]:
# define a function to store the list
def store_list(data, filename):
  """
  Stores a list of tuples containing (id, embedding, metadata) to a file.

  Args:
      data: A list of tuples containing (id, embedding, metadata).
      filename: The filename to store the data.
  """
  with open(filename, "wb") as f:
      # use pickle to serialize the data
      import pickle
      pickle.dump(data, f)


# define a function to read the list
def read_list(filename):
  """
  Reads a list of tuples containing (embedding, metadata) from a file.

  Args:
      filename: The filename to read the data from.

  Returns:
      A list of tuples containing (id, embedding, metadata).
  """
  with open(filename, "rb") as f:
      # Use pickle to deserialize the data
      import pickle
      data = pickle.load(f)
  return data

In [41]:
if not os.path.exists(vectored_data_path):
    store_list(vectored_data, "pubmed_demo_data.pkl")

vectored_data = read_list("pubmed_demo_data.pkl")

In [42]:
test = ['is empty' for item in vectored_data if item[1] == []]

In [43]:
## convert embedding to numpy array
import numpy as np
embeddings = np.array([item[1] for item in vectored_data])
embeddings.shape

(32838, 384)

In [44]:
print(embeddings)

[[-0.13884073 -0.08436698  0.1584886  ...  0.42079914  0.03313481
   0.00922636]
 [-0.29567137 -0.15653485 -0.04584132 ...  0.18007696  0.22288924
  -0.09721413]
 [ 0.23207188 -0.06122811  0.12175538 ...  0.09339068  0.04582798
  -0.22307989]
 ...
 [-0.27166149  0.01676166  0.10427371 ... -0.27438816 -0.24200818
   0.19671562]
 [-0.04534031 -0.2993252   0.23102948 ...  0.00892905 -0.03001187
  -0.04205467]
 [-0.22488408 -0.04139412 -0.20064259 ... -0.03342036 -0.01439558
  -0.15747528]]


In [45]:
# loadArticlesVector(database_connection,vectored_data,'pubmed-index')

Saving articles to database: 100%|██████████| 32838/32838 [1:15:18<00:00,  7.27it/s]     


In [52]:
query = ['Retina image segmentation.']
query_vector = model.encode(query)
query_embedding = query_vector.tolist()[0]

In [53]:
# define the OpenSearch search body
body = {
    # **Query:** match all documents and score them based on a custom script
    "query": {
        "script_score": {
            # match all documents
            "query": {
                "match_all": {}
            },
            # define a script to calculate the score
            "script": {
                # since cosine similarity ranges between -1 and 1 and
                # opensearch is not able to process negative cosine similarity score
                # therefore +1.0 is added
                "source": "cosineSimilarity(params.queryVector, doc['vector']) + 1.0",
                # pass the query vector as a parameter to the script
                "params": {
                    "queryVector": query_embedding
                }
            }
        }
    },
    # filter results with a minimum score of 1.45
    "min_score": 1.45
}

# set the maximum number of results to retrieve
size = 1000

# perform the search with a 120-second timeout
aux_results = database_connection.search(
    index='pubmed-index',
    body=body,
    size=size,
    request_timeout=120
)

In [54]:
# loop over each returned hit in the search results
for result in aux_results["hits"]["hits"]:
    # print a separator for each result
    print("-" * 10)
    # print the score of the document
    print(f"Score: {result['_score']}")
    # print the title of the document stored in the "_source" field
    print(f"Title: {result['_source']['title']}")

----------
Score: 1.6836026
Title: Aiding the Diagnosis of Diabetic and Hypertensive Retinopathy Using Artificial Intelligence Based Semantic Segmentation
----------
Score: 1.58722
Title: Recent advances in imaging technologies for assessment of retinal diseases
----------
Score: 1.5870645
Title: Validation of automated artificial intelligence segmentation of optical coherence tomography images
----------
Score: 1.5577049
Title: Artificial Intelligence in Ophthalmology A Meta Analysis of Deep Learning Models for Retinal Vessels Segmentation
----------
Score: 1.5576472
Title: EyeHealer A large scale anterior eye segment dataset with eye structure and lesion annotations
----------
Score: 1.5521336
Title: Development of a Fundus Image Based Deep Learning Diagnostic Tool for Various Retinal Diseases
----------
Score: 1.5515354
Title: Application of Deep Learning to Retinal Image Based Oculomics for Evaluation of Systemic Health A Review
----------
Score: 1.549217
Title: Fundamentals of art