In [57]:
# Colab bootstrap: attach Google Drive, then switch to the project folder so
# that relative paths (e.g. 'data/...') resolve from there.
import os

from google.colab import drive

drive.mount('/content/drive')

os.chdir("/content/drive/MyDrive/Colab Notebooks/Searched Ranking/")
os.getcwd()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/Colab Notebooks/Searched Ranking'

In [58]:
# !pip install pinecone-client sentence-transformers torch

In [59]:
# !pip install openai

In [77]:
import base64
import os
import sys
from getpass import getpass
from pprint import pprint

import numpy as np
import openai as op
import pandas as pd
import pinecone
import requests
import sentence_transformers as st
import torch
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from transformers import pipeline

# Print the version of each key dependency, one per line, in a fixed order.
for module in (pd, np, torch, pinecone, st, op):
    print(module.__version__)

1.5.3
1.23.5
2.1.0+cu121
2.2.4
2.2.2
1.3.9


In [61]:
# Load the curated paper table; the path is relative to the working directory.
DATA_CSV = 'data/data_curated.csv'
df = pd.read_csv(DATA_CSV)

# Preview the first rows.
df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract,index
0,0,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,0
1,1,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,1
2,2,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,2
3,3,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,3
4,4,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,4


In [62]:
# (rows, columns) of the loaded table
df.shape

(821007, 4)

In [63]:
# Build the text that gets embedded: title and abstract joined into a single
# string behind fixed lead-in phrases.
# Missing titles/abstracts are treated as empty strings; without this the
# string concatenation would turn the whole `merged` value into NaN.
df["merged"] = (
    "Title of the paper: " + df["title"].fillna("")
    + " Abstract for the paper: " + df["abstract"].fillna("")
)

df.head()

Unnamed: 0.1,Unnamed: 0,title,abstract,index,merged
0,0,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,0,Title of the paper: Clinical features of cultu...
1,1,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,1,Title of the paper: Nitric oxide: a pro-inflam...
2,2,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,2,Title of the paper: Surfactant protein-D and p...
3,3,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,3,Title of the paper: Role of endothelin-1 in lu...
4,4,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,4,Title of the paper: Gene expression in epithel...


In [64]:
# Connect to Pinecone.
# The API key is taken from the PINECONE_API_KEY environment variable, with an
# interactive prompt as fallback, so the secret is never stored in the notebook.
# NOTE(review): a key used to be written here in plain text; it should be
# revoked and replaced.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pinecone.init(api_key=pinecone_api_key, environment="gcp-starter")

In [78]:
# OpenAI credentials.
# Read from the OPENAI_API_KEY environment variable, with an interactive prompt
# as fallback, so the secret is never stored in the notebook.
# NOTE(review): a key used to be written here in plain text; it should be
# revoked and replaced.
openai_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [66]:
# Name of the Pinecone index that stores one vector per article
# (set up with the siddp274 email).
index_name = "eqa"  # short for extractive question answering

# Create the index only if it is not there yet: 384-dimensional vectors
# compared with the cosine metric.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(index_name, dimension=384, metric="cosine")

# Handle to the index, used by the upsert and query cells below.
index = pinecone.Index(index_name)

In [67]:
# We will use a SentenceTransformer model named multi-qa-MiniLM-L6-cos-v1, designed for semantic search and trained on 215M (question, answer)
# pairs from diverse sources, as our retriever.

In [68]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Retriever: the multi-qa-MiniLM-L6-cos-v1 sentence-transformer from the
# Hugging Face model hub (384-dimensional dense vectors for semantic search).
retriever = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1', device=device)
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [69]:
# Find the first row whose merged text is larger than 40 960 bytes and print
# its index label (prints nothing when no row exceeds the limit).
# NOTE(review): sys.getsizeof reports the in-memory size of the str object, not
# the size of the text once serialised -- presumably this guards a metadata
# size limit; confirm, and consider measuring the encoded length instead.
MAX_TEXT_BYTES = 40960

# Walk only the inspected column rather than df.iterrows(), which builds a
# full Series object for every row of the table.
first_oversized = next(
    (
        row_label
        for row_label, text in df['merged'].items()
        if sys.getsizeof(text) > MAX_TEXT_BYTES
    ),
    None,
)
if first_oversized is not None:
    print(first_oversized)

2222


In [70]:
# Generate embeddings and upsert them into Pinecone.
#
# Every record sent to Pinecone is an (id, embedding, metadata) tuple: the id
# is the row position as a string, the embedding is produced by the retriever
# from the `merged` text, and the metadata is the complete row as a dict.

# Rows per encode/upsert round trip.
batch_size = 64

# Only a subset of the table is indexed, because embedding every row takes a
# very long time. The limit is clamped to the table length so a short table
# never produces empty batches.
n_docs = min(1000, len(df))

for i in tqdm(range(0, n_docs, batch_size)):
    # end of this batch -- never past the subset limit, so exactly n_docs rows
    # are indexed (the final batch may be shorter than batch_size)
    i_end = min(i + batch_size, n_docs)
    # extract batch
    batch = df.iloc[i:i_end]
    # generate embeddings for batch
    emb = retriever.encode(batch['merged'].tolist()).tolist()
    # metadata: one dict per row
    meta = batch.to_dict(orient='records')
    # unique IDs: the row positions, as strings
    ids = [str(idx) for idx in range(i, i_end)]
    # upsert/insert these records to pinecone
    index.upsert(vectors=list(zip(ids, emb, meta)))

# check that we have all vectors in index
index.describe_index_stats()

  0%|          | 0/16 [00:00<?, ?it/s]

{'dimension': 384,
 'index_fullness': 0.00576,
 'namespaces': {'': {'vector_count': 576}},
 'total_vector_count': 576}

In [79]:
# Placeholder for the reader model; the QA pipeline is assigned to it in the next code cell.
reader = None
# See the last code cell for using an OpenAI GPT-4 model as the reader instead.

In [72]:
# Reader: a question-answering pipeline built from the
# deepset/electra-base-squad2 checkpoint (same name for model and tokenizer).
model_name = 'deepset/electra-base-squad2'
reader = pipeline(
    task='question-answering',
    model=model_name,
    tokenizer=model_name,
    device=device,
)
reader

<transformers.pipelines.question_answering.QuestionAnsweringPipeline at 0x7e965fa82bc0>

In [73]:
# gets context passages from the pinecone index
def get_context(question, top_k):
    """Fetch the ``top_k`` best-matching passages for ``question``.

    The question is embedded with the module-level ``retriever`` and the
    resulting vector is used to query the module-level Pinecone ``index`` with
    metadata included.

    Returns a list of ``(merged_text, title, score)`` tuples, one per match,
    in the order the index returned them.
    """
    # encode() receives a one-element list, so the vector stays nested
    query_vector = retriever.encode([question]).tolist()
    response = index.query(query_vector, top_k=top_k, include_metadata=True)

    passages = []
    for match in response["matches"]:
        metadata = match["metadata"]
        passages.append((metadata['merged'], metadata['title'], match['score']))
    return passages

In [74]:
# extracts answer from the context passage
def extract_answer(question, context):
    """Run the reader over every retrieved passage and rank the answers.

    Parameters
    ----------
    question : str
        The question handed to the reader.
    context : iterable of tuple
        Retrieved passages; element 0 of each item is the passage text
        (items look like ``(merged_text, title, score)``).

    Returns
    -------
    list of dict
        One reader answer per passage, each with the passage text added under
        ``"context"``, sorted by the reader's ``"score"`` from highest to
        lowest. The same list is pretty-printed as a side effect.
    """
    results = []
    for passage in context:
        # feed the reader the question and one passage to extract an answer
        answer = reader(question=question, context=passage[0])
        # keep the passage next to its answer so both print together
        answer["context"] = passage[0]
        results.append(answer)
    # pprint() returns None, so sort first, print, then return the list itself
    sorted_result = sorted(results, key=lambda ans: ans['score'], reverse=True)
    pprint(sorted_result)
    return sorted_result

In [91]:
# Example query: retrieve the three closest passages from the index and show them.
question = "Are there any papers on Clinical features of culture-proven Mycoplasma?"
context = get_context(question, top_k = 3)
context

[('Title of the paper: Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia Abstract for the paper: OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolat

In [76]:
# Get the exact answer for each retrieved passage.
# The helper pretty-prints its results itself; the trailing ';' keeps the cell
# from additionally echoing a return value.
extract_answer(question, context);

[{'answer': 'Clinical features of culture-proven Mycoplasma pneumoniae',
  'context': 'Title of the paper: Clinical features of culture-proven '
             'Mycoplasma pneumoniae infections at King Abdulaziz University '
             'Hospital, Jeddah, Saudi Arabia Abstract for the paper: '
             'OBJECTIVE: This retrospective chart review describes the '
             'epidemiology and clinical features of 40 patients with '
             'culture-proven Mycoplasma pneumoniae infections at King '
             'Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: '
             'Patients with positive M. pneumoniae cultures from respiratory '
             'specimens from January 1997 through December 1998 were '
             'identified through the Microbiology records. Charts of patients '
             'were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) '
             'of whom required admission. Most infections (92.5%) were '
             'community-acqui

In [93]:
# All retrieved passage texts concatenated into one space-separated string.
" ".join(passage[0] for passage in context)

"Title of the paper: Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia Abstract for the paper: OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates

In [95]:
# Ask an OpenAI chat model to score every retrieved paper against the question.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai_key}"
}

# A single user turn: the instruction first, then all passages joined by spaces.
history = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": f"You will be provided {len(context)} research paper content, can you look into them and provide a score for each paper in regards to the question - {question}"
            },
            {
                "type": "text",
                "text": " ".join([x[0] for x in context])
            }]
      }
    ]

# NOTE(review): a vision-preview model is used for a text-only prompt --
# confirm this model is intended and still available.
payload = {
    "model": "gpt-4-vision-preview",
    "messages": history,
    "max_tokens": 300
}

# Without a timeout, requests waits indefinitely if the API never answers.
response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers=headers,
    json=payload,
    timeout=60,
)
response.raise_for_status()  # Raise an HTTPError for bad responses

# The reply text is the content of the first choice's message.
output_message = response.json()['choices'][0]['message']['content']
print(output_message)

Based on the provided abstracts, the scores for each paper in regards to the question "Are there any papers on Clinical features of culture-proven Mycoplasma?" are as follows:

1. Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia: 10/10 (The paper directly addresses the clinical features of culture-proven Mycoplasma pneumoniae infections)

2. Understanding the clinical spectrum of complicated Plasmodium vivax malaria: a systematic review on the contributions of the Brazilian literature: 0/10 (The paper focuses on Plasmodium vivax malaria and does not mention culture-proven Mycoplasma)

3. Severe Childhood Malaria Syndromes Defined by Plasma Proteome Profiles: 0/10 (The paper focuses on severe childhood malaria syndromes and does not mention culture-proven Mycoplasma)
