In [1]:
%pip install llama-index-embeddings-openai
%pip install llama-index-vector-stores-pinecone
%pip install llama-index-llms-openai


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install llama-index



### Environment variables


In [3]:
!pip -q install python-dotenv pinecone-client llama-index pymupdf

### Setting up pinecone vector database

In [4]:
from dotenv import load_dotenv
import os

load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

In [5]:
from pinecone import Pinecone, Index, ServerlessSpec

pc = Pinecone(api_key = PINECONE_API_KEY)

  from tqdm.autonotebook import tqdm


In [6]:
index_name = "harrison-medical-data"

In [7]:
# dimensions are for text-embedding-ada-002
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=1536,
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

### Load data

In [8]:
import fitz 

file_path = "../data/medical_textbook.pdf"
doc = fitz.open(file_path)

### Split documents

In [9]:
from llama_index.core.node_parser import SentenceSplitter

text_parser = SentenceSplitter(
    chunk_size=1024,
    # separator=" ",
)

In [10]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_parser.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

### Construct Nodes from text chunks

In [11]:
from llama_index.core.schema import TextNode

nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc_idx = doc_idxs[idx]
    src_page = doc[src_doc_idx]
    nodes.append(node)

In [12]:
print(nodes[13].metadata)

{}


In [13]:
# print a sample node
print(nodes[0].get_content(metadata_mode="all"))




### Extracting metadata from each node

In [14]:

from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-3.5-turbo")

extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]
pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:01<00:00,  3.52it/s]
100%|██████████| 1359/1359 [08:08<00:00,  2.78it/s]


### Embedding 

In [23]:
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model = OpenAIEmbedding()

for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

{'document_title': '"Exploring Diversity and Uniqueness in Medicine: Perspectives from Prominent Medical Professionals and Researchers"', 'questions_this_excerpt_can_answer': '1. What are the main symptoms and complications associated with different levels of radiation exposure, ranging from 0.7 to >20 Gy?\n2. How is Acute Radiation Sickness (ARS) typically treated, and what are the key management strategies recommended for individuals contaminated with radiation?\n3. What are the specific tests and monitoring protocols recommended for individuals exposed to different levels of radiation, and how do these tests help in assessing the severity of radiation sickness?'}


In [21]:
print(nodes[1233].metadata)


{'document_title': '"Exploring Diversity and Uniqueness in Medicine: Perspectives from Prominent Medical Professionals and Researchers"', 'questions_this_excerpt_can_answer': '1. What are the various uses of calcium therapy in medical treatment, including for conditions such as hyperkalemia, hypocalcemia, and osteoporosis?\n2. How is Campylobacteriosis related to proctocolitis, and what are the symptoms and treatment options for this condition?\n3. How is the Calvert formula used in medical practice, and what specific calculations or measurements does it involve?'}


In [16]:
from llama_index.vector_stores.pinecone import PineconeVectorStore

### Storing the nodes into vector store

In [34]:
pinecone_index = pc.Index(index_name)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# if you want a namespace(useful for summarizing data/kinda):
# vs = PineconeVectorStore(pinecone_index=pinecone_index, namespace = "name")

vector_store.add(nodes)



Upserted vectors: 100%|██████████| 1359/1359 [00:42<00:00, 31.94it/s]


['5b527b77-e5ff-422f-bd5a-bd034fc79a16',
 'e47ce8be-ef12-4acf-92e5-cd1ccf69e451',
 'e05d9b06-3f1b-4eed-a273-aabc48a22fa6',
 '94dd3dff-1898-4089-b457-49a975d4a7a7',
 '7cf9dc0e-2faa-49fd-b05c-7d6e2f99c5d3',
 'fbec9048-ef2e-4541-afc6-9e731bb3cd01',
 '72428e12-fbb8-455f-8fc6-eb1d6a483908',
 '057e18a6-eb77-4c9d-a7e1-e2d96b22aa5e',
 'e83f124d-7726-4477-bcff-7e0403f09d32',
 '89a89421-b70c-48ab-8178-9a85811a9cdb',
 '44c7f306-2cf7-4001-9571-428f68a7c83b',
 'bfed9d45-4137-4e08-a4c2-59eb491e1264',
 'aa6820be-8ab3-463c-b8eb-2b267dc89d62',
 'a6593a7d-5fb2-49b0-9d0c-b76f9f8609cf',
 '627e20f0-2cf6-4181-a582-00dc02f96485',
 '5d1ea718-e3e4-4e95-9717-691392a75a02',
 'c91c70b2-59a4-460c-808c-db65131e2c38',
 '6eb044e7-494b-43f1-8503-908d1bcff31f',
 'd0cb2529-bde9-48c0-95e6-e4af341d4160',
 '30cf95dd-c582-44f2-83f6-65decaf2be9b',
 '59efee39-aa82-4103-a05b-2fe4de40d24c',
 '8ed124fd-7ead-4955-b55e-add3ed228f2f',
 '30ed9be3-433f-45f2-8e00-e751d6d66645',
 '85e6abfa-4707-42af-a2d2-8307b5284083',
 'deb528dc-5faa-

In [35]:
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext

index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine()

query_str = "In 2 sentences, tell me what I should do if i consume a mushroom and i am feeling a burning sensation"



In [36]:
response = query_engine.query(query_str)
print(str(response))


Seek immediate medical attention if you consume a mushroom and experience a burning sensation, as it could indicate poisoning or an adverse reaction. Refrain from consuming any more of the mushroom and try to preserve a sample of it for identification by medical professionals.
