In [1]:
!pip install -r requirements.txt



In [2]:
import os
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
chat = ChatOpenAI(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model='gpt-3.5-turbo'
)

In [5]:
!pip install datasets



In [6]:
import pandas as pd
dataset = pd.read_csv("medquad.csv")
dataset.head()

Unnamed: 0,qtype,Question,Answer
0,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,symptoms,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,exams and tests,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,treatment,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."


In [7]:
import pinecone

pinecone.init(
    api_key = os.environ["PINECONE_API_KEY_MAIN"],
    environment= os.environ["PINECONE_ENV"]
)

  from tqdm.autonotebook import tqdm


In [8]:
# import time

# index_name = 'med277-medquad'

# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(
#         index_name,
#         dimension=1536,
#         metric='cosine'
#     )
#     # wait for index to finish initialization
#     while not pinecone.describe_index(index_name).status['ready']:
#         time.sleep(1)

# index = pinecone.Index(index_name)

In [9]:
# from tqdm.auto import tqdm  # for progress bar

# batch_size = 100
# data = df
# for i in tqdm(range(0, len(data), batch_size)):
#     i_end = min(len(data), i+batch_size)
#     # get batch of data
#     batch = data.iloc[i:i_end]
#     # generate unique ids for each chunk
#     ids = [str(i) for i, _ in batch.iterrows()]
#     # print(ids)
#    # get text to embed
#     question = [x['Question'] for _, x in batch.iterrows()]
#     # answer = [x['Answer'] for _, x in batch.iterrows()]
#     # qtype = [x['qtype'] for _, x in batch.iterrows()]
#     # embed text
#     embeds = embed_model.embed_documents(question)
#     # get metadata to store in Pinecone
#     metadata = [
#         {'question': x['Question'],
#          'qtype': x['qtype'],
#          'answer': x['Answer']} for i, x in batch.iterrows()
#     ]
#     # add to Pinecone
#     index.upsert(vectors=zip(ids, embeds, metadata))

In [10]:
#I have already created a vector database
index_name = 'med277-medquad'
index = pinecone.Index(index_name)

In [11]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.16407,
 'namespaces': {'': {'vector_count': 16407}},
 'total_vector_count': 16407}

In [12]:
from langchain.embeddings.openai import OpenAIEmbeddings
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [13]:
from langchain.vectorstores import Pinecone

text_field = "question"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)



In [14]:
query = "Who is at risk for Lymphocytic Choriomeningitis (LCM)?"
vectorstore.similarity_search(query, k=3)

[Document(page_content='Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?', metadata={'answer': 'LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents.  Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.', 'qtype': 'susceptibility'}),
 Document(page_content='Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?', metadata={'answer': 'Individuals of all ages who come into contact with urine, feces, saliva, or blood of wild mice are potentially at risk for infection. Owners of pet mice or hamsters may be at risk for infection if these animals originate from colonies that were contaminated with LCMV, or if their animals are 

In [15]:
def augment_prompt(query: str):
    # get top 3 results from knowledge base
    results = vectorstore.similarity_search(query, k=3)
    # get the text from the results
    source_knowledge = "\n".join([x.page_content for x in results])
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    return augmented_prompt

In [16]:
!pip install -q gradio

In [17]:
import gradio as gr
import openai
from langchain.llms import OpenAI

In [18]:
def unified_process(input_data):
    my_key = os.environ["OPENAI_API_KEY"]
    openai.api_key = my_key
    llm = OpenAI(temperature=1, openai_api_key=my_key)
    augmented_prompt = augment_prompt(input_data)
    return llm(augmented_prompt)

In [19]:
demo = gr.Interface(
    fn=unified_process,
    inputs = [gr.Textbox(placeholder="Paste text here...")],
    outputs="text")

In [20]:
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [21]:
dataset['Answer'][2]

'Individuals of all ages who come into contact with urine, feces, saliva, or blood of wild mice are potentially at risk for infection. Owners of pet mice or hamsters may be at risk for infection if these animals originate from colonies that were contaminated with LCMV, or if their animals are infected from other wild mice. Human fetuses are at risk of acquiring infection vertically from an infected mother. \n                \nLaboratory workers who work with the virus or handle infected animals are also at risk. However, this risk can be minimized by utilizing animals from sources that regularly test for the virus, wearing proper protective laboratory gear, and following appropriate safety precautions.'