In [12]:
! pip install -q gradio langchain langchain_community openai huggingface_hub datasets langchain_openai chromadb faiss-cpu
! pip install -q --upgrade datasets fsspec

In [2]:
import os

import gradio as gr
import openai
import torch

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma, FAISS
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

from datasets import load_dataset
from huggingface_hub import login
from google.colab import userdata

In [4]:
# Read the OpenAI API key from the Colab secret store rather than hard-coding it.
# NOTE(review): the secret is named 'OPEN_API_KEY' (not 'OPENAI_API_KEY') — confirm
# this matches the secret configured for the notebook.
openai_api_key = userdata.get('OPEN_API_KEY')

In [27]:
# Name of the OpenAI chat model handed to ChatOpenAI when the chain is built.
MODEL: str = "gpt-3.5-turbo"

In [5]:
# Authenticate with the Hugging Face Hub using the 'HF_TOKEN' Colab secret.
login(token=userdata.get('HF_TOKEN'))

In [10]:
# Folder name (relative to the working directory) where the FAISS index is saved.
db_name: str = "vector_db"

In [6]:
# Fetch the training split of the PubMed summarization dataset from the Hub.
ds = load_dataset(
    path="ccdv/pubmed-summarization",
    split="train",
)

README.md:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

train-00000-of-00005.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/208M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/207M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/211M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/59.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/58.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

In [13]:
# Keep only the first 10,000 rows of the training split for indexing.
subset = ds.select(range(10_000))

In [14]:
def _abstract_to_document(entry):
    """Wrap one dataset row's abstract in a Document.

    The row's 'pmid' is stored as metadata; rows without that key get "N/A".
    """
    return Document(
        page_content=entry["abstract"],
        metadata={"pmid": entry.get("pmid", "N/A")},
    )


# One Document per row of the selected subset, in dataset order.
documents = list(map(_abstract_to_document, subset))

In [24]:
# Split every abstract into overlapping pieces so each embedded chunk stays small:
# chunks are at most 512 units long and share 64 units with their neighbour
# (units are whatever the splitter's length function counts — characters by default).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=64,
    chunk_size=512,
)
chunks = text_splitter.split_documents(documents)

In [26]:
# Embed every chunk with the e5-base encoder and index the vectors in FAISS.
# Run on the GPU when one is attached; otherwise fall back to the CPU so the cell
# still works (more slowly) on a CPU-only runtime instead of failing on "cuda".
# NOTE(review): the e5 model card recommends prefixing inputs with "query: " /
# "passage: "; no prefix is added here — confirm retrieval quality is acceptable.
embed_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"batch_size": 64},
)
db = FAISS.from_documents(chunks, embedding=embed_model)
# Persist the index to the folder named by db_name so it can be reused/uploaded.
db.save_local(db_name)

  embed_model = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [28]:
# Conversation buffer: stores the running dialogue under the "chat_history" key
# as message objects, which the conversational chain reads on every turn.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

# Retriever over the FAISS index, using its default search settings.
retriever = db.as_retriever()

# Chat model that condenses follow-up questions and writes the final answer.
llm = ChatOpenAI(
    api_key=openai_api_key,
    model=MODEL,
    temperature=0.5,
)

# Wire the model, retriever and memory into one conversational RAG chain.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [None]:
# Smoke-test the chain with a single question before exposing it through the UI.
# `invoke` replaces the direct call `qa({...})`, which LangChain has deprecated.
# Note: `qa` was built with a memory object, so this exchange is recorded in its
# chat history and will be visible to later questions asked in the same kernel.
query = "What are the effects of school nutrition programs on child health?"
response = qa.invoke({"question": query})
print(response)

  response = qa({"question": query})


{'question': 'What are the effects of school nutrition programs on child health?', 'chat_history': [HumanMessage(content='What are the effects of school nutrition programs on child health?', additional_kwargs={}, response_metadata={}), AIMessage(content="The effects of school nutrition programs on child health can include improvements in nutritional status, such as a reduction in the prevalence of underweight among school-aged children, particularly among girls. These programs can lead to significant increases in knowledge about nutrition and healthy eating among students. Additionally, providing nutritious snacks in schools can positively impact growth monitoring indices. However, the effectiveness may vary by gender, as some studies indicate no significant changes among boys or the total population. Overall, school nutrition programs have the potential to improve children's health and nutritional outcomes.", additional_kwargs={}, response_metadata={})], 'answer': "The effects of scho

In [None]:
def chat(query, history):
    """Answer one chat turn for ``gr.ChatInterface``.

    Gradio calls the handler positionally as ``fn(message, history)``, so the
    user's message must be the first parameter; with the parameters the other
    way round the history list was sent to the chain as the question.

    Args:
        query: The text the user just submitted.
        history: Prior turns supplied by Gradio. Unused here because the chain
            keeps its own conversation memory.

    Returns:
        The value stored under ``'answer'`` in the chain's response.
    """
    # `invoke` is the supported entry point; calling the chain directly is deprecated.
    response = qa.invoke({"question": query})
    return response['answer']

In [None]:
# Chat UI around `chat`; history is exchanged in the 'messages' format and the
# Soft theme is applied.
view = gr.ChatInterface(
    fn=chat,
    type="messages",
    theme=gr.themes.Soft(),
)

In [None]:
# Start the Gradio app and ask it to open in a browser tab. On a hosted notebook
# Gradio enables sharing on its own and prints a temporary public URL.
view.launch(inbrowser=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://64ea681fcb8c74617a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [29]:
import shutil

# Folder to compress, and the archive's base name; make_archive appends ".zip"
# itself, so the result is /content/vector_db.zip.
input_dir = '/content/Pubmed/vector_db'
output_zip = '/content/vector_db'

# NOTE(review): the index is saved earlier via db.save_local(db_name), i.e. to a
# relative 'vector_db' folder — confirm it actually lives under /content/Pubmed
# before this cell runs on a fresh kernel.
# Zip the contents of input_dir; the archive path returned is the cell's output.
shutil.make_archive(
    base_name=output_zip,
    format='zip',
    root_dir=input_dir,
)


'/content/vector_db.zip'

In [32]:
from huggingface_hub import HfApi

# Push the whole local project folder to the root of the Hugging Face Space.
# The upload's commit info is the cell's output.
api = HfApi()
api.upload_folder(
    repo_id="SHAH-MEER/pubmed_rag",   # destination Space
    repo_type="space",                # the repo is a Space, not a model/dataset
    folder_path="/content/Pubmed/",   # local folder in Colab
    path_in_repo="",                  # upload to the repo root
)

CommitInfo(commit_url='https://huggingface.co/spaces/SHAH-MEER/pubmed_rag/commit/15a536a062e39cfe2528323e6364e0b8d1b92d87', commit_message='Upload folder using huggingface_hub', commit_description='', oid='15a536a062e39cfe2528323e6364e0b8d1b92d87', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/SHAH-MEER/pubmed_rag', endpoint='https://huggingface.co', repo_type='space', repo_id='SHAH-MEER/pubmed_rag'), pr_revision=None, pr_num=None)