In [1]:
! pip install -q --upgrade gradio langchain langchain_community openai huggingface_hub datasets langchain_huggingface langchain_openai chromadb faiss-cpu
! pip install -q --upgrade datasets fsspec

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.2/54.2 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.3/323.3 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m725.5/725.5 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.1/512.1 kB[0m [31m25.9 MB/s[0m eta [36m0:

In [2]:
import os
import openai
import gradio as gr

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma, FAISS
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

from datasets import load_dataset
from huggingface_hub import login
from google.colab import userdata

In [3]:
# Credentials: both secrets are read from the Colab secret store, so nothing
# sensitive is written into the notebook itself.
openai_api_key = userdata.get('OPENAI_API_KEY')
hf_token = userdata.get('HF_TOKEN')

# Authenticate this session against the Hugging Face Hub.
login(token=hf_token)

In [4]:
# Notebook-wide configuration values.

# Chat model name handed to ChatOpenAI.
MODEL = "gpt-3.5-turbo"

# Folder the FAISS index is saved to.
DB_NAME = 'vector_db'

In [5]:
# Fetch the "text-corpus" configuration of the rag-mini-bioasq dataset.
ds = load_dataset(
    path="enelpol/rag-mini-bioasq",
    name="text-corpus",
)

README.md:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/35.3M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/40181 [00:00<?, ? examples/s]

In [6]:
def _to_document(entry):
    """Wrap one corpus row in a LangChain Document.

    The row's 'passage' becomes the page content; its 'id' (or "N/A" when the
    key is absent) is stored as metadata.
    """
    return Document(
        page_content=entry['passage'],
        metadata={"id": entry.get("id", "N/A")},
    )


# One Document per row of the 'test' split.
documents = list(map(_to_document, ds['test']))

In [7]:
# Break the documents into overlapping chunks (chunk_size=1024,
# chunk_overlap=128) so each piece is small enough to embed and retrieve.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)
chunks = text_splitter.split_documents(documents)

In [8]:
# Sanity check: number of chunks produced by the splitter.
len(chunks)

79850

In [9]:
# Embedding model used to vectorise the chunks for the FAISS index.
# NOTE(review): the e5 model card recommends "query: " / "passage: " prefixes
# on inputs; none are added here — confirm whether retrieval quality suffers.
embed_model = HuggingFaceEmbeddings(model_name="intfloat/e5-base")

  embed_model = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/356 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
# Building the index embeds every chunk (79,850 in the recorded run), which is
# expensive, so reuse the copy saved by an earlier run when one exists.
# Delete the DB_NAME folder to force a rebuild after changing the corpus,
# the splitter settings or the embedding model.
if os.path.isdir(DB_NAME):
    # load_local unpickles the stored docstore; that is acceptable here only
    # because the folder was written by this notebook's own save_local call.
    # Never point this at an index obtained from an untrusted source.
    db = FAISS.load_local(
        DB_NAME,
        embed_model,
        allow_dangerous_deserialization=True,
    )
else:
    db = FAISS.from_documents(chunks, embedding=embed_model)
    db.save_local(DB_NAME)

In [None]:
# Chat model; temperature=0 requests the least random sampling.
llm = ChatOpenAI(model_name=MODEL, temperature=0, api_key=openai_api_key)

# Conversation memory, exposed to the chain under the 'chat_history' key.
# The keyword is `return_messages` (plural). The previous spelling
# `return_message` was not a recognised field, so the flag never took effect
# and the history was handed to the chain as one plain string instead of a
# list of message objects.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Retriever over the FAISS index, using the vector store's default search settings.
retriever = db.as_retriever()

# Retrieval chain that combines the chat model, the retriever and the memory.
chain = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

In [None]:
# Smoke-test the chain with a single question.
# `invoke` replaces the deprecated direct call `chain({...})`.
# NOTE: the chain was built with `memory`, so this turn is recorded there and
# becomes part of the history any later conversation starts from.
query = "What is the difference between Eucaryotic and Procaryotic organisms"
response = chain.invoke({"question": query})
print(response)

In [None]:
def chat(query, history):
    """Answer one chat turn for the Gradio interface.

    gr.ChatInterface calls its function positionally as (message, history),
    so the user's message has to be the first parameter. With the previous
    (history, query) order the history list was sent to the chain as the
    question.

    Args:
        query: The text the user just submitted.
        history: The transcript supplied by Gradio. Unused here because the
            chain was built with its own memory object.

    Returns:
        The 'answer' entry of the chain's response.
    """
    # `invoke` replaces the deprecated direct call `chain({...})`.
    response = chain.invoke({"question": query})
    return response['answer']

In [None]:
# Build the chat UI around `chat` with the Soft theme, then start it.
view = gr.ChatInterface(
    fn=chat,
    type='messages',
    theme=gr.themes.Soft(),
)
view.launch(inbrowser=True)