In [2]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [3]:
# Load environment variables from a local .env file (if any) and read the OpenAI API key.
# NOTE(review): OPENAI_API_KEY is not passed explicitly to any call visible in this notebook;
# presumably the OpenAI clients pick it up from the environment — confirm.
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Report a missing or empty key here, with a clear message, rather than in a later cell.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

In [4]:
# Build the LLM that is later handed to the QA chain.
# The sampling temperature and the token cap are kept together in one mapping.
llm_settings = {"temperature": 0.9, "max_tokens": 500}
llm = OpenAI(**llm_settings)

  llm = OpenAI(temperature=0.9, max_tokens=500)


In [5]:
# Make sure the NLTK 'punkt' package is available locally.
# The download call's return value is the cell's displayed result.
import nltk
punkt_package = 'punkt'
nltk.download(punkt_package)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tiyas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Make sure the NLTK 'averaged_perceptron_tagger' package is available locally.
# The download call's return value is the cell's displayed result.
tagger_package = 'averaged_perceptron_tagger'
nltk.download(tagger_package)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tiyas\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# Web pages that form the knowledge base for the QA chain.
source_urls = [
    "https://www.bcrf.org/about-breast-cancer/invasive-ductal-carcinoma/",
    "https://www.bcrf.org/about-breast-cancer/invasive-lobular-carcinoma/",
    "https://pmc.ncbi.nlm.nih.gov/articles/PMC8204849/",
    "https://breastcancernow.org/about-breast-cancer/diagnosis/types-of-breast-cancer/papillary-breast-cancer/",
    "https://www.cancer.org/cancer/types/breast-cancer/about/types-of-breast-cancer/paget-disease-of-the-nipple.html",
]

# Fetch the pages and turn them into documents.
loaders = UnstructuredURLLoader(urls=source_urls)
data = loaders.load()

# Show how many documents were loaded.
len(data)

5

In [8]:
# Chunking parameters handed to the splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# `data` already holds documents, so split_documents is used instead of split_text
# to obtain the chunks directly.
docs = text_splitter.split_documents(data)

In [9]:
# Show how many chunks the splitter produced.
len(docs)

20

In [10]:
# Inspect a single chunk (index 15) to see its metadata and page_content.
docs[15]

Document(metadata={'source': 'https://www.cancer.org/cancer/types/breast-cancer/about/types-of-breast-cancer/paget-disease-of-the-nipple.html'}, page_content='Skip to main content\n\nDownload Section as PDF\n\nPaget Disease of the Breast\n\nPaget disease of the breast is a rare type of breast cancer involving the skin of the nipple and the areola (the dark circle around the nipple).\n\nOn this page\n\n[show] [hide]\n\nSigns and symptoms of Paget disease of the breast\n\nHow is Paget disease of the breast diagnosed?\n\nTreating Paget disease of the breast\n\nSigns and symptoms of Paget disease of the breast\n\nThe skin of the nipple and areola often looks crusted, scaly, and red. There may be blood or yellow fluid coming out of the nipple. Sometimes the nipple looks flat or inverted. It also might burn or itch. Your doctor might try to treat this as eczema first, and if it does not improve, recommend a biopsy.\n\nPaget disease usually affects only one breast. In 80-90% of cases, it’s us

In [11]:
# Embedding model used to vectorise the chunks.
embeddings = OpenAIEmbeddings()

# Build the FAISS vector index from the chunked documents and the embedding model.
vectorindex_openai = FAISS.from_documents(documents=docs, embedding=embeddings)

  embeddings = OpenAIEmbeddings()


In [12]:
# Persist the freshly built vector index to a local pickle file.
file_path="vector_index.pkl"
with open(file_path, "wb") as index_file:
    # Serialise inside the `with` so the file is opened before pickling starts.
    index_file.write(pickle.dumps(vectorindex_openai))

In [13]:
# Reload the vector index previously pickled to `file_path`.
# NOTE(review): pickle.load can execute arbitrary code during deserialisation;
# only load a file that this notebook wrote itself.
if not os.path.exists(file_path):
    # Without this guard `vectorIndex` would silently stay undefined (or keep a stale value
    # from earlier kernel state) and the problem would only show up in a later cell.
    raise FileNotFoundError(f"Vector index file not found: {file_path}")

with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [14]:
# Expose the reloaded vector index as a retriever, then wire it to the LLM
# in a question-answering chain that also reports its sources.
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)

# Display the constructed chain.
chain



In [15]:
# Question to ask against the indexed pages.
query = "what do you mean by paget disease?"

# Turn on LangChain's global debug flag so the chain's intermediate steps are printed.
langchain.debug=True

# Run the chain via invoke() instead of calling the chain object directly;
# the direct-call form is deprecated in LangChain. The inputs and the
# return_only_outputs flag are unchanged, so the result keeps the same
# 'answer' / 'sources' keys.
chain.invoke({"question": query}, return_only_outputs=True)

  chain({"question": query}, return_only_outputs=True)


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what do you mean by paget disease?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Skip to main content\n\nDownload Section as PDF\n\nPaget Disease of the Breast\n\nPaget disease of the breast is a rare type of breast cancer involving the skin of the nipple and the areola (the dark circle around the nipple).\n\nOn this page\n\n[show] [hide]\n\nSigns and symptoms of Paget disease of the breast\n\nHow is Paget disease of the breast diagnosed?\n\nTreating Paget disease of the breast\n\nSigns and symptoms of Paget disease of the breast\n\nThe skin of the nipple and areola o

{'answer': ' Paget disease of the breast is a rare type of breast cancer involving the skin of the nipple and the areola. It is diagnosed by a biopsy and can be treated with surgery, radiation therapy, and/or chemotherapy. \n',
 'sources': 'https://www.cancer.org/cancer/types/breast-cancer/about/types-of-breast-cancer/paget-disease-of-the-nipple.html'}