In [2]:
import os
# Move the working directory one level up so relative paths such as 'Data/'
# resolve from there. The target is computed only the first time this cell runs:
# a bare os.chdir("../") would climb one directory higher on every re-run.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.pardir)
os.chdir(PROJECT_ROOT)

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sentence_transformers

In [5]:
def load_pdf_file(data):
    """Load the PDF files matched by ``*.pdf`` inside a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces when each matching file
        is read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Read the PDFs from the Data/ folder (relative to the current working directory).
extracted_data = load_pdf_file('Data/')

In [8]:
#extracted_data

In [17]:
#Split the data into chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of ``load_pdf_file``).
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Value forwarded as ``chunk_overlap``. Defaults to 20,
            the value previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [18]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
chunk_count = len(text_chunks)
print('no of text chunks', chunk_count)

no of text chunks 210


In [29]:
from langchain.embeddings import HuggingFaceEmbeddings

In [30]:
def downlaod_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used to vectorise text.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, the model
            that was previously hard-coded.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Correctly spelled alias; the original (misspelled) name is kept so existing
# calls continue to work.
download_hugging_face_embeddings = downlaod_hugging_face_embeddings

In [31]:
# Build the embedding model object that is later passed to the vector store.
embeddings = downlaod_hugging_face_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [33]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the API keys
# can be read with os.environ below. The call's return value is the cell output.
load_dotenv()

True

In [48]:
# Read the service credentials from the environment; each is None when unset.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [37]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "fachatbot"

# create_index raises when an index with this name already exists, so only
# create it when it is missing. This keeps the cell safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the vector size of the embedding model; all-MiniLM-L6-v2
        # produces 384-dimensional vectors.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [49]:
# Write the keys back into the process environment (presumably so the Pinecone
# and OpenAI clients created without an explicit key can pick them up).
# os.environ only accepts strings: a key that was not found is None and would
# otherwise fail with an opaque "str expected, not NoneType" TypeError, so report
# the missing variable by name instead.
for _key_name, _key_value in (
    ('PINECONE_API_KEY', PINECONE_API_KEY),
    ('OPENAI_API_KEY', OPENAI_API_KEY),
):
    if _key_value is None:
        raise ValueError(
            f"{_key_name} is not set; define it in your .env file or environment."
        )
    os.environ[_key_name] = _key_value

In [43]:
#Embed each chunk and upsert the embeddings into your Pinecone index

from langchain_pinecone import PineconeVectorStore

# NOTE: each run of this cell sends all chunks to the index again.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [44]:
# Load existing index
from langchain_pinecone import PineconeVectorStore

# Connect to the already-populated index without uploading anything.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)


In [45]:
# Similarity-search retriever configured with k=3.
retrever = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [46]:
# Smoke-test retrieval with a sample question. The retriever returns its closest
# matches regardless of whether the indexed papers actually cover the topic, so
# inspect the results before trusting them.
retrieved_docs = retrever.invoke('what is Acne')

In [47]:
# Display the retrieved documents (id, metadata and page content) as cell output.
retrieved_docs

[Document(id='af74e742-5668-4f01-bc39-04b3cfdccda2', metadata={'page': 12.0, 'source': 'Data\\WNL-2022-201428.pdf'}, page_content='dren and adults with FA. A fundamental understanding of the\nwidespread occurrence and importance of the symptoms that\nthose with FA face in their daily lives is relevant to those who\nintend to provide clinical care for this population. Knowledge\nof these symptoms is also relevant for those in the process of\ndeveloping novel therapeutics for FA and for those who wish\nto research and study therapies to reduce the symptomatic\nburden of this disease.\nStudy Funding'),
 Document(id='f06c7a6f-9100-493f-8ef9-09433f5bfcd4', metadata={'page': 1.0, 'source': 'Data\\WNL-2022-201428.pdf'}, page_content='Friedreich ataxia (FA) is an autosomal recessive neurode-\ngenerative disease that presents with a variety of clinical\nsymptoms, including a loss of coordination (ataxia) in the\narms and legs, fatigue, muscle loss, vision impairment, hearing\nloss, slurred spee

In [50]:
from langchain_openai import OpenAI

In [51]:
# OpenAI LLM wrapper used to generate answers, configured with temperature 0.4
# and a max_tokens limit of 500.
llm = OpenAI(temperature=0.4, max_tokens=500)

In [70]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the question-answering chain. "{context}" is a template
# placeholder (not literal text) and must stay exactly as written.
system_prompt = (
    "You are an assistant for question-answering tasks based on multiple research papers. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, are unsure, or if even a single word "
    "that follows 'what is' in the human message is not included in the retrieved context, "
    "say that you don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [71]:
# Two-message chat template: the system instructions, then the user's question.
chat_messages = [('system', system_prompt), ('human', '{input}')]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [72]:
# Chain that combines the LLM with the prompt template to answer from documents.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Retrieval chain: pairs the retriever with the question-answering chain above.
rag_chain = create_retrieval_chain(retrever, question_answer_chain)

In [76]:
# Ask a sample question and print only the generated answer text.
user_query = {'input': 'what is GAA?'}
response = rag_chain.invoke(user_query)
print(response['answer'])


GAA is a genetic mutation that is strongly linked to the onset of symptoms in FA, a condition that affects individuals in a variety of ways and at different rates. The smaller GAA expansion has been found to have the strongest association with disease severity, making it a potential target for future treatments. It is also being studied for its potential impact on other common symptoms of FA, such as muscle stiffness.
