In [2]:
import os

# Work from the project root so relative paths such as 'Data/' resolve.
# Guarded so the cell is idempotent: an unconditional chdir("../") climbs one
# more directory level every time the cell is re-run.
if not os.path.isdir("Data"):
    os.chdir("../")

In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sentence_transformers

  from tqdm.autonotebook import tqdm, trange


In [4]:
def load_pdf_file(data):
    """Load the files matching ``*.pdf`` in the directory ``data``.

    Args:
        data: Directory path, handed unchanged to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [5]:
# Load the PDFs from the Data/ folder (relative to the current working directory).
extracted_data = load_pdf_file('Data/')

In [7]:
# extracted_data  # uncomment to inspect the loaded documents (very large output)

In [8]:
#Split the data into chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of ``load_pdf_file``).
        chunk_size: Maximum size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data=extracted_data)
print('no of text chunks', len(text_chunks))

no of text chunks 11435


In [9]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings

In [10]:
def downlaod_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used for indexing and querying.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to 'sentence-transformers/all-MiniLM-L6-v2'. The Pinecone
            index in this notebook is created with dimension=384, so a
            different model must produce vectors of that size (or the index
            dimension must be changed to match).

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Correctly spelled alias; the original (misspelled) name stays so existing callers keep working.
download_hugging_face_embeddings = downlaod_hugging_face_embeddings

In [None]:
# NOTE(security): never paste a literal API key into the notebook, even in a comment.
# Any key that has been committed must be treated as compromised and rotated;
# load it from the environment / .env file instead.
# NOTE(review): switching to this model presumably needs a Pinecone index whose
# dimension matches it (the index in this notebook uses 384) - confirm before enabling.
# OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# def downlaod_openai_embeddings():
#     embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key = OPENAI_API_KEY)
#     return embeddings


In [11]:
# Instantiate the embedding model once; the same object is passed to the
# Pinecone vector store both when uploading chunks and when loading the index.
embeddings = downlaod_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [29]:
# Load variables from a .env file into the process environment so the API keys
# can be read with os.environ instead of being hard-coded in the notebook.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Read the service credentials from the environment.
# os.environ.get returns None when a variable is missing, so either value may be None.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# NOTE(security): do not hard-code keys here, even commented out. Any key that was ever committed to this notebook must be revoked and replaced.

In [34]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "fachatbot"

# Only create the index when it does not exist yet. An unconditional
# create_index call fails on every run after the first, which breaks
# "Restart Kernel -> Run All".
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384, # must equal the vector size produced by the embedding model
        metric="cosine", # similarity metric used by the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [4]:
# Export the keys through the environment; presumably this is how langchain_pinecone
# finds the Pinecone key, since no key is passed to PineconeVectorStore explicitly.
# os.environ accepts only strings: assigning None raises an opaque TypeError,
# so fail with a message that names the missing variable instead.
for _name, _value in (
    ('PINECONE_API_KEY', PINECONE_API_KEY),
    ('OPENAI_API_KEY', OPENAI_API_KEY),
):
    if not _value:
        raise RuntimeError(
            f"{_name} is not set; add it to your .env file or environment before running this cell."
        )
    os.environ[_name] = _value

In [36]:
#Embed each chunk and upsert the embeddings into your Pinecone index
# NOTE(review): every run of this cell embeds and uploads all chunks again;
# presumably that adds duplicate vectors to the index - verify, and run this
# cell only once per index.

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

In [12]:
# Attach to the existing Pinecone index instead of uploading the chunks again.
# index_name is repeated here so this cell does not depend on the index-creation cell.
from langchain_pinecone import PineconeVectorStore

index_name = "fachatbot"
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)


In [13]:
# Similarity-search retriever configured with k=3 (three results per query).
retrever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [14]:
# Smoke-test the retriever with a short query before wiring it into the chain.
retrieved_docs = retrever.invoke('what is FA')

In [15]:
# Display the retrieved documents (page content plus page/source metadata).
retrieved_docs

[Document(id='fb8f62fe-7e0d-486d-b5cb-da7d1da10736', metadata={'page': 1.0, 'source': 'Data\\ACN3-4-622.pdf'}, page_content='and test the independent association between the FA-\nassociated DM and level of function.9,10 Also, we describe\nthe variation in approaches to DM management in FA.\nMaterials and Methods\nStudy design and participants\nThis analysis used FACOMS, a longitudinal, prospective\nnatural history study of FA. 811 individuals with FA were\nenrolled between 2004 and 2015, and re-evaluated annu-\nally at 12 international sites.6 Information from the most\nrecent visit was used in the present study.'),
 Document(id='bd3abcac-e914-4553-8e28-297a59301769', metadata={'page': 5.0, 'source': 'Data\\WNL-2022-201428.pdf'}, page_content='Range —— 349.5–1,105 681.5 –1,175\nAbbreviation: FA = Friedreich ataxia.\nNeurology.org/N Neurology | Volume 100, Number 8 | February 21, 2023 e813\nCopyright © 2022 American Academy of Neurology. Unauthorized reproduction of this article is proh

In [16]:
from langchain_openai import OpenAI

In [17]:
# OpenAI model that writes the answers; each completion is capped at 500 tokens.
llm = OpenAI(
    temperature=0.4,
    max_tokens=500,
    openai_api_key=OPENAI_API_KEY,
)

In [18]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering step. "{context}" is the template
# placeholder where the retrieved chunks are inserted.
# The wording is part of the model's input, so typos and broken grammar here
# directly affect answer quality.
system_prompt = (
    "You are an assistant for question-answering tasks based on multiple research papers. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, are unsure, or if even a single word "
    "following 'what is' in the human message is not included in the retrieved context, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [19]:
# Two-message chat template: the system instructions first, then the user's
# question substituted for "{input}".
prompt = ChatPromptTemplate.from_messages([
    ('system', system_prompt),
    ('human', '{input}'),
])

In [20]:
# Chain built from the LLM and the chat prompt defined above.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Retrieval chain combining the `retrever` retriever with the question-answer chain.
rag_chain = create_retrieval_chain(retrever, question_answer_chain)

In [31]:
# Ask a sample question; the result is indexed below by 'answer' and 'context'.
response = rag_chain.invoke(
    {
        'input': 'there is a 10 year old child with mFARS, which is is 35, is there a relationship with upperlimb score with this and can you guess a upper limb score?'
    }
)
print(response['answer'])



The upper limb scores (FARS B) contribute substantially to the overall decline in mFARS for children aged 8-11 years. However, after reaching 12 years of age, there is no significant decline in upper limb scores. It is difficult to estimate a specific upper limb score for a 10-year-old child with mFARS without more information, as the upper limb scores can vary greatly in this age group.


In [39]:
from pathlib import PureWindowsPath  # stdlib; treats both '\\' and '/' as separators on any OS

# Show the chunks that were passed to the model, with their provenance.
for document in response["context"]:
    source_path = document.metadata['source']
    print(document.page_content)
    print(document.metadata['page'])
    print(source_path)
    # The stored sources use Windows separators (e.g. 'Data\\WNL-2022-200876.pdf').
    # os.path.basename does not split on '\\' on Linux/macOS and would return
    # the whole string there, so take the file name with Windows path rules.
    print(PureWindowsPath(source_path).name)
    print('------------------------------------')

and to a lesser extent in the age group 8– 11 years, upper limb
scores (FARS B, Figure 4C) contributed substantially to the
overall decline. This result was consistent with the estimated
decline for the early-onset group (shown earlier), who are
predominantly enrolled at young ages. Upper limb (FARS B)
scores also declined in the age group 8– 11 years, but not after
reaching 12 years of age. The mean USS changes were lower
in children younger than 8 years but showed a consistent
5.0
Data\WNL-2022-200876.pdf
WNL-2022-200876.pdf
------------------------------------
total mFARS score due to high changes in upper limb
function/FARS B that diminish after approximately 12 years
of age. The USS might also have other beneﬁts: The recent
MOXIE study25 showed a statistically signiﬁcant eﬀect of
omaveloxolone compared with placebo in total mFARS,
conﬁrming its sensitivity to change and to potential treatment
eﬀects. However, a particularly large eﬀect was also observed
in the USS, which, in contr