In [None]:
## Installing the necessary packages
!pip install pypdf
!pip install langchain
!pip install faiss-gpu

In [26]:
import pandas as pd
import numpy as np

## Silence library warnings so they do not clutter the cell outputs
import warnings
warnings.filterwarnings("ignore")
# Never truncate long text columns when displaying DataFrames.
# `None` is the supported "no limit" value; the old `-1` sentinel was deprecated
# in pandas 1.0 and raises ValueError in pandas 2.x.
pd.set_option('display.max_colwidth', None)

from langchain.indexes import VectorstoreIndexCreator
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, PyPDFLoader
# Mount Google Drive under /content/drive (Colab only); the PDF loaded later in
# this notebook is read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [27]:
## The source PDF was uploaded to Google Drive beforehand.
## Load it and split it into a list of Document objects.
pdf_path = "/content/drive/MyDrive/Quantiphi/ConceptsofBiology-WEB.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

In [32]:
# Print how many Document objects were produced, then display the one at
# index 3 (zero-based, i.e. the fourth entry of `pages`).
num_docs = len(pages)
print(num_docs)
pages[3]

733


Document(page_content='Your Journey to Biology Success\nLearn with us today!\nopenstax.org/kinetic          OpenStax Kinetic will help you uncover who you are as a \nscience learner and provide helpful personalized feedback \nas you explore your options for a future in science.\nThroughout the year, participate in a range of free \nresearch studies on Kinetic to deepen your:\n•Foundational science knowledge\n•Early science and math experiences\n•STEM interests and matching careers\n•Career and vocational interests\nTake part in fun, researcher-created activities \nto explore biological concepts like RNA, \ncancer, and the cell cycle! \nParticipation supports important learning \nresearch to help improve digital learning \nnationwide. All while giving you a chance \nat earning amazing prizes.\nLearn while you earn \nwith OpenStax Kinetic!\nGet involved, get learning, and get \nrewarded today with OpenStax Kinetic!', metadata={'source': '/content/drive/MyDrive/Quantiphi/ConceptsofBiology

In [93]:
## The problem statement allows only 2 pages, so keep just the two
## documents at indices 5 and 6 (the slice stop is exclusive).
start_idx, stop_idx = 5, 7
final_Docs = pages[start_idx:stop_idx]

In [104]:
# Display the documents selected into final_Docs, so that questions can be
# written against this portion of the text.
final_Docs

[Document(page_content='CHAP TER 4\nHow Cel ls Ob tain Ener gy 89\nIntroduction 89\n4.1Ener gy and Metabolism 90\n4.2Glycolysis 100\n4.3Citric Acid Cy cle and Oxidativ e Phosphor ylation 101\n4.4Fermentation 105\n4.5Connections t o Other Metabolic P athways 108\nKey Terms 110\nChap ter Summar y 111\nVisual Connection Ques tions 112\nReview Ques tions 112\nCritical Thinking Ques tions 113\nCHAP TER 5\nPho tosynthesis 115\nIntroduction 115\n5.1Overview of Phot osynthesis 115\n5.2The Light -Dependent R eactions o f Phot osynthesis 120\n5.3The Cal vin Cy cle 124\nKey Terms 129\nChap ter Summar y 129\nVisual Connection Ques tions 130\nReview Ques tions 130\nCritical Thinking Ques tions 130\nUNI T2CELL DIVISION AND GENET ICS\nCHAP TER 6\nReproduction a t the Cel lular L evel 133\nIntroduction 133\n6.1The Genome 133\n6.2The Cel l Cycle 135\n6.3Canc er and the Cel l Cycle 141\n6.4Prokaryotic Cel l Division 142\nKey Terms 146\nChap ter Summar y 147\nVisual Connection Ques tions 148\nReview Ques

In [95]:
## Hugging Face Hub API token.
## Never hardcode the token in the notebook: anyone who can read the file can use it.
## (Any token that was ever committed in plain text should be revoked and regenerated.)
## The token is taken from the environment if already set; otherwise it is requested
## interactively without being echoed or stored in the notebook.
import os
from getpass import getpass

if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [96]:
from langchain.llms import HuggingFaceHub
from langchain import PromptTemplate, LLMChain

# First ingredient of the LLMChain: the language model.
# The model is accessed through LangChain's HuggingFaceHub wrapper.
repo_id = "google/flan-t5-xxl"
generation_kwargs = {"temperature": 0.5, "max_length": 64}
llm = HuggingFaceHub(repo_id=repo_id, model_kwargs=generation_kwargs)

# Second ingredient of the LLMChain: the prompt template.
# `{question}` is the only placeholder and is filled in when the chain runs.
template = """Question: {question}
System: Let's think step by step."""
prompt = PromptTemplate(input_variables=["question"], template=template)

# Sample question used for the smoke test of the chain.
question = "Who won the FIFA World Cup in the year 1994? "

In [97]:
## Smoke test: combine the prompt template and the LLM into a chain,
## run it on the sample question and print the generated answer.
llm_chain = LLMChain(llm=llm, prompt=prompt)

sample_answer = llm_chain.run(question)
print(sample_answer)

Brazil won the FIFA World Cup in the year 1994. The answer is Brazil.


In [98]:
# Embedding model used to turn text into vectors
# (HuggingFaceEmbeddings with its default settings).
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings()

# Build the vector database: embed the selected documents and
# store them in a FAISS index.
db = FAISS.from_documents(documents=final_Docs, embedding=embeddings)

In [100]:
# Retrieve from the vector store the documents most similar to the user's question.
# The index holds only the documents in final_Docs, so at most that many come back.
query = "how many chapters"
docs = db.similarity_search(query=query)
len(docs)

2

In [101]:
## First sample question: build a "stuff" question-answering chain on top of the LLM
## and run it with the retrieved documents as input.
from langchain.chains.question_answering import load_qa_chain

chain = load_qa_chain(llm, chain_type="stuff")
answer = chain.run(question=query, input_documents=docs)
answer

'10'

In [103]:
## Second sample question.
## Reuses `chain` (the "stuff" QA chain) built in the previous cell instead of
## re-importing and rebuilding it; only the query and retrieved documents change.
## The former mid-cell `len(docs)` was dropped: it was not the last expression of
## the cell, so its value was never displayed.
query = "what is chapter 10"
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)

'Biotechnolog y'