
# Retriever and Chain with LangChain

In [15]:
from langchain_community.document_loaders import PyPDFLoader

# Load the sample PDF into `docs`, which the text-splitting cell consumes.
loader = PyPDFLoader("sample_pdf.pdf")
docs = loader.load()

# Preview only the first loaded Document instead of echoing the whole list:
# dumping every Document's full text bloats the saved notebook and buries the
# narrative. The complete list is still available as `docs`.
docs[:1]

could not convert string to float: '0.00-51177066' : FloatObject (b'0.00-51177066') invalid; use 0.0 instead
could not convert string to float: '0.00-60790265' : FloatObject (b'0.00-60790265') invalid; use 0.0 instead
could not convert string to float: '0.00-56221883' : FloatObject (b'0.00-56221883') invalid; use 0.0 instead


[Document(page_content='Unsupervised Deep Embedding for Clustering Analysis\nJunyuan Xie JXIE@CS.WASHINGTON .EDU\nUniversity of Washington\nRoss Girshick RBG@FB.COM\nFacebook AI Research (FAIR)\nAli Farhadi ALI@CS.WASHINGTON .EDU\nUniversity of Washington\nAbstract\nClustering is central to many data-driven appli-\ncation domains and has been studied extensively\nin terms of distance functions and grouping al-\ngorithms. Relatively little work has focused on\nlearning representations for clustering. In this\npaper, we propose Deep Embedded Clustering\n(DEC), a method that simultaneously learns fea-\nture representations and cluster assignments us-\ning deep neural networks. DEC learns a map-\nping from the data space to a lower-dimensional\nfeature space in which it iteratively optimizes a\nclustering objective. Our experimental evalua-\ntions on image and text corpora show signiﬁcant\nimprovement over state-of-the-art methods.\n1. Introduction\nClustering, an essential data analysis a

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks (size 1000, overlap 20)
# so they can be embedded individually, then preview the first five chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)
documents = text_splitter.split_documents(docs)
documents[:5]

[Document(page_content='Unsupervised Deep Embedding for Clustering Analysis\nJunyuan Xie JXIE@CS.WASHINGTON .EDU\nUniversity of Washington\nRoss Girshick RBG@FB.COM\nFacebook AI Research (FAIR)\nAli Farhadi ALI@CS.WASHINGTON .EDU\nUniversity of Washington\nAbstract\nClustering is central to many data-driven appli-\ncation domains and has been studied extensively\nin terms of distance functions and grouping al-\ngorithms. Relatively little work has focused on\nlearning representations for clustering. In this\npaper, we propose Deep Embedded Clustering\n(DEC), a method that simultaneously learns fea-\nture representations and cluster assignments us-\ning deep neural networks. DEC learns a map-\nping from the data space to a lower-dimensional\nfeature space in which it iteratively optimizes a\nclustering objective. Our experimental evalua-\ntions on image and text corpora show signiﬁcant\nimprovement over state-of-the-art methods.\n1. Introduction\nClustering, an essential data analysis a

In [17]:
# NOTE(review): OpenAIEmbeddings is not used by any cell visible here; the
# import is kept in case a later cell relies on it.
from langchain_community.embeddings import OllamaEmbeddings, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Embed every chunk with Ollama embeddings (constructed with library defaults)
# and index the resulting vectors in a FAISS store.
embeddings = OllamaEmbeddings()
db = FAISS.from_documents(documents, embeddings)

In [18]:
# Echo the FAISS vector store built in the previous cell to confirm it exists.
db

<langchain_community.vectorstores.faiss.FAISS at 0x1dffbae3d50>

In [19]:
# Query the vector store directly and show the text of the best-ranked chunk.
# NOTE(review): in the recorded run the top hit is a passage about training
# hyper-parameters rather than a summary of the paper, so the ranking looks
# weak for this query -- worth verifying the embedding setup.
query = "What is this paper about?"
result = db.similarity_search(query)
top_hit = result[0]
top_hit.page_content

'for 100000 iterations without dropout. For both layer-wise\npretraining and end-to-end ﬁnetuning of the autoencoder\nthe minibatch size is set to 256, starting learning rate is\nset to 0.1, which is divided by 10 every 20000 iterations,\nand weight decay is set to 0. All of the above param-\neters are set to achieve a reasonably good reconstruction\nloss and are held constant across all datasets. Dataset-\nspeciﬁc settings of these parameters might improve perfor-\nmance on each dataset, but we refrain from this type of\nunrealistic parameter tuning. To initialize centroids, we\nrunk-means with 20 restarts and select the best solution.'

In [20]:
from langchain_community.llms import Ollama

# Instantiate the Ollama-served "llama2" model used to answer questions,
# then echo the object to confirm it was created.
llm = Ollama(model="llama2")
llm

Ollama()

# Design ChatPrompt Template

In [21]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with two placeholders: {context} for the supporting passages
# and {input} for the user's question. Kept in its own variable so the
# wording is easy to inspect separately from the template object.
template_text = """
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds the answer helpful.
<context>
{context}
</context>
Question: {input}"""

prompt = ChatPromptTemplate.from_template(template_text)

In [22]:
# Chains: create the "stuff documents" chain.
# It combines the LLM with the prompt defined above; presumably the supplied
# documents are inserted at the prompt's {context} placeholder -- confirm
# against the LangChain documentation.
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)


In [23]:
# Expose the FAISS store through the retriever interface (default settings).
retriever = db.as_retriever()

In [24]:
from langchain.chains import create_retrieval_chain

# Wire the retriever to the document chain so one call can both fetch
# passages and generate an answer from them.
retriever_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [26]:
# Ask the full retrieval chain a question and display only the answer text.
# NOTE(review): the recorded answer names people from the paper's reference
# list, while the loaded title page lists Junyuan Xie, Ross Girshick and
# Ali Farhadi -- retrieval apparently did not surface the first chunk; verify.
question = "Who are the authors of the paper?"
response = retriever_chain.invoke({"input": question})
response["answer"]

'Based on the provided context, the authors of the paper are:\n\n1. Xing, Eric P\n2. Jordan, Michael I\n3. Russell, Stuart\n4. Ng, Andrew Y\n\nThe authors are listed in the reference list at the end of the context passage.'