### MultiIndexRetriever - Retrieve Full Documents using Documents Summary

In [1]:
from langchain_community.document_loaders import TextLoader , DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI


loader = DirectoryLoader('./source', glob="./*.py", loader_cls=TextLoader)
docs = loader.load()

In [2]:
docs

[Document(page_content="from datetime import datetime\nfrom flask_login import LoginManager, UserMixin, login_user, logout_user, current_user\nfrom werkzeug.security import generate_password_hash, check_password_hash\nfrom app import db\n\nclass User(db.Model, UserMixin):\n    id = db.Column(db.Integer, primary_key=True)\n    username = db.Column(db.String(80), unique=True, nullable=False)\n    password_hash = db.Column(db.String(100), nullable=False)\n    emails = db.relationship('Email', backref='user', lazy='dynamic', foreign_keys='Email.sender_id')\n    emails_received = db.relationship('Email', foreign_keys='Email.recipient_id', backref='recipient', lazy='dynamic')\n\n    def set_password(self, password):\n        self.password_hash = generate_password_hash(password)\n\n    def check_password(self, password):\n        return check_password_hash(self.password_hash, password)\n\nclass Email(db.Model):\n    id = db.Column(db.Integer, primary_key=True)\n    sender_id = db.Column(db.In

In [3]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
#from utils.llm import LLM
llm = ChatOpenAI(model="gpt-4-1106-preview")

summ_prompt = ChatPromptTemplate.from_template("Summarize the following document in concise and meaningful manner:\n {doc}")
#from utils.llm import LLM
llm = ChatOpenAI(model="gpt-4-1106-preview")
#llm = LLM().get_llama_together()

In [4]:
chain = (
    {"doc": lambda x: x.page_content}
    | summ_prompt
    | llm
    | StrOutputParser()
)

summaries = chain.batch(docs)

In [5]:
summaries[0]

"The document defines a Python module with classes for a database model using Flask, a web framework, and SQLAlchemy, an ORM (Object-Relational Mapping) tool. It includes imports for handling dates, user sessions, and password security.\n\nThe `User` class represents a user entity with an ID, username, and password hash stored in a database. It includes methods to set and verify passwords using hash functions. Each user can have a relationship with sent emails (`emails`) and received emails (`emails_received`), indicating a one-to-many relationship with the `Email` class.\n\nThe `Email` class represents an email entity with its own ID, sender ID, recipient ID, subject, body, and timestamp. It uses foreign keys to reference the `User` class for both the sender and recipient.\n\nThe `Error` class represents an error record with an ID, error code, message, timestamp, and a reference to an email ID, indicating a many-to-one relationship with the `Email` class.\n\nOverall, this module is de

In [6]:
summaries[1]

'This document is a Python script that uses the Flask web framework to create a simple email web service with user authentication and email management functionalities. The script imports necessary modules for handling email sending and creating web routes.\n\n1. User authentication: It defines a `load_user` function to retrieve a user by ID. There are routes for user login and registration. The login route authenticates the user by checking the provided username and password, while the registration route adds a new user to the database.\n\n2. Email operations: It includes routes for composing, sending, viewing the inbox, viewing the outbox, and searching through emails. The compose route allows a user to create a new email, which is then sent using the SMTP protocol and added to the database. The inbox and outbox routes retrieve emails from the database and display them to the user. The search_emails function allows users to search for emails by subject.\n\n3. Session management: The s

In [7]:
from langchain.embeddings import HuggingFaceBgeEmbeddings


model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  from tqdm.autonotebook import tqdm, trange


In [8]:
from langchain.storage import InMemoryByteStore
from langchain_community.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="summaries",
                     embedding_function=embeddings)

# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"

# The retriever
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)

In [9]:
# creating doc ids which is to be stored as metadata in vectore store along with summaries
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Docs linked to summaries
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(summaries)
]

# Add
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

In [10]:
query = "tell me about my data models"
sub_docs = vectorstore.similarity_search(query,k=1)
sub_docs

[Document(page_content="The document defines a Python module with classes for a database model using Flask, a web framework, and SQLAlchemy, an ORM (Object-Relational Mapping) tool. It includes imports for handling dates, user sessions, and password security.\n\nThe `User` class represents a user entity with an ID, username, and password hash stored in a database. It includes methods to set and verify passwords using hash functions. Each user can have a relationship with sent emails (`emails`) and received emails (`emails_received`), indicating a one-to-many relationship with the `Email` class.\n\nThe `Email` class represents an email entity with its own ID, sender ID, recipient ID, subject, body, and timestamp. It uses foreign keys to reference the `User` class for both the sender and recipient.\n\nThe `Error` class represents an error record with an ID, error code, message, timestamp, and a reference to an email ID, indicating a many-to-one relationship with the `Email` class.\n\nOve

In [11]:
sub_docs[0].metadata['doc_id']

'8e02cfbc-0bee-4b15-bfe1-dc43563a02f8'

In [12]:
retrieved_docs = retriever.get_relevant_documents(query,n_results=1)

print(retrieved_docs[0].page_content)

  warn_deprecated(
Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


from datetime import datetime
from flask_login import LoginManager, UserMixin, login_user, logout_user, current_user
from werkzeug.security import generate_password_hash, check_password_hash
from app import db

class User(db.Model, UserMixin):
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(80), unique=True, nullable=False)
    password_hash = db.Column(db.String(100), nullable=False)
    emails = db.relationship('Email', backref='user', lazy='dynamic', foreign_keys='Email.sender_id')
    emails_received = db.relationship('Email', foreign_keys='Email.recipient_id', backref='recipient', lazy='dynamic')

    def set_password(self, password):
        self.password_hash = generate_password_hash(password)

    def check_password(self, password):
        return check_password_hash(self.password_hash, password)

class Email(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    sender_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=Fa