In [30]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings

import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore


In [15]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is absent. The previous
# `os.environ[...] = os.getenv(...)` was a no-op when the key existed and
# raised an opaque TypeError (environment values must be str) when it did not.
if not os.getenv("GEMINI_API_KEY"):
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

In [16]:
# Read the source paper from disk. The path is relative to the notebook's
# working directory.
pdf_path = "data/OPENMDAO_An_Open_Source_Framework_for_Multidisciplinary_Analysis_and_Optimization.pdf"
loader = PyPDFLoader(pdf_path)

# `docs` is the list of Document objects consumed by the splitting step.
docs = loader.load()

In [17]:
# Peek at the first loaded Document only; echoing the whole list dumps the
# full text of the paper into the notebook output.
docs[:1]

[Document(metadata={'producer': 'pdfTeX-1.40.10', 'creator': 'TeX', 'creationdate': '2010-09-01T08:45:30-04:00', 'moddate': '2010-09-01T08:45:30-04:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-1.40.10-2.2 (TeX Live 2009) kpathsea version 5.0.0', 'source': 'data/OPENMDAO_An_Open_Source_Framework_for_Multidisciplinary_Analysis_and_Optimization.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1'}, page_content='OpenMDAO: An Open Source Framework for\nMultidisciplinary Analysis and Optimization\nJustin Gray∗\nMDAO Branch, NASA Glenn Research Center, Cleveland, OH\nKenneth T. Moore†and Bret A. Naylor‡\nDB Consulting Group, Inc., Cleveland, OH\nThis paper describes the progress made in the development of OpenMDAO, an open\nsource framework for performing Multidisciplinary Analysis and Optimization (MDAO).\nNASA intends to use OpenMDAO to aid in the design of unconventional aircraft, but the\ngeneral structure and methods may be applied to solve any number

In [23]:
# Chunking hyperparameters: tune these to trade retrieval granularity
# against the amount of context carried by each chunk.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Text splitter used to cut the loaded pages into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [19]:
# Cut the loaded documents into chunks with the splitter configured earlier.
split_docs = splitter.split_documents(documents=docs)

# Show how many chunks were produced.
len(split_docs)

90

In [24]:
# Identifier of the embedding model passed to GoogleGenerativeAIEmbeddings.
MODEL_NAME = "text-embedding-004"

In [33]:
# Embedding client for MODEL_NAME. The API key is read from the environment
# and passed explicitly rather than relying on the library's own lookup.
embeddings = GoogleGenerativeAIEmbeddings(
    model=MODEL_NAME,
    google_api_key=os.environ["GEMINI_API_KEY"],
)

In [None]:
# Build an in-memory FAISS vector store over the chunked paper and expose it
# as a retriever.
# (One-line equivalent: FAISS.from_documents(split_docs, embeddings).)

# Size the index from a probe embedding instead of hard-coding the dimension,
# so it always matches whatever MODEL_NAME produces. The previous fixed value
# of 384 does not match text-embedding-004, which is documented as
# 768-dimensional, and FAISS rejects vectors of the wrong width.
embedding_dim = len(embeddings.embed_query("dimension probe"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# The store is created empty: embed and insert the chunks so the retriever
# actually has something to search.
vector_store.add_documents(split_docs)

# Use the store defined above; the earlier `vectorstore` spelling was never
# assigned in this notebook and raised NameError on a fresh kernel.
retriever = vector_store.as_retriever()