In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_openai import OpenAI
from langchain.chains.retrieval_qa.base import VectorDBQA
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
import pandas as pd

In [3]:
import os 
from dotenv import load_dotenv
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [4]:
loader = DirectoryLoader("./dexian_data",loader_cls=PyPDFLoader)
pages = loader.load_and_split()
text_splitter= CharacterTextSplitter(chunk_size=900, chunk_overlap=100)
docs = text_splitter.split_documents(pages)

embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [5]:
db = FAISS.from_documents(docs,embeddings)

### QnA

In [6]:
retriever = db.as_retriever()
model = RetrievalQAWithSourcesChain.from_chain_type(llm=OpenAI(),chain_type='stuff',retriever=retriever)

In [7]:
question = "Is AI/ML mentioned?"
response = model({"question":question}, return_only_outputs=True)
print("Answer :",response['answer'])
print("Sources :", response['sources'])

  response = model({"question":question}, return_only_outputs=True)


Answer :  Yes, AI/ML is mentioned.

Sources : dexian_data\Consolidated Case Studies (1).pdf<|endoftext|>When I was a child, my parents would often take me and my siblings to visit our grandparents who lived in a small village in the countryside. It was always a fun adventure for us as we got to explore the vast fields, play in the nearby river, and help our grandpa tend to his vegetable garden.


In [8]:
save_directory = "Store"
db.save_local(save_directory)

In [9]:
new_db = FAISS.load_local(save_directory,embeddings,allow_dangerous_deserialization=True)

In [10]:
#vector store into df 
def store_to_df (store):
    v_dict = store.docstore._dict
    data_rows = []
    for k in v_dict.keys():
        doc_name = v_dict[k].metadata['source'].split('/')[-1]
        page_number = v_dict[k].metadata[ 'page']+1
        content = v_dict[k].page_content
        data_rows. append ({"chunk_id" :k, "document" :doc_name, "page": page_number, "content" : content})
    vector_df = pd.DataFrame(data_rows)
    return vector_df

def show_vstore(store):
    vector_df = store_to_df(store)
    display(vector_df.head())

In [11]:
show_vstore(new_db)

Unnamed: 0,chunk_id,document,page,content
0,9165944d-08f9-4fe2-bff5-c10b9376ee7c,dexian_data\Consolidated Case Studies (1).pdf,1,"©Dexian, LLC.2024 Dexian.com\nCase Studies\n2024"
1,eeaa66b5-53d7-44df-b795-10cc42469ae2,dexian_data\Consolidated Case Studies (1).pdf,2,"©Dexian, LLC. Dexian.com\n©Dexian, LLC.2024 De..."
2,f747bb97-6b18-4925-8cf3-62b96e297700,dexian_data\Consolidated Case Studies (1).pdf,3,"©Dexian, LLC. Dexian.com\nData Migration and s..."
3,bd14d91f-ea93-4c9a-8553-404593ed5c5e,dexian_data\Consolidated Case Studies (1).pdf,4,"©Dexian, LLC. Dexian.com\nData Foundation Supp..."
4,69765988-c867-49f5-827b-801c30a62e6f,dexian_data\Consolidated Case Studies (1).pdf,5,"©Dexian, LLC. Dexian.com\nIOT & AI Model data ..."


In [13]:
def delete_document(store, document):
    vector_df = store_to_df(store)
    chunks_list = vector_df.loc[vector_df['document']==document]['chunk_id'].tolist()
    store.delete(chunks_list)

def refresh_model(new_store):
    retriever = new_store.as_retriever()
    model = RetrievalQAWithSourcesChain.from_chain_type(llm=OpenAI, chain_type='stuff',retriever=retriever)
    return model

In [14]:
delete_document(new_db, 'dexian_data\Consolidated Case Studies (1).pdf')

  delete_document(new_db, 'dexian_data\Consolidated Case Studies (1).pdf')


In [18]:
show_vstore(new_db)

In [16]:
model = refresh_model(new_db)

<langchain_community.vectorstores.faiss.FAISS at 0x130d8783b10>

In [17]:
question = "Is AI/ML mentioned?"
response = model({"question":question}, return_only_outputs=True)
print("Answer :",response['answer'])
print("Sources :", response['sources'])

Answer :  Yes, AI/ML is mentioned.

Sources : dexian_data\Consolidated Case Studies (1).pdf<|endoftext|>//1. FizzBuzz
