In [1]:
import pandas as pd

# Read the CSV export from the local data/ folder into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="data/scrapped_guttenberg_religion_paranormal.csv"
)

# Last expression of the cell: shows (n_rows, n_columns) of the raw frame.
data.shape

(4467, 37)

In [2]:
# Keep only the rows that have a Summary; that column is later used as the
# page content by the DataFrame loader.
data = data[data["Summary"].notna()]

In [3]:
from langchain_community.document_loaders import DataFrameLoader

# Turn every remaining row into a LangChain document whose page_content is the
# "Summary" column (the other columns are expected to land in the document
# metadata — see the DataFrameLoader documentation).
loader = DataFrameLoader(data_frame=data, page_content_column="Summary")
docs = loader.load()

In [4]:
# docs[877].page_content
# docs[877].metadata

In [5]:
# Using the CSV loader instead: the metadata it produces is useless
# from langchain_community.document_loaders.csv_loader import CSVLoader
# loader_csv = CSVLoader("data/scrapped_guttenberg_travel_geography.csv")
# docs_csv = loader_csv.load()
# docs_csv
# docs_csv[766].page_content
# docs_csv[766].metadata

In [6]:
# load embedding from azure
# Build the Azure OpenAI embedding client used to embed the summaries.
import os

from dotenv import find_dotenv, load_dotenv
# Imported from langchain_community, like the other LangChain imports in this
# notebook, instead of the deprecated `langchain.embeddings` re-export.
from langchain_community.embeddings import AzureOpenAIEmbeddings

_ = load_dotenv(find_dotenv())  # read local .env file

# Credentials and endpoint come from the environment (never hard-coded).
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY_4")
azure_openai_api_endpoint = os.getenv("AZURE_OPENAI_API_ENDPOINT_4")
# NOTE(review): deployment_name is read here but not used by the embedding
# client below, which targets the 'text-embedding-3-large' deployment directly.
deployment_name = os.getenv("AZURE_DEPLOYMENT_NAME_4")

embedding_model = AzureOpenAIEmbeddings(
    openai_api_key=azure_openai_api_key,
    azure_deployment='text-embedding-3-large',
    azure_endpoint=azure_openai_api_endpoint,
    openai_api_version="2023-05-15",
    chunk_size=500,  # passed through as the client's chunk_size setting
)

  embedding_model = AzureOpenAIEmbeddings(openai_api_key=azure_openai_api_key,


#### FAISS documentation
https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.faiss.FAISS.html#langchain_community.vectorstores.faiss.FAISS

In [8]:
# Index the documents with FAISS so that both the text and its metadata are kept.
from langchain_community.vectorstores import FAISS

# Parallel lists: texts[i] is the content of docs[i], metadatas[i] its metadata.
texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs]

# Embed every text with the embedding model and build the in-memory index.
faiss_vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embedding_model,
    metadatas=metadatas,
)

# Persist the index under data/ with the index name 'religion_paranormal'.
faiss_vector_store.save_local(folder_path='data', index_name='religion_paranormal')

### IMPORTANT: use a compressor when using the full text

Using FAISS has several advantages for retrieval, for example:
* max_marginal_relevance_search_with_score_by_vector()
* similarity_search_with_score_by_vector()

In [47]:
# Reference signatures (from the FAISS vector store API documentation):
#
#   similarity_search_with_score_by_vector(
#       embedding: List[float], k: int = 4,
#       filter: Callable | Dict[str, Any] | None = None,
#       fetch_k: int = 20, **kwargs: Any,
#   ) -> List[Tuple[Document, float]]
#
#   similarity_search_with_score(
#       query: str, k: int = 4,
#       filter: Callable | Dict[str, Any] | None = None,
#       fetch_k: int = 20, **kwargs: Any,
#   ) -> List[Tuple[Document, float]]

# Query the index with free text; each result is a (Document, score) pair.
similar = faiss_vector_store.similarity_search_with_score(
    query="Jesus is back and alive, he will smash all",
    k=4,
    fetch_k=20,
)

We check whether the embedding also takes the metadata into account.

In [52]:
# Show each hit: its score first, then the matched summary text, then a blank line.
for document, score in similar:
    print("score", score)
    print("content", document.page_content)
    print()

score 1.3410089

score 1.351626
content "Jesus, The Messiah; or, the Old Testament Prophecies Fulfilled in the New" by an anonymous author is a religious text written in the early 19th century. This work explores the connections between the Old Testament prophecies and their fulfillment in the person of Jesus Christ as described in the New Testament. The author, aiming to deepen the understanding of these biblical truths, likely addresses theological themes related to Jesus’ identity as the Messiah and his role in salvation.  The opening of the book begins with an introduction and dedication, offering insights into the author's motivations for writing. The author expresses a desire to make the Scriptures more familiar to young readers. The first chapter delves into Genesis 3:15, introducing the concept of enmity between "the seed of the woman" and the serpent, which the author interprets as a foreshadowing of Christ’s victory over evil. The subsequent chapters continue to build upon th

# The FAISS vector store can be used as a retriever

In [None]:
# Expose the vector store as a retriever that uses the "mmr" search type with
# the search parameters below.
retriever = faiss_vector_store.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=1, fetch_k=2, lambda_mult=0.5),
)
retriever.invoke("thud")

# load local

In [None]:
# load_local(folder_path: str, 
#            embeddings: Embeddings, 
#            index_name: str = 'index', 
#            *, 
#            allow_dangerous_deserialization: bool = False, 
#            **kwargs: Any) 
# returns → FAISS

In [None]:
# Reload a FAISS index previously written with save_local().
# load_local() defaults to allow_dangerous_deserialization=False and then
# refuses to load the pickled index files, so the flag must be set explicitly.
# Only enable it for index files you created yourself and trust.
# NOTE(review): this loads the 'travel_geography' index, whereas the index saved
# earlier in this notebook is named 'religion_paranormal' — confirm the intended one.
faiss_vector_store = FAISS.load_local(
    'data',
    embeddings=embedding_model,
    index_name='travel_geography',
    allow_dangerous_deserialization=True,
)