LOADER

In [1]:
from langchain_community.document_loaders.json_loader import JSONLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.document_loaders.youtube import YoutubeLoader
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain_community.document_loaders.word_document import Docx2txtLoader

In [13]:
# Local sample documents, given as paths relative to the working directory.
json_path = "docs/bible_3.json"
word_path = "docs/Document.docx"
pdf_path = "docs/7 ESPRITS.pdf"

# Identifiers handed to the YouTube loader as `video_id`.
youtube_1, youtube_2, youtube_3 = "y9k-U9AuDeM", "pePAAGfh-IU", "CiSaY2xl9V4"


In [37]:
from pathlib import Path
def split_ext(uploaded_file:str):
    """Split a file path into its base name and its extension.

    Args:
        uploaded_file: Path (or bare file name) to split.

    Returns:
        A ``(name, extension)`` tuple: the final path component without its
        last suffix, and that suffix including the leading dot (an empty
        string when the name has no suffix).
    """
    file_path = Path(uploaded_file)
    return file_path.stem, file_path.suffix

# Quick check: display the (name, extension) pair computed for the JSON path.
split_ext(json_path)

('bible_3', '.json')

Custom loader

In [2]:
from typing import Iterator
import json

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class BibleLoader(BaseLoader):
    """Document loader for a JSON Bible file, yielding one document per book.

    The file is read as a JSON object shaped like
    ``{version: {book: {"text": ...}}}``; every ``(version, book)`` pair
    becomes one ``Document``.
    """

    def __init__(self, file_path: str) -> None:
        """Initialize the loader with a file path.

        Args:
            file_path: The path to the JSON file to load.
        """
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
        """Lazily yield one ``Document`` per book of each version.

        The JSON file is parsed in full on the first iteration; documents are
        then produced one at a time by this generator.

        Yields:
            Document: ``page_content`` is the book's ``"text"`` entry and
            ``metadata`` holds the ``"version"`` and ``"book"`` keys.
        """
        # Parse inside a context manager so the file handle is closed before
        # the first document is yielded, instead of leaking until garbage
        # collection.
        with open(self.file_path, encoding='utf-8') as bible_file:
            bible = json.load(bible_file)

        for version, books in bible.items():
            for book, content in books.items():
                yield Document(
                    page_content=content['text'],
                    metadata={"version": version, "book": book},
                )


In [6]:
# Point the custom loader at the JSON file, then materialise all documents.
loader = BibleLoader(file_path=json_path)
bible_data = loader.load()

Docx2txtLoader

In [9]:
# Load the Word file through Docx2txtLoader.
loader = Docx2txtLoader(
    file_path=word_path,
)
word_data = loader.load()

TextLoader

In [6]:
# loader = TextLoader(file_path=pdf_path)
# word_data = loader.load()

PyPDFLoader

In [8]:
# Load the PDF with PyPDFLoader; the optional `extract_images=True` argument
# is intentionally left out.
loader = PyPDFLoader(
    file_path=pdf_path,
)
pdf_data = loader.load()

YoutubeLoader

In [14]:
# Load the first YouTube video, asking the loader to add video info as well.
loader = YoutubeLoader(
    video_id=youtube_1,
    add_video_info=True,
)
ytb_data = loader.load()

Splitter

In [20]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Shared splitter: chunks of at most 1024 characters with a 50-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=50,
)


In [22]:
# Split the loaded PDF and Bible documents into chunks with the shared splitter.
pdf_splited_data = text_splitter.split_documents(pdf_data)
bible_splited_data = text_splitter.split_documents(bible_data)

In [13]:
def print_doc(doc_list, limit = 10):
    """Print the metadata of the first ``limit`` documents, then the total count.

    Args:
        doc_list: Sized iterable of objects exposing a ``metadata`` attribute.
        limit: Maximum number of documents to print. A value of 0 or less
            prints no document line.

    Only the metadata is shown; ``page_content`` is deliberately not printed.
    """
    for i, elem in enumerate(doc_list):
        # Test the bound before printing so that exactly `limit` entries are
        # shown (checking afterwards printed one entry too many).
        if i >= limit:
            break
        print(f"Document n° : {i}, Meta: {elem.metadata}")
    print(len(doc_list))

Embeddings

In [23]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

In [None]:
from langchain_community.llms.llamacpp import LlamaCpp
import os

n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
# NOTE(review): the two comments above mention Metal / Apple Silicon while the
# default model path below is a Windows path -- confirm these values suit the
# machine actually running this cell.

# Make sure the model path is correct for your system!
# The location can be overridden with the LLAMA_MODEL_PATH environment variable
# so the notebook is not tied to one user's absolute path; the original path is
# kept as the default.
llm = LlamaCpp(
    model_path=os.environ.get(
        "LLAMA_MODEL_PATH",
        "C:/Users/sem.eglohlokoh/Documents/vscode/rag/models/capybarahermes-2.5-mistral-7b.Q2_K.gguf",
    ),
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    verbose=True,
)

In [26]:
# Ollama-backed embedder using the "gemma:2b" model, with a progress bar enabled.
ollama_embedor = OllamaEmbeddings(
    model="gemma:2b",
    top_k=30,
    top_p=0.7,
    show_progress=True,
    num_thread=4,
)
# gemma:2b
# llama3
from sentence_transformers import SentenceTransformer
# embd_model = SentenceTransformer("models/multilingual-e5-small", device='cpu')
# fast_embedor = FastEmbedEmbeddings()
# model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
# dangvantuan/sentence-camembert-base
# dangvantuan/sentence-camembert-large

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 37718.56it/s]


In [None]:
# FastEmbed embedder created without arguments, i.e. with the library's defaults.
fast_embedor = FastEmbedEmbeddings()
#intfloat/multilingual-e5-large

In [None]:
# `embed_documents` expects a list of strings, so pass each chunk's text
# rather than the Document objects themselves.
embeddings = ollama_embedor.embed_documents(
    [doc.page_content for doc in pdf_splited_data]
)

In [44]:
# Embed a single query string with the same Ollama embedder object.
embedded_query = ollama_embedor.embed_query("What was the name mentioned in the document?")


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]


Vector store

In [28]:
from langchain_community.vectorstores.faiss import FAISS

In [21]:
# Build a FAISS store from the PDF chunks using the Ollama embedder, then save
# it locally as "pdf_gemma_faiss".
# NOTE(review): the recorded run of this cell took about 57 minutes for 136
# items (see the progress output below) -- consider caching before re-running.
db = FAISS.from_documents(pdf_splited_data, ollama_embedor)
db.save_local("pdf_gemma_faiss")

OllamaEmbeddings: 100%|██████████| 136/136 [57:19<00:00, 25.29s/it]


In [30]:
# Build a FAISS store from the same PDF chunks using the FastEmbed embedder,
# then save it locally as "pdf_fast_faiss".
# NOTE: this rebinds `db`; when the notebook is run top to bottom, the cells
# below query this FastEmbed-based store.
db = FAISS.from_documents(pdf_splited_data, fast_embedor)
db.save_local("pdf_fast_faiss")

In [33]:
# Run a similarity search for the query against the current `db`, requesting
# 3 results (k=3).
query = "quels sont les septs esprits"
docs = db.similarity_search(query, k=3)


In [34]:
# Wrap the current `db` as a retriever (default settings, no arguments passed).
retriever = db.as_retriever()

# Display the documents the retriever returns for the same query.
retriever.invoke(query)

[Document(page_content='dans différentes manifestations. Le même terme, «sept esprits» \nest également utilisé pour le décrire dans Apocalypse 1: 4, \nApocalypse 4: 5 et Apocalypse 5: 6. \nRévélation 1: 4: \n«Jean aux sept églises qui sont en Asie: grâce et paix à \nvous, de celui qui est, qui était et qui doit venir; et des sept \nesprits qui sont devant son trône»; \nRévélation 4: 5: \n"Et du trône sortaient des éclairs, des tonnerres et des \nvoix: et il y avait sept lampes de feu qui brûlaient devant le \ntrône, qui sont les sept Esprits de Dieu." \nRévélation 5: 6: \n«Et je vis, et voici, au milieu du trône et des quatre \nbêtes, et au milieu des anciens, se tenait un agneau comme \nil avait été tué, ayant sept cornes et sept yeux, qui sont les \nsept esprits de Dieu envoyé sur toute la terre. " \nLes sept Esprits de Dieu font référence à sept \nmanifestations indépendantes du Saint-Esprit. Ce n\'est \ncependant pas la même chose que lorsque nous disons que la \ndivinité est compo