LOADER

In [4]:
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain_community.document_loaders.json_loader import JSONLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.word_document import Docx2txtLoader
from langchain_community.document_loaders.youtube import YoutubeLoader
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

In [5]:
# Local input files.
json_path = "../bible/bible_3.json"
word_path = "docs/Document.docx"
pdf_path = "docs/7 ESPRITS.pdf"

# YouTube videos referenced by the (currently disabled) YouTube loader cells.
youtube_1 = "https://www.youtube.com/watch?v=y9k-U9AuDeM"
youtube_2 = "https://www.youtube.com/watch?v=pePAAGfh-IU"
youtube_3 = "https://www.youtube.com/watch?v=CiSaY2xl9V4"


Custom loader

In [6]:
from typing import AsyncIterator, Iterator
import json

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class BibleLoader(BaseLoader):
    """Document loader that turns a nested Bible JSON file into documents.

    The JSON file is read as a mapping ``version -> book -> entry`` where each
    entry has a ``"text"`` key. One ``Document`` is produced per
    (version, book) pair.
    """

    def __init__(self, file_path: str) -> None:
        """Initialize the loader with a file path.

        Args:
            file_path: The path to the JSON file to load.
        """
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:  # <-- Does not take any arguments
        """Lazily yield one ``Document`` per book of each version.

        Yields:
            Documents whose ``page_content`` is the entry's ``"text"`` value
            and whose metadata holds the ``"version"`` and ``"book"`` keys.
        """
        # Use a context manager so the file handle is closed as soon as the
        # JSON has been parsed, instead of leaking it until garbage collection.
        with open(self.file_path, encoding='utf-8') as bible_file:
            bible = json.load(bible_file)

        for version, books in bible.items():
            for book, entry in books.items():
                yield Document(
                    page_content=entry['text'],
                    metadata={"version": version, "book": book},
                )


In [7]:
# Read every (version, book) entry of the Bible JSON into memory.
loader = BibleLoader(file_path=json_path)
bible_data = loader.load()

Docx2txtLoader

In [9]:
# Load the Word document referenced by word_path.
loader = Docx2txtLoader(word_path)
word_data = loader.load()

TextLoader

In [6]:
# loader = TextLoader(file_path=pdf_path)
# word_data = loader.load()

PyPDFLoader

In [8]:
# Load the PDF referenced by pdf_path.
# Image extraction is left off; pass extract_images=True to enable it.
loader = PyPDFLoader(pdf_path)
pdf_data = loader.load()

YoutubeLoader

In [8]:
# youtube = [youtube_1, youtube_2, youtube_3]

In [9]:
# loader = YoutubeLoader(video_id=youtube_1,
#                        add_video_info =True)
# ytb_data = loader.load()

In [10]:
# loader = YoutubeAudioLoader(urls=youtube,save_dir='./audio')
# ytb_data = loader.yield_blobs()
# next(ytb_data)

Splitter

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Shared splitter: chunks of at most 1024 characters with a 50-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=50,
)


In [11]:
# Chunk each loaded corpus with the shared splitter.
pdf_splited_data = text_splitter.split_documents(documents=pdf_data)
word_splited_data = text_splitter.split_documents(documents=word_data)
bible_splited_data = text_splitter.split_documents(documents=bible_data)

In [13]:
def print_doc(doc_list, limit = 10, show_content = False):
    """Print a short preview of a list of documents, then the list's length.

    Args:
        doc_list: Sized sequence of objects exposing ``metadata`` (and
            ``page_content`` when ``show_content`` is true).
        limit: Maximum number of documents to preview.
        show_content: When true, also print each document's ``page_content``
            after its metadata.
    """
    for i, elem in enumerate(doc_list):
        # Test the bound before printing so that exactly `limit` entries are
        # shown (checking afterwards printed one document too many).
        if i >= limit:
            break
        line = f"Document n° : {i}, Meta: {elem.metadata}"
        if show_content:
            line += f" \n Content: {elem.page_content}"
        print(line)
    print(len(doc_list))

Embeddings

In [23]:
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

In [None]:
# from langchain_community.llms.llamacpp import LlamaCpp
# n_gpu_layers = 1  # Metal set to 1 is enough.
# n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.

# # Make sure the model path is correct for your system!
# llm = LlamaCpp(
#     model_path="C:/Users/sem.eglohlokoh/Documents/vscode/rag/models/capybarahermes-2.5-mistral-7b.Q2_K.gguf",
#     n_gpu_layers=n_gpu_layers,
#     n_batch=n_batch,
#     n_ctx=2048,
#     f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
#     verbose=True,
# )

In [None]:
# Embedding client backed by the Ollama "gemma:2b" model.
ollama_embedor = OllamaEmbeddings(
    model="gemma:2b",
    top_k=30,
    top_p=0.7,
    show_progress=True,
    num_thread=4,
)
# gemma:2b
# llama3
from sentence_transformers import SentenceTransformer
# embd_model = SentenceTransformer("models/multilingual-e5-small", device='cpu')
# fast_embedor = FastEmbedEmbeddings()
# model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
# dangvantuan/sentence-camembert-base
# dangvantuan/sentence-camembert-large

In [43]:
# embed_documents expects a list of plain strings, so pass each chunk's text
# rather than the Document objects themselves (which would otherwise be
# stringified, metadata included, before being embedded).
embeddings = ollama_embedor.embed_documents(
    [doc.page_content for doc in pdf_splited_data]
)

OllamaEmbeddings: 100%|██████████| 5/5 [01:54<00:00, 22.92s/it]


In [44]:
# Embed a single query string with the same Ollama model.
embedded_query = ollama_embedor.embed_query(
    "What was the name mentioned in the document?"
)


OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]


Vector store

In [20]:
from langchain_community.vectorstores.faiss import FAISS

In [21]:
# Embed the PDF chunks with Ollama and persist the resulting FAISS index.
# This is slow: the recorded run below took close to an hour.
db = FAISS.from_documents(documents=pdf_splited_data, embedding=ollama_embedor)
db.save_local(folder_path="pdf_gemma_faiss")

OllamaEmbeddings: 100%|██████████| 136/136 [57:19<00:00, 25.29s/it]


In [None]:
# FAISS.from_documents needs a LangChain Embeddings object (one exposing
# embed_documents / embed_query). A raw SentenceTransformer does not provide
# that interface, and `embd_model` was not defined before this cell, so build
# the wrapper here around the local e5-small checkpoint.
embd_model = HuggingFaceEmbeddings(
    model_name="models/multilingual-e5-small",
    model_kwargs={"device": "cpu"},
)
db = FAISS.from_documents(pdf_splited_data, embd_model)
db.save_local("pdf_e5-small_faiss")

In [None]:
# Wrap the local e5-large checkpoint in HuggingFaceEmbeddings: FAISS.from_documents
# calls embed_documents / embed_query, which a raw SentenceTransformer lacks.
embd_model = HuggingFaceEmbeddings(
    model_name="models/multilingual-e5-large-instruct",
    model_kwargs={"device": "cpu"},
)
db = FAISS.from_documents(pdf_splited_data, embd_model)
db.save_local("pdf_e5-large_faiss")

In [46]:
# The query text is embedded as-is, so the misspelling "saide" is corrected to
# avoid skewing the similarity search.
query = "What is said about Task creation and management in the document?"
# Retrieve the 3 chunks closest to the query.
docs = db.similarity_search(query, k=3)


OllamaEmbeddings: 100%|██████████| 1/1 [00:04<00:00,  4.00s/it]


In [47]:
# Expose the vector store as a retriever and run the same query through it;
# the bare last expression displays the retrieved documents.
retriever = db.as_retriever()
retriever.invoke(input=query)

OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.38s/it]


[Document(page_content="Document 1: Functional Specification for a Mobile App for Task Management\n\nPurpose: To outline the features and functionalities of a mobile application for managing tasks and projects.\n\nScope: Define the app's target users, platforms, and integration with other tools or services.\n\nFeatures:\n\nTask creation and management: Detailed description of task creation, editing, categorization, prioritization, assignment, and completion tracking.\n\nProject management: Organization of tasks into projects, project creation and editing, progress tracking, and dependency management.\n\nCollaboration: Features for team collaboration, including task assignment, commenting, notifications, and file sharing.\n\nReminders and alerts: Implementation of reminders, push notifications, and due date alerts for tasks and projects.\n\nSearch and filtering: Robust search functionality to find tasks and projects based on various criteria, including keywords, tags, and assignees.", m

In [48]:


# new_db = FAISS.load_local("faiss_index", embeddings, asynchronous=True)

# docs = await new_db.asimilarity_search(query)

# docs[0]