# RAG from Scratch

**Import thư viện**

In [1]:
from PyPDF2 import PdfReader
from typing import List, Dict, Any

- Hàm nhận `pdf_path` (kiểu string) là đường dẫn tới file PDF
- Khởi tạo danh sách `pdf_pages` để lưu văn bản của từng trang
- `"rb"` - read binary: mở file ở chế độ đọc nhị phân

In [2]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Read a PDF file and return the text of all its pages as one string.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text extracted from each page, in page order, joined with a
        newline character between consecutive pages.
    """
    # PDFs are binary files, hence the "rb" mode.
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
        # Pull the text out of every page while the file is still open.
        page_texts = [page.extract_text() for page in reader.pages]

    return "\n".join(page_texts)

- `requests`: thư viện để thực hiện HTTP requests
- Gửi request tới `pdf_url` và nhận response --> tải file PDF từ URL về
- `pdf_path`: tên file PDF sẽ được lưu ở máy local
- `"wb"` - write binary: mở file ở chế độ ghi nhị phân
- Ghi nội dung của response vào file local tại `pdf_path`

In [3]:
import requests 

# Download the paper and store it as a local PDF file.
pdf_url = 'https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf'
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(pdf_url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of saving an error page as the "PDF".
response.raise_for_status()

pdf_path = 'attention_is_all_you_need.pdf'
# "wb": the response body is raw bytes, so write it in binary mode.
with open(pdf_path, 'wb') as file:
    file.write(response.content)

In [4]:
# Pass the path itself: slicing belongs on the extracted text (for previews),
# not on the file path, where `[:300]` was a no-op at best and a wrong path at worst.
pdf_text = extract_text_from_pdf(pdf_path)

In [5]:
print(pdf_text[:300])  # Print the first 300 characters of the PDF text as a sanity check

Attention Is All You Need
Ashish Vaswani
Google Brain
avaswani@google.comNoam Shazeer
Google Brain
noam@google.comNiki Parmar
Google Research
nikip@google.comJakob Uszkoreit
Google Research
usz@google.com
Llion Jones
Google Research
llion@google.comAidan N. Gomezy
University of Toronto
aidan@c


# Chunk Text

In [6]:
import re
from collections import deque

- Trả về danh sách các đoạn văn bản

In [7]:
def text_chunk(text: str, max_length: int = 1000) -> List[str]:
    """Split text into chunks of whole sentences, each at most ``max_length`` long.

    Newlines are replaced by spaces, then the text is split into sentences at
    whitespace that follows ".", "!" or "?". Sentences are packed greedily into
    a chunk, joined by single spaces, until adding the next sentence (plus its
    joining space) would exceed ``max_length``.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk. A single sentence
            longer than this is not cut; it becomes a chunk of its own.

    Returns:
        The list of chunks in document order. Chunks carry no leading or
        trailing whitespace. Empty or whitespace-only input yields ``[]``.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text.replace('\n', ' '))
    chunks: List[str] = []
    current = ""
    for raw_sentence in sentences:
        sentence = raw_sentence.strip()
        if not sentence:
            # Splitting leaves empty pieces for blank input or trailing whitespace.
            continue
        if not current:
            # First sentence of a chunk: no separator, so no leading space.
            current = sentence
        elif len(current) + 1 + len(sentence) > max_length:
            # The "+ 1" accounts for the joining space, so the limit is exact.
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}"
    if current:
        chunks.append(current)
    return chunks

In [8]:
# Split the extracted PDF text into chunks using text_chunk's default max_length.
chunks = text_chunk(pdf_text)

In [9]:
# Report how many chunks were produced and show the first one as a sanity check.
print(f"Number of chunks ={len(chunks)}")
print(chunks[0])

Number of chunks =36
 Attention Is All You Need Ashish Vaswani Google Brain avaswani@google.comNoam Shazeer Google Brain noam@google.comNiki Parmar Google Research nikip@google.comJakob Uszkoreit Google Research usz@google.com Llion Jones Google Research llion@google.comAidan N. Gomezy University of Toronto aidan@cs.toronto.eduŁukasz Kaiser Google Brain lukaszkaiser@google.com Illia Polosukhinz illia.polosukhin@gmail.com Abstract The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring signiﬁcantly less time to t

# Vector Store

In [10]:
import chromadb
from chromadb.utils import embedding_functions
from chromadb.api.models import Collection

In [11]:
from sentence_transformers import SentenceTransformer
class SentenceTransformerEmbeddingFunction(embedding_functions.EmbeddingFunction):
    """Embedding function that encodes texts with a sentence-transformers model.

    Instances can be used either through ``embed(texts)`` or by calling the
    instance directly, which is the entry point Chroma's embedding-function
    interface is documented to use (parameter named ``input``).
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        # Load the model once; every later call reuses it.
        self.model = SentenceTransformer(model_name)

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Encode ``texts`` and return one embedding (list of floats) per text."""
        # Encode the argument itself; the name `input` here would refer to the
        # Python builtin function, not to the texts.
        embeddings = self.model.encode(texts)
        return embeddings.tolist()

    def __call__(self, input: List[str]) -> List[List[float]]:
        """Callable entry point; delegates to :meth:`embed`.

        The parameter is named ``input`` to match the signature Chroma documents
        for custom embedding functions.
        """
        return self.embed(input)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def create_vector_store(
    db_path: str,
    collection_name: str = "pdf_chunks",
    model_name: str = "all-MiniLM-L6-v2",
) -> Collection:
    """Open (or create) a persistent Chroma collection for the PDF chunks.

    Args:
        db_path: Directory passed to ``chromadb.PersistentClient`` as its path.
        collection_name: Name of the collection to open or create.
        model_name: Sentence-transformers model used to embed documents.

    Returns:
        The collection, configured with the sentence-transformers embedding
        function.
    """
    client = chromadb.PersistentClient(path=db_path)
    embeddings = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)
    # get_or_create keeps the cell re-runnable: with a persistent client the
    # collection outlives the kernel, and create_collection would raise on the
    # second run because the name already exists.
    db = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embeddings,
    )
    return db

In [None]:
import os
import uuid
