In [54]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from pathlib import Path
from tqdm import tqdm

In [7]:
data_path = Path(r"C:\Users\HI\OneDrive\Documents\GitHub\rag\books")
cleaned_path = Path(r"C:\Users\HI\OneDrive\Documents\GitHub\rag\cleaned_data")

In [9]:
files = list(data_path.glob("*.html"))
files

[WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/Beyond Good and Evil.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/ECCE HOMO.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/Human, All Too Human.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/The Antichrist.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/The Birth of Tragedy.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/The Genealogy of Morals.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/Thus Spake Zarathustra.html')]

In [14]:
def clean_data(files):
    for file in files:
        raw_data = file.read_text(encoding='utf-8', errors='Ignore')
        soup = BeautifulSoup(raw_data, 'lxml')
        text_data = soup.get_text(separator=" ", strip=True)
        cleaned_data = re.sub(r"\s+", " ", text_data)
        cleaned_data = re.sub(r"\b(Pg|Page)\.?\s*\d+\b", "", cleaned_data, flags=re.IGNORECASE)
        cleaned_data = cleaned_data.strip()
        
        path_obj = cleaned_path / (f"{file.stem}.txt")
        path_obj.write_text(cleaned_data, encoding='utf-8')
        
        print(f"Processed {file.stem} -> Saved to {path_obj.name} -> {len(cleaned_data.strip())} words")

In [15]:
clean_data(files)

Processed Beyond Good and Evil -> Saved to Beyond Good and Evil.txt -> 399166 words
Processed ECCE HOMO -> Saved to ECCE HOMO.txt -> 287874 words
Processed Human, All Too Human -> Saved to Human, All Too Human.txt -> 238796 words
Processed The Antichrist -> Saved to The Antichrist.txt -> 219089 words
Processed The Birth of Tragedy -> Saved to The Birth of Tragedy.txt -> 345756 words
Processed The Genealogy of Morals -> Saved to The Genealogy of Morals.txt -> 350404 words
Processed Thus Spake Zarathustra -> Saved to Thus Spake Zarathustra.txt -> 651426 words


In [18]:
list(cleaned_path.glob("*.txt"))

[WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Beyond Good and Evil.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/ECCE HOMO.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Human, All Too Human.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Antichrist.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Birth of Tragedy.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Genealogy of Morals.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Thus Spake Zarathustra.txt')]

In [19]:
document = ''

for file in list(cleaned_path.glob("*.txt")):
    document += f"Start of {file.stem}\n\n"
    document += file.read_text(encoding='utf-8', errors='ignore')
    document += f"End of {file.stem}\n\n"

In [22]:
document[:1000]

'Start of Beyond Good and Evil\n\nBeyond Good and Evil, by Friedrich Nietzsche The Project Gutenberg eBook of Beyond Good and Evil This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org . If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title : Beyond Good and Evil Author : Friedrich Wilhelm Nietzsche Translator : Helen Zimmern Release date : August 1, 2003 [eBook #4363] Most recently updated: January 9, 2019 Language : English Credits : Produced by John Mamoun, Charles Franks, David Widger and the Online Distributed Proofreading Team *** START OF THE PROJECT GUTENBERG EBOOK BEYOND GOOD AND EVIL *** BEYOND GOOD AND EVIL By Friedrich Nietzsche Translated 

In [23]:
final_db_object = cleaned_path / 'knowledge_base.txt'

In [24]:
final_db_object.write_text(document, encoding='utf-8')

2492907

In [25]:
list(cleaned_path.glob("*.txt"))

[WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Beyond Good and Evil.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/ECCE HOMO.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Human, All Too Human.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/knowledge_base.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Antichrist.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Birth of Tragedy.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Genealogy of Morals.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Thus Spake Zarathustra.txt')]

In [30]:
knowledge_base_path = Path(r"C:\Users\HI\OneDrive\Documents\GitHub\rag\cleaned_data\knowledge_base.txt")

In [32]:
knowledge_base_path.read_text(encoding='utf-8', errors='ignore')[:1000]

'Start of Beyond Good and Evil\n\nBeyond Good and Evil, by Friedrich Nietzsche The Project Gutenberg eBook of Beyond Good and Evil This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org . If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title : Beyond Good and Evil Author : Friedrich Wilhelm Nietzsche Translator : Helen Zimmern Release date : August 1, 2003 [eBook #4363] Most recently updated: January 9, 2019 Language : English Credits : Produced by John Mamoun, Charles Franks, David Widger and the Online Distributed Proofreading Team *** START OF THE PROJECT GUTENBERG EBOOK BEYOND GOOD AND EVIL *** BEYOND GOOD AND EVIL By Friedrich Nietzsche Translated 

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [29]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,
                                               chunk_overlap = 200,
                                               separators = ["\n\n", "\n", ".", " ", ""])

In [37]:
chunks = text_splitter.split_text(knowledge_base_path.read_text(encoding='utf-8', errors='ignore'))

In [48]:
print(f"Total number of chunks created -> {len(chunks)}")

Total number of chunks created -> 3256


In [55]:
from sentence_transformers import SentenceTransformer

In [56]:
model = SentenceTransformer('all-mpnet-base-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [57]:
embeddings = []

In [58]:
for chunk in tqdm(chunks, desc='Embedding Chunks'):
    vector = model.encode(chunk)
    embeddings.append(vector)

Embedding Chunks: 100%|██████████| 3256/3256 [22:05<00:00,  2.46it/s]
