In [4]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from pathlib import Path
from tqdm import tqdm

In [7]:
data_path = Path(r"C:\Users\HI\OneDrive\Documents\GitHub\rag\books")
cleaned_path = Path(r"C:\Users\HI\OneDrive\Documents\GitHub\rag\cleaned_data")

In [9]:
files = list(data_path.glob("*.html"))
files

[WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/Beyond Good and Evil.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/ECCE HOMO.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/Human, All Too Human.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/The Antichrist.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/The Birth of Tragedy.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/The Genealogy of Morals.html'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/books/Thus Spake Zarathustra.html')]

In [14]:
def clean_data(files):
    for file in files:
        raw_data = file.read_text(encoding='utf-8', errors='Ignore')
        soup = BeautifulSoup(raw_data, 'lxml')
        text_data = soup.get_text(separator=" ", strip=True)
        cleaned_data = re.sub(r"\s+", " ", text_data)
        cleaned_data = re.sub(r"\b(Pg|Page)\.?\s*\d+\b", "", cleaned_data, flags=re.IGNORECASE)
        cleaned_data = cleaned_data.strip()
        
        path_obj = cleaned_path / (f"{file.stem}.txt")
        path_obj.write_text(cleaned_data, encoding='utf-8')
        
        print(f"Processed {file.stem} -> Saved to {path_obj.name} -> {len(cleaned_data.strip())} words")

In [15]:
clean_data(files)

Processed Beyond Good and Evil -> Saved to Beyond Good and Evil.txt -> 399166 words
Processed ECCE HOMO -> Saved to ECCE HOMO.txt -> 287874 words
Processed Human, All Too Human -> Saved to Human, All Too Human.txt -> 238796 words
Processed The Antichrist -> Saved to The Antichrist.txt -> 219089 words
Processed The Birth of Tragedy -> Saved to The Birth of Tragedy.txt -> 345756 words
Processed The Genealogy of Morals -> Saved to The Genealogy of Morals.txt -> 350404 words
Processed Thus Spake Zarathustra -> Saved to Thus Spake Zarathustra.txt -> 651426 words


In [18]:
list(cleaned_path.glob("*.txt"))

[WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Beyond Good and Evil.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/ECCE HOMO.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Human, All Too Human.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Antichrist.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Birth of Tragedy.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Genealogy of Morals.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Thus Spake Zarathustra.txt')]

In [19]:
document = ''

for file in list(cleaned_path.glob("*.txt")):
    document += f"Start of {file.stem}\n\n"
    document += file.read_text(encoding='utf-8', errors='ignore')
    document += f"End of {file.stem}\n\n"

In [22]:
document[:1000]

'Start of Beyond Good and Evil\n\nBeyond Good and Evil, by Friedrich Nietzsche The Project Gutenberg eBook of Beyond Good and Evil This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org . If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title : Beyond Good and Evil Author : Friedrich Wilhelm Nietzsche Translator : Helen Zimmern Release date : August 1, 2003 [eBook #4363] Most recently updated: January 9, 2019 Language : English Credits : Produced by John Mamoun, Charles Franks, David Widger and the Online Distributed Proofreading Team *** START OF THE PROJECT GUTENBERG EBOOK BEYOND GOOD AND EVIL *** BEYOND GOOD AND EVIL By Friedrich Nietzsche Translated 

In [23]:
final_db_object = cleaned_path / 'knowledge_base.txt'

In [24]:
final_db_object.write_text(document, encoding='utf-8')

2492907

In [25]:
list(cleaned_path.glob("*.txt"))

[WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Beyond Good and Evil.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/ECCE HOMO.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Human, All Too Human.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/knowledge_base.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Antichrist.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Birth of Tragedy.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/The Genealogy of Morals.txt'),
 WindowsPath('C:/Users/HI/OneDrive/Documents/GitHub/rag/cleaned_data/Thus Spake Zarathustra.txt')]

In [30]:
knowledge_base_path = Path(r"C:\Users\HI\OneDrive\Documents\GitHub\rag\cleaned_data\knowledge_base.txt")

In [32]:
knowledge_base_path.read_text(encoding='utf-8', errors='ignore')[:1000]

'Start of Beyond Good and Evil\n\nBeyond Good and Evil, by Friedrich Nietzsche The Project Gutenberg eBook of Beyond Good and Evil This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org . If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title : Beyond Good and Evil Author : Friedrich Wilhelm Nietzsche Translator : Helen Zimmern Release date : August 1, 2003 [eBook #4363] Most recently updated: January 9, 2019 Language : English Credits : Produced by John Mamoun, Charles Franks, David Widger and the Online Distributed Proofreading Team *** START OF THE PROJECT GUTENBERG EBOOK BEYOND GOOD AND EVIL *** BEYOND GOOD AND EVIL By Friedrich Nietzsche Translated 

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [29]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000,
                                               chunk_overlap = 200,
                                               separators = ["\n\n", "\n", ".", " ", ""])

In [37]:
chunks = text_splitter.split_text(knowledge_base_path.read_text(encoding='utf-8', errors='ignore'))

In [48]:
print(f"Total number of chunks created -> {len(chunks)}")

Total number of chunks created -> 3256


In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
model = SentenceTransformer('all-mpnet-base-v2')

In [57]:
embeddings = []

In [58]:
for chunk in tqdm(chunks, desc='Embedding Chunks'):
    vector = model.encode(chunk)
    embeddings.append(vector)

Embedding Chunks: 100%|██████████| 3256/3256 [22:05<00:00,  2.46it/s]


In [61]:
embeddings[1].shape

(768,)

In [71]:
embeddings = np.array(embeddings)

In [74]:
embeddings[1]

array([ 2.37605404e-02,  2.69424152e-02,  1.35689052e-02, -2.30294541e-02,
        3.52771254e-03,  2.25750543e-02, -3.45590487e-02, -3.59090418e-02,
        3.36540490e-02, -2.21780990e-03,  1.61828119e-02,  5.64913265e-03,
       -2.57997308e-02, -6.02208711e-02,  6.24076501e-02, -4.59250174e-02,
        7.04506319e-03,  4.09701616e-02,  2.39608530e-02,  3.65270004e-02,
       -3.16632725e-02,  1.53893819e-02,  3.30377258e-02, -3.19244564e-02,
        7.92644396e-02,  3.98681387e-02, -3.87611939e-03,  3.31625827e-02,
        1.90930944e-02, -2.69599929e-02, -5.59169799e-02, -4.96564731e-02,
       -1.75801907e-02,  5.11434972e-02,  2.26844236e-06, -3.24931294e-02,
        3.50014865e-02, -2.64057070e-02,  1.56410672e-02, -2.71965470e-02,
        5.63886464e-02,  8.02482292e-02, -1.31415315e-02,  1.00663295e-02,
        2.14902591e-02,  1.40320463e-02, -8.29589367e-02,  8.67491066e-02,
       -4.34561260e-02,  5.78383449e-03, -3.05630872e-03, -8.51099491e-02,
        2.47614756e-02, -

In [72]:
embeddings[1].shape

(768,)

In [75]:
np.save("embeddings/embeddings.npy", embeddings)

In [84]:
np.save("embeddings/chunks.npy", np.array(chunks, dtype=object))

In [13]:
import chromadb
from chromadb.utils import embedding_functions

In [17]:
client = chromadb.PersistentClient(path = "vector_db")

In [18]:
collection = client.get_or_create_collection(name = "vector_collection")

In [None]:
collection.add(
    ids = [f"chunk_{i}" for i in range(len(chunks))],
    documents = chunks,
    embeddings = embeddings.tolist()
)

In [90]:
test_query = 'Did Nietzsche believe in God?'
embedded_test_query = model.encode(test_query).tolist()

In [None]:
results = collection.query(
    query_embeddings = [embedded_test_query],
    n_results = 3
)

In [None]:
results

{'ids': [['chunk_1220', 'chunk_1416', 'chunk_566']],
 'embeddings': None,
 'documents': [['. His dreams were thoroughly Hellenic; his whole manner of thinking was Hellenic; his peculiar errors were Hellenic no less. But his Hellenism, I need not add, was anything but the pale neo-Platonism that has run like a thread through the thinking of the Western world since the days of the Christian Fathers. From Plato, to be sure, he got what all of us must get, but his real forefather was Heraclitus. It is in Heraclitus that one finds the germ of his primary view of the universe—a view, to wit, that sees it, not as moral phenomenon, but as mere aesthetic representation. The God that Nietzsche imagined, in the end, was not far from the God that such an artist as Joseph Conrad imagines—a supreme craftsman, ever experimenting, ever coming closer to an ideal balancing of lines and forces, and yet always failing to work out the final harmony',
   '. God as a domestic servant, as a letter carrier, as

In [99]:
results['documents']

[['. His dreams were thoroughly Hellenic; his whole manner of thinking was Hellenic; his peculiar errors were Hellenic no less. But his Hellenism, I need not add, was anything but the pale neo-Platonism that has run like a thread through the thinking of the Western world since the days of the Christian Fathers. From Plato, to be sure, he got what all of us must get, but his real forefather was Heraclitus. It is in Heraclitus that one finds the germ of his primary view of the universe—a view, to wit, that sees it, not as moral phenomenon, but as mere aesthetic representation. The God that Nietzsche imagined, in the end, was not far from the God that such an artist as Joseph Conrad imagines—a supreme craftsman, ever experimenting, ever coming closer to an ideal balancing of lines and forces, and yet always failing to work out the final harmony',
  '. God as a domestic servant, as a letter carrier, as an almanac-man—at bottom, he is a mere name for the stupidest sort of chance.... “Divine

In [23]:
import requests
import json

In [None]:
# Load the embeddings and chunks everytime you comeback.
embeddings = np.load("embeddings/embeddings.npy", allow_pickle=True)
chunks = np.load("embeddings/chunks.npy", allow_pickle=True)

In [None]:
def ask_nietzsche(question, top_k = 3):
    
    embedded_question = model.encode(question).tolist()
    
    results = collection.query(
        query_embeddings = [embedded_question],
        n_results = top_k
    )
    
    retrieved_text = "\n\n".join(results['documents'][0])
    
    prompt = f"""
    Answer the following question as if you are Nietzsche strictly based on the context below. Give the answer in a extremely concise and accurate way.
    Context:
    {retrieved_text}
    
    Question:
    {question}
    """
    
    url = "http://localhost:11434/api/generate"
    payload = {
        'model' : 'phi3',
        'prompt' : prompt,
        'max_tokens' : 512
    }
    
    response = requests.post(url, json = payload)
    
    lines = response.text.strip().split("\n")  # split the streaming JSON lines

    # reconstruct full response
    full_text = ""
    for line in lines:
        try:
            data = json.loads(line)
            if "response" in data and data["response"]:
                full_text += data["response"]
        except json.JSONDecodeError:
            pass  # ignore any malformed lines

    return full_text

In [26]:
ask_nietzsche('What is the main theme in Nietzsche\'s works?')

'The main theme in Nietzsche\'s works revolves around individualism and personal experience as a source of knowledge. He emphasizes self-reliance over traditional moral values, which he deems to be baseless "idols." His writing often explores the idea that societal norms can stifle human potential for greatness. Nietzsche encourages readers to critically examine and possibly reject conventional morality in favor of creating their own life-affirming principles.'