In [1]:
# Importing the necessary libraries
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
import google.generativeai as genai

In [2]:
# Defining the paths
data_path = Path('data')
cleaned_data_path = Path('cleaned_data')

In [5]:
# All the files in the data directory
files = list(data_path.glob("*.html"))
files

[WindowsPath('data/Beyond Good and Evil.html'),
 WindowsPath('data/ECCE HOMO.html'),
 WindowsPath('data/Human, All Too Human.html'),
 WindowsPath('data/The Antichrist.html'),
 WindowsPath('data/The Birth of Tragedy.html'),
 WindowsPath('data/The Genealogy of Morals.html'),
 WindowsPath('data/Thus Spake Zarathustra.html')]

In [36]:
# Function to process, clean and save the files to the cleaned_data folder
def clean_data(files):
    for file in files:
        raw_data = file.read_text(encoding='utf-8', errors='ignore')
        soup = BeautifulSoup(raw_data, 'lxml')
        text_data = soup.get_text(separator=" ", strip=True)
        cleaned_data = re.sub(r"s\+", " ", text_data)
        cleaned_data = re.sub(r"\b(Pg|Page)\.?\s*\d+\b", "", cleaned_data, flags=re.IGNORECASE)
        cleaned_data = re.sub(r"\n      ", " ", cleaned_data)
        cleaned_data = cleaned_data.strip()
        
        path_object = cleaned_data_path/f"{file.stem}.txt"
        path_object.write_text(cleaned_data, encoding='utf-8')
        
        print(f"Processed {file.name} -> Saved as {path_object.name} -> {len(cleaned_data.strip())}")

In [37]:
# Calling the clean_data function
clean_data(files)

Processed Beyond Good and Evil.html -> Saved as Beyond Good and Evil.txt -> 400092
Processed ECCE HOMO.html -> Saved as ECCE HOMO.txt -> 288130
Processed Human, All Too Human.html -> Saved as Human, All Too Human.txt -> 238862
Processed The Antichrist.html -> Saved as The Antichrist.txt -> 219159
Processed The Birth of Tragedy.html -> Saved as The Birth of Tragedy.txt -> 345822
Processed The Genealogy of Morals.html -> Saved as The Genealogy of Morals.txt -> 350470
Processed Thus Spake Zarathustra.html -> Saved as Thus Spake Zarathustra.txt -> 653509


In [13]:
# Combining all cleaned text files into a single file
document = ''

for file in list(cleaned_data_path.glob("*.txt")):
    document += f"Start of {file.stem}  "
    document += file.read_text(encoding='utf-8', errors='ignore')
    document += f"End of {file.stem}"

In [24]:
# Number of words in the combined file
len(document.split())

424339

In [25]:
# Adding the combined file to the cleaned_data folder
kb_path = cleaned_data_path/'knowledge_base.txt'
kb_path.write_text(document, encoding='utf-8')

2568336

In [26]:
# Initializaing the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=['\n\n', '\n', ' ', '.', '']
)

In [33]:
knowledge_base_path = Path(r"cleaned_data/knowledge_base.txt")

In [35]:
knowledge_base_path.read_text(encoding='utf-8')[2000:3000]

'TER IV. APOPHTHEGMS AND INTERLUDES CHAPTER V. THE NATURAL HISTORY OF MORALS CHAPTER VI. WE SCHOLARS CHAPTER VII. OUR VIRTUES CHAPTER VIII. PEOPLES AND COUNTRIES CHAPTER IX. WHAT IS NOBLE? FROM THE HEIGHTS PREFACE SUPPOSING that Truth is a woman—what then? Is there not ground for\n      suspecting that all philosophers, in so far as they have been dogmatists,\n      have failed to understand women—that the terrible seriousness and\n      clumsy importunity with which they have usually paid their addresses to\n      Truth, have been unskilled and unseemly methods for winning a woman?\n      Certainly she has never allowed herself to be won; and at present every\n      kind of dogma stands with sad and discouraged mien—IF, indeed, it\n      stands at all! For there are scoffers who maintain that it has fallen,\n      that all dogma lies on the ground—nay more, that it is at its last\n      gasp. But to speak seriously, there are good grounds for hoping that all\n      dogmatizing in phil

In [None]:
chunks = text_splitter.split_text()

In [32]:
(cleaned_data_path/'knowledge_base.txt').read_text(encoding='utf-8', errors='ignore')[5000:6000]

'ainst this error has fostered. It\n      amounted to the very inversion of truth, and the denial of the PERSPECTIVE—the\n      fundamental condition—of life, to speak of Spirit and the Good as\n      Plato spoke of them; indeed one might ask, as a physician: "How did such a\n      malady attack that finest product of antiquity, Plato? Had the wicked\n      Socrates really corrupted him? Was Socrates after all a corrupter of\n      youths, and deserved his hemlock?" But the struggle against Plato, or—to\n      speak plainer, and for the "people"—the struggle against the\n      ecclesiastical oppression of millenniums of Christianity (FOR CHRISTIANITY\n      IS PLATONISM FOR THE "PEOPLE"), produced in Europe a magnificent tension\n      of soul, such as had not existed anywhere previously; with such a tensely\n      strained bow one can now aim at the furthest goals. As a matter of fact,\n      the European feels this tension as a state of distress, and twice attempts\n      have been m