In [1]:
# Importing the necessary libraries
import numpy as np
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
import google.generativeai as genai

In [2]:
# Defining the paths
data_path = Path('data')
cleaned_data_path = Path('cleaned_data')

In [5]:
# All the files in the data directory
files = list(data_path.glob("*.html"))
files

[WindowsPath('data/Beyond Good and Evil.html'),
 WindowsPath('data/ECCE HOMO.html'),
 WindowsPath('data/Human, All Too Human.html'),
 WindowsPath('data/The Antichrist.html'),
 WindowsPath('data/The Birth of Tragedy.html'),
 WindowsPath('data/The Genealogy of Morals.html'),
 WindowsPath('data/Thus Spake Zarathustra.html')]

In [8]:
# Function to process, clean and save the files to the cleaned_data folder
def clean_data(files):
    for file in files:
        raw_data = file.read_text(encoding='utf-8', errors='ignore')
        soup = BeautifulSoup(raw_data, 'lxml')
        text_data = soup.get_text(separator=" ", strip=True)
        cleaned_data = re.sub(r"s\+", " ", text_data)
        cleaned_data = re.sub(r"\b(Pg|Page)\.?\s*\d+\b", "", cleaned_data, flags=re.IGNORECASE)
        cleaned_data = cleaned_data.strip()
        
        path_object = cleaned_data_path/f"{file.stem}.txt"
        path_object.write_text(cleaned_data, encoding='utf-8')
        
        print(f"Processed {file.name} -> Saved as {path_object.name} -> {len(cleaned_data.strip())}")

In [9]:
# Calling the clean_data function
clean_data(files)

Processed Beyond Good and Evil.html -> Saved as Beyond Good and Evil.txt -> 431244
Processed ECCE HOMO.html -> Saved as ECCE HOMO.txt -> 288256
Processed Human, All Too Human.html -> Saved as Human, All Too Human.txt -> 238988
Processed The Antichrist.html -> Saved as The Antichrist.txt -> 219285
Processed The Birth of Tragedy.html -> Saved as The Birth of Tragedy.txt -> 345948
Processed The Genealogy of Morals.html -> Saved as The Genealogy of Morals.txt -> 350596
Processed Thus Spake Zarathustra.html -> Saved as Thus Spake Zarathustra.txt -> 693637


In [13]:
# Combining all cleaned text files into a single file
document = ''

for file in list(cleaned_data_path.glob("*.txt")):
    document += f"Start of {file.stem}  "
    document += file.read_text(encoding='utf-8', errors='ignore')
    document += f"End of {file.stem}"

In [24]:
# Number of words in the combined file
len(document.split())

424339

In [25]:
# Adding the combined file to the cleaned_data folder
kb_path = cleaned_data_path/'knowledge_base.txt'
kb_path.write_text(document, encoding='utf-8')

2568336