In [1]:
import os
import fitz
import re
from tqdm.auto import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd
import tiktoken
from sentence_transformers import SentenceTransformer
from datasets import Dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datasets import load_dataset
from pinecone import pinecone, ServerlessSpec
from uuid import uuid4
# The Pinecone API key is read from the PINECONE_API_KEY environment variable so
# that no credential is stored in the notebook. Any key that was previously
# committed here should be treated as leaked and rotated.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [None]:
# Absolute path of the folder holding the source PDFs; passed to
# extract_all_pages() further down.
pdf_folder = "/home/shegun93/Klasshour_Rags/Physics"
def text_formatter(text: str) -> str:
    """Flatten ``text`` onto a single line.

    Every newline becomes one space, then leading and trailing whitespace
    is stripped.

    :param text: Raw text, e.g. the output of a PDF page extraction.
    :return: The flattened, stripped text.
    """
    return " ".join(text.split("\n")).strip()

In [None]:
def parse_filename(filename: str) -> dict:
    """
    Builds page metadata from a PDF file name.

    The extension is dropped, every character that is not an ASCII letter,
    digit or whitespace is replaced by a space, runs of whitespace are
    collapsed, surrounding whitespace is removed and the result is title-cased,
    e.g. ``"work_energy_&_power.pdf"`` -> ``"Work Energy Power"``.

    :param filename: The name of the file (a bare name; path separators would
        be turned into spaces like any other punctuation)
    :return: ``{"Subject": "Physics", "topic": <title-cased name>}``
    """
    stem = os.path.splitext(filename)[0]
    stem = re.sub(r'[^a-zA-Z0-9\s]', ' ', stem)
    # strip() so names that start or end with punctuation ("_heat_.pdf") do not
    # leave a stray space in the topic.
    stem = re.sub(r'\s+', ' ', stem).strip()

    return {
        "Subject": "Physics",
        # title() already lower-cases every non-initial letter, so no separate
        # lower() pass is needed.
        "topic": stem.title(),
    }


In [None]:
def extract_all_pages(pdf_folder: str) -> list[dict]:
    """Extract text and metadata from all PDFs in a folder.

    Each page of each PDF becomes one record::

        {"text": <page text cleaned by text_formatter>,
         "metadata": {"Subject": ..., "topic": ...}}

    The metadata comes from ``parse_filename`` and is nested under
    ``"metadata"`` because that is the layout ``DataIngestion.get_metadata``
    reads.

    :param pdf_folder: Directory scanned (non-recursively) for PDF files.
    :return: One dict per page, ordered by file name and then by page.
    """
    pages = []

    # sorted() makes the record order independent of the filesystem's listing order.
    for file in sorted(os.listdir(pdf_folder)):
        if not file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, file)
        metadata = parse_filename(file)

        # The context manager closes the document even if a page fails to parse.
        with fitz.open(pdf_path) as document:
            # total= lets tqdm draw a real progress bar for this file.
            for page in tqdm(document, total=len(document), desc=file):
                pages.append({
                    "text": text_formatter(page.get_text()),
                    # Copy so that records do not share one metadata dict.
                    "metadata": dict(metadata),
                })

    return pages

In [None]:
# Read every PDF under pdf_folder into a list of per-page records.
print("📄 Extracting pages...")
all_pages = extract_all_pages(pdf_folder)

In [None]:
import random

# Spot-check a few extracted pages. k is capped at the number of pages because
# random.sample raises ValueError when asked for more items than exist.
random.sample(all_pages, k=min(4, len(all_pages)))

In [None]:
# output_path = "/home/shegun93/Klasshour_Rags/Data"
# dataset = Dataset.from_list(all_pages)
# dataset.save_to_disk(output_path)
# print(f"Dataset saved to {output_path}")

In [None]:
# output_path = "/home/shegun93/Klasshour_Rags/Physics_dataset"
# if not os.path.exists(output_path):
#     os.makedirs(output_path)
# output_path = os.path.join(output_path, "physics_dataset")

In [None]:
from datasets import Dataset

# Turn the extracted page records into a Dataset and write it to output_path.
# The same file is read back with load_dataset() in the next cell.
output_path = "/home/shegun93/Klasshour_Rags/data.json"
dataset = Dataset.from_list(all_pages)
dataset.to_json(output_path)

print(f"Dataset saved to {output_path}")

In [None]:
# Reload the saved pages from disk as the "train" split; this rebinds `dataset`.
dataset = load_dataset("json", data_files="/home/shegun93/Klasshour_Rags/data.json", split="train")

In [None]:
# Display the loaded dataset as the cell output.
dataset

In [None]:
# df = dataset[0]
# meta = df.get("metadata")
# print(meta)
# print(meta.get("topic"))

In [None]:
# embedding_model = SentenceTransformer("all-mpnet-base-v2")
# topic_model = BERTopic(embedding_model=embedding_model, verbose=True)
# topics, _ = topic_model.fit_transform(texts)

In [None]:
# for i, page in enumerate(all_pages):
#     topic_id = topics[i]
#     if topic_id != -1:
#         topic_name = topic_model.get_topic(topic_id)[0][0]
#     else:
#         topic_name = "Unknown"
#     page["metadata"]["inferred_topic"] = topic_name

# print(f"✅ Assigned inferred topics to {len(all_pages)} pages.")

In [None]:
def token_length(self, text):
    """Return how many tokens ``self.tokenizer.encode`` produces for ``text``.

    Module-level twin of ``DataIngestion.token_length``; ``self`` may be any
    object that exposes a ``tokenizer`` attribute with an ``encode`` method.
    """
    return len(self.tokenizer.encode(text))

In [2]:
class DataIngestion:
    """Split pages into token-bounded chunks, embed them and upsert them into a vector index.

    Every page is expected to be a mapping with a ``"text"`` string and a
    ``"metadata"`` mapping that carries ``"Subject"`` and ``"topic"``.
    """

    # Defaults used when the corresponding constructor argument is omitted.
    DEFAULT_DATA_FILE = "/home/shegun93/Klasshour_Rags/data.json"
    DEFAULT_BATCH_LIMIT = 100

    def __init__(self, index=None, tokenizer=None, data=None, text_splitter=None, embeddings=None, batch_limit=None):
        """Wire up the ingestion pipeline, building defaults for anything not supplied.

        :param index: Vector index handle exposing ``upsert(vectors=...)``.
        :param tokenizer: Object with ``encode(text)``; defaults to tiktoken's
            ``cl100k_base`` encoding.
        :param data: Iterable of page records; defaults to the JSON dataset at
            ``DEFAULT_DATA_FILE``.
        :param text_splitter: Object with ``split_text(text)``; defaults to a
            ``RecursiveCharacterTextSplitter`` with 400-token chunks and a
            20-token overlap, measured with ``token_length``.
        :param embeddings: Object with ``encode(texts)``; defaults to the
            ``all-mpnet-base-v2`` SentenceTransformer.
        :param batch_limit: Number of accumulated chunks that triggers an
            upsert; defaults to ``DEFAULT_BATCH_LIMIT``.
        """
        # Defaults are chosen with "is not None" rather than truthiness so that
        # an empty dataset or a falsy custom object is respected as given.
        self.index = index
        self.tokenizer = tokenizer if tokenizer is not None else tiktoken.get_encoding(encoding_name="cl100k_base")
        self.embeddings = embeddings if embeddings is not None else SentenceTransformer("all-mpnet-base-v2")
        self.text_splitter = text_splitter if text_splitter is not None else RecursiveCharacterTextSplitter(
            chunk_size=400,
            chunk_overlap=20,
            length_function=self.token_length,
            separators=["\n\n", "\n", " ", ""]
        )
        # Honour the caller's batch_limit instead of always forcing the default.
        self.batch_limit = batch_limit if batch_limit is not None else self.DEFAULT_BATCH_LIMIT
        self.data = data if data is not None else load_dataset("json", data_files=self.DEFAULT_DATA_FILE, split="train")

    def token_length(self, text):
        """Return the number of tokens the tokenizer produces for ``text``."""
        return len(self.tokenizer.encode(text))

    def get_metadata(self, page):
        """Return the ``Subject`` and ``topic`` entries of the page's ``"metadata"``.

        Entries that are missing or ``None`` are left out instead of being
        returned as ``None``, so the result never carries null values into the
        vector store; a page without ``"metadata"`` yields an empty dict.
        """
        metadata = page.get("metadata") or {}
        return {
            key: metadata[key]
            for key in ("Subject", "topic")
            if metadata.get(key) is not None
        }

    def split_texts_and_get_metadata(self, page):
        """Split a page into chunks and build one metadata dict per chunk.

        :return: ``(texts, metadatas)`` where ``metadatas[i]`` holds the chunk
            number under ``"chunk"``, the chunk text under ``"text"`` and the
            page-level fields from ``get_metadata``.
        """
        basic_metadata = self.get_metadata(page)
        texts = self.text_splitter.split_text(page.get("text") or "")
        # The chunk number gets its own "chunk" key: storing it under "topic"
        # would be overwritten by the page-level "topic" unpacked right after.
        metadatas = [
            {"chunk": chunk_number, "text": text, **basic_metadata}
            for chunk_number, text in enumerate(texts)
        ]
        return texts, metadatas

    def upload_batch(self, texts, metadatas):
        """Embed ``texts`` and upsert them with fresh UUID ids and their metadata.

        :raises ValueError: if no index was configured, or if ``texts`` and
            ``metadatas`` differ in length.
        """
        if not texts:
            return
        if self.index is None:
            raise ValueError("DataIngestion has no index to upload to; pass index=... when constructing it.")
        if len(texts) != len(metadatas):
            raise ValueError("texts and metadatas must have the same length.")

        ids = [str(uuid4()) for _ in texts]
        embeddings = self.embeddings.encode(texts)
        # Array-like encoder output is converted to plain lists, and the records
        # are materialised, so the index receives a concrete list of
        # (id, values, metadata) tuples rather than a one-shot iterator.
        vectors = embeddings.tolist() if hasattr(embeddings, "tolist") else list(embeddings)
        self.index.upsert(vectors=list(zip(ids, vectors, metadatas)))

    def batch_upload(self):
        """Chunk every page in ``self.data`` and upload the chunks in batches.

        Whole pages are added to the pending batch, and the batch is flushed as
        soon as it holds at least ``batch_limit`` chunks, so a batch can be
        larger than ``batch_limit``. Whatever is left after the last page is
        flushed at the end.
        """
        batch_texts, batch_metadatas = [], []
        for page in tqdm(self.data):
            texts, metadatas = self.split_texts_and_get_metadata(page)
            batch_texts.extend(texts)
            batch_metadatas.extend(metadatas)
            if len(batch_texts) >= self.batch_limit:
                self.upload_batch(batch_texts, batch_metadatas)
                batch_texts, batch_metadatas = [], []

        if batch_texts:
            self.upload_batch(batch_texts, batch_metadatas)

In [None]:
#embeddings.embed_documents = lambda *args, **kwargs: embeddings.encode(*args, **kwargs).tolist()

In [3]:
index_name = "klasshour"

# Create the serverless index only when it does not exist yet.
if index_name not in pc.list_indexes().names():
    pc.create_index(index_name,
                    metric="cosine",
                    # Must match the embedding model's output size
                    # (all-mpnet-base-v2 produces 768-dimensional vectors).
                    dimension=768,
                    spec = ServerlessSpec(
                        cloud="aws",
                        region="us-east-1",
                    ))

# Bind the handle outside the `if`: on any run where the index already exists
# the branch above is skipped, and `index` must still be defined for the cells
# that follow.
index = pc.Index(index_name)
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [4]:
# Chunk, embed and upsert every page into the index, then print the index
# stats to confirm the resulting vector count.
data_ingestion = DataIngestion(index, embeddings=SentenceTransformer("all-mpnet-base-v2"))
data_ingestion.batch_upload()
print(index.describe_index_stats())

  0%|          | 0/6753 [00:00<?, ?it/s]

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 13743}},
 'total_vector_count': 13743,
 'vector_type': 'dense'}


In [7]:
# Embed the query with the same model object used for ingestion, and convert
# the result to a plain list before sending it to the index.
query = "What is work, Energy and Power"
embedding = data_ingestion.embeddings.encode(query).tolist()

In [8]:
# Ask the index for the top 3 matches for the query vector, including each match's stored metadata.
results = index.query(vector=embedding, top_k=3, include_metadata=True)

In [29]:
# Print the metadata stored with each returned match.
for result in results["matches"]:
    print(f"Retrival: {result['metadata']}")
    #print(f"Text: {result['metadata']['text']}\n")

Retrival: {'Subject': 'Physics', 'text': 'The language of power is subtle and different from that of work. Recall that work is done on an object and results in a transfer of energy to that object. The rate of this energy transfer, or power, is often referred to as the power that is generated in doing the work. The term “power” not only applies to the rate at which energy is transferred from one object to another or transformed from one form to another, but also to the rate at which energy is transported from one location to another. For example, electric power lines carry electric energy across vast stretches of land.  Energy Transformations • MHR 277 The unit, the watt, was named in  honour of the Scottish engineer, James Watt, who made such great improvements in the steam engine that it hastened the Industrial Revolution. The ability to do work did not change, but the rate at which the work  could be accomplished did. Watt did experiments with strong dray horses and determined that t