In [1]:
!pip install sentence-transformers transformers gutenberg-cleaner openai

Collecting gutenberg-cleaner
  Downloading gutenberg_cleaner-0.1.6-py3-none-any.whl.metadata (1.7 kB)
Downloading gutenberg_cleaner-0.1.6-py3-none-any.whl (7.4 kB)
Installing collected packages: gutenberg-cleaner
Successfully installed gutenberg-cleaner-0.1.6


In [2]:
import os
import json
import torch
import csv
import warnings
import logging
import requests
import numpy as np
import pickle
import time
import glob
from tqdm import tqdm
import unicodedata

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer # Use the model's own tokenizer

from gutenberg_cleaner import simple_cleaner
import re
from openai import OpenAI
from collections import Counter


from google.colab import drive


# Pipeline configuration.
#   drive_base_path  - prefix joined in front of every input and output path.
#   input_paths      - category name -> folder of .md files (relative to
#                      drive_base_path); categories are processed in this order.
#   output_base_path - folder (relative to drive_base_path) that receives the
#                      generated embedding files.
CONFIG = dict(
    drive_base_path="/content/drive/MyDrive/",
    input_paths=dict(
        biology_chapters="data-process/output/biology_chapters_cleaned/",
        biology_pages="data-process/output/biology_pages/",
        physics_chapters="data-process/output/physics_chapters_cleaned/",
        physics_pages="data-process/output/physics_pages/",
        chemistry_chapters="data-process/output/chemistry_chapters_cleaned/",
        chemistry_pages="data-process/output/chemistry_pages/",
    ),
    output_base_path="embeddings_output/",
)


class BookEmbeddingGenerator:
    """
    Generates a single embedding for a large text (a book, chapter or page).

    The text is tokenized with the model's own tokenizer, split into
    overlapping token windows, each window is embedded separately, and the
    window embeddings are averaged into one vector.

    Only the local Qwen backend (selected with ``use_nomic=True``) is
    implemented; selecting the OpenAI backend raises ``NotImplementedError``.
    """

    def __init__(
            self,
            use_nomic=False,
            use_openai=True,
            chunk_size: int = 8192,
            openai_model: str = "text-embedding-3-small",
            device=None
    ):
        """
        Initializes the embedding generator.

        Args:
            use_nomic: Load the local Qwen embedding model. (The flag keeps its
                historical name; the model loaded is Qwen, not Nomic.)
            use_openai: Select the OpenAI backend (not implemented).
            chunk_size: Window length in tokens. Consecutive windows overlap by
                20% of this value.
            openai_model: Name of the OpenAI embedding model (stored only).
            device: Device passed to SentenceTransformer. ``None`` picks
                "cuda" when available and falls back to "cpu" otherwise.

        Raises:
            ValueError: If ``chunk_size`` is not positive or no backend is selected.
            NotImplementedError: If only the OpenAI backend is selected.
        """
        # Checked first so a bad value fails before any model is downloaded.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}.")

        self.use_nomic = use_nomic
        self.use_openai = use_openai
        self.chunk_size = chunk_size

        self.overlap = int(chunk_size * 0.20)
        self.openai_model = openai_model

        if use_nomic:
            self.model_name = "Qwen/Qwen3-Embedding-0.6B"

            if device is None:
                # Do not assume a GPU runtime: fall back to CPU instead of crashing.
                device = "cuda" if torch.cuda.is_available() else "cpu"
                if device == "cpu":
                    logging.warning("CUDA is not available; loading the model on CPU (this will be slow).")
            self.device = device

            logging.warning(f"Loading Qwen model: {self.model_name}")

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )

            self.model = SentenceTransformer(
                self.model_name,
                trust_remote_code=True,
                device=device
            )

            self.embedding_dim = self.model.get_sentence_embedding_dimension()
            logging.warning(f"Model loaded. Embedding dimension: {self.embedding_dim}")

        elif use_openai:
            self.model_name = "open_ai"
            raise NotImplementedError("OpenAI path needs tokenizer integration, focusing on Qwen.")
        else:
            raise ValueError("An embedding model must be selected (use_nomic or use_openai).")

    def _chunks(self, text: str):
        """
        Splits a text into overlapping chunks based on token count.

        Args:
            text: The text to split.

        Returns:
            tuple: ``(chunks, token_statistics)`` where ``chunks`` is a list of
            decoded chunk strings (empty when the text has no tokens) and
            ``token_statistics`` is a dict with the keys ``total_tokens``,
            ``unique_tokens``, ``most_common_tokens`` (top 20 ``(token_id,
            count)`` pairs) and ``token_counts`` (decoded token text -> count).
        """
        all_tokens = self.tokenizer.encode(text, add_special_tokens=False)

        token_counter = Counter(all_tokens)

        # Different token ids may decode to the same string (with byte-level
        # tokenizers, partial multi-byte pieces commonly do), so the counts are
        # summed per decoded string instead of letting a later id overwrite an
        # earlier one.
        decoded_counts = Counter()
        for token, count in token_counter.items():
            decoded_counts[self.tokenizer.decode([token])] += count

        token_statistics = {
            "total_tokens": len(all_tokens),
            "unique_tokens": len(token_counter),
            "most_common_tokens": token_counter.most_common(20),
            "token_counts": dict(decoded_counts)
        }

        chunks = []
        # Each window starts `step` tokens after the previous one, so adjacent
        # windows share `overlap` tokens.
        step = self.chunk_size - self.overlap

        for start in range(0, len(all_tokens), step):
            chunk_tokens = all_tokens[start: start + self.chunk_size]

            chunks.append(self.tokenizer.decode(chunk_tokens, skip_special_tokens=True))

            # This window already reaches the end of the text; a further window
            # would only repeat its tail.
            if start + self.chunk_size >= len(all_tokens):
                break

        return chunks, token_statistics

    def get_embedding(self, text: str):
        """
        Generates an embedding for the given text.

        Args:
            text: The text to embed.

        Returns:
            tuple: ``(embedding, all_embeds, token_statistics)``. ``embedding``
            is the unweighted mean of the per-chunk embeddings (a zero vector of
            length ``embedding_dim`` when the text produced no chunks),
            ``all_embeds`` is the list of per-chunk embeddings, and
            ``token_statistics`` is the dict built by ``_chunks``. Returns
            ``(None, None, None)`` when the local model backend is not enabled.
        """
        if self.use_nomic:
            all_embeds = []
            pieces, token_statistics = self._chunks(text)

            if not pieces:
                logging.warning("No text pieces found after chunking. Returning zero vector.")
                zero_embed = np.zeros(self.embedding_dim, dtype=np.float32)
                return zero_embed, [], token_statistics

            for piece in pieces:
                embed = self.model.encode(piece, convert_to_numpy=True)
                all_embeds.append(embed)

            return np.mean(all_embeds, axis=0), all_embeds, token_statistics

        # No other backend is implemented; __init__ never completes without
        # use_nomic, so this is only reachable on a manually built instance.
        return None, None, None

    def process_whole_book_from_text(self, text: str):
        """
        Generates an embedding for a book given its text content.

        Returns:
            tuple: The same triple as ``get_embedding``, computed on the
            cleaned text.
        """
        text = self.clean_book_text(text)
        return self.get_embedding(text)

    def clean_book_text(self, text: str, remove_special_chars=False):
        """
        Simplified cleaner for pre-cleaned .md files.

        Only strips leading and trailing whitespace. ``remove_special_chars``
        is accepted for interface compatibility and is not used.
        """
        return text.strip()


def save_embedding(
    output_path: str,
    all_embeddings_path: str,
    token_statistics_path: str,
    embedding,
    all_embeddings=None,
    token_statistics=None
):
    """
    Saves a single book embedding and its related files to the specified paths.

    Args:
        output_path: Destination ``.npy`` file for the pooled embedding.
        all_embeddings_path: Destination ``.npy`` file for the per-chunk
            embeddings; only written when ``all_embeddings`` is not None.
        token_statistics_path: Destination pickle file; it always receives
            ``{"token_statistics": token_statistics}``.
        embedding: Pooled embedding, stored as float32.
        all_embeddings: Optional sequence of per-chunk embeddings, stored as
            one float32 array.
        token_statistics: Optional statistics object to pickle.

    Raises:
        ValueError: If ``embedding`` is None (it would otherwise be stored as a
            NaN scalar).
    """
    if embedding is None:
        raise ValueError("embedding is None; refusing to save an empty result.")

    for path in (output_path, all_embeddings_path, token_statistics_path):
        parent = os.path.dirname(path)
        # A bare file name has no directory part and os.makedirs("") raises.
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Convert everything before writing so a conversion error leaves no files behind.
    embedding = np.array(embedding, dtype=np.float32)
    all_embeddings = np.array(all_embeddings, dtype=np.float32) if all_embeddings is not None else None

    if all_embeddings is not None:
        np.save(all_embeddings_path, all_embeddings)

    with open(token_statistics_path, 'wb') as f:
        pickle.dump({"token_statistics": token_statistics}, f)

    # The pooled embedding is written last: main() skips a book when this file
    # already exists, so it must only appear once the side files are on disk.
    np.save(output_path, embedding)


def process_book(
    generator: BookEmbeddingGenerator,
    book_text: str,
    book_name: str,
    output_path: str,
    all_embeddings_path: str,
    token_statistics_path: str
):
    """
    Embed one book and persist the results.

    Runs ``generator.process_whole_book_from_text`` on ``book_text`` and hands
    the three values it returns to ``save_embedding`` together with the three
    destination paths. Any exception is logged with ``book_name`` and then
    re-raised to the caller.
    """
    try:
        results = generator.process_whole_book_from_text(book_text)
        book_vector, chunk_vectors, stats = results

        save_embedding(
            output_path,
            all_embeddings_path,
            token_statistics_path,
            embedding=book_vector,
            all_embeddings=chunk_vectors,
            token_statistics=stats,
        )
    except Exception as err:
        logging.warning(f"An error occurred while processing {book_name}: {str(err)}")
        raise err


def main():
    """
    Main function to run the .md file processing pipeline from Google Drive.

    Mounts Google Drive, then for every category in ``CONFIG["input_paths"]``
    and every chunk size embeds each ``*.md`` file of the category folder and
    writes the results under
    ``<drive_base_path>/<output_base_path>/<category>/<model>/chunk_<size>/``.

    Books whose pooled embedding file already exists are skipped, so an
    interrupted run can be resumed by calling ``main()`` again. A failure on a
    single file is logged and does not stop the run. Returns early (after
    logging an error) when the drive cannot be mounted.
    """
    import gc  # local import: only needed to release the model between runs

    try:
        logging.warning("Mounting Google Drive...")
        drive.mount('/content/drive')
        logging.warning("Drive mounted successfully.")
    except Exception as e:
        logging.error(f"Failed to mount Google Drive: {e}")
        return

    chunk_sizes = [4096, 8192]
    logging.warning(f"Will process ONLY for large chunk sizes: {chunk_sizes}")

    for category, input_suffix in CONFIG["input_paths"].items():
        logging.warning(f"\nProcessing category: {category}")
        logging.warning("=" * 50)

        input_dir = os.path.join(CONFIG["drive_base_path"], input_suffix)

        output_dir_base = os.path.join(
            CONFIG["drive_base_path"],
            CONFIG["output_base_path"],
            category
        )

        # glob returns files in arbitrary order; sort for a reproducible run order.
        md_files = sorted(glob.glob(os.path.join(input_dir, "*.md")))
        if not md_files:
            logging.warning(f"No .md files found in {input_dir}. Skipping.")
            continue

        logging.warning(f"Found {len(md_files)} .md files in {input_dir}")

        for chunk_size in chunk_sizes:
            logging.warning(f"\n---\nProcessing for chunk size: {chunk_size}\n---")

            generator = BookEmbeddingGenerator(
                use_nomic=True,
                use_openai=False,
                chunk_size=chunk_size
            )

            try:
                model_name_safe = generator.model_name.replace('/', '_')

                chunk_output_dir = os.path.join(
                    output_dir_base,
                    model_name_safe,
                    f"chunk_{chunk_size}"
                )

                logging.warning(f"Outputting to: {chunk_output_dir}")

                for file_path in tqdm(md_files, desc=f"Chunk {chunk_size}"):

                    try:
                        # splitext removes only the extension; str.replace would
                        # also strip a ".md" occurring inside the file name.
                        book_name = os.path.splitext(os.path.basename(file_path))[0]

                        output_path = os.path.join(chunk_output_dir, "embeddings", f"{book_name}_embedding.npy")
                        all_embeddings_path = os.path.join(chunk_output_dir, "all_embeddings", f"{book_name}_all_embeddings.npy")
                        token_statistics_path = os.path.join(chunk_output_dir, "token_statistics", f"{book_name}_token_statistics.pkl")

                        # Resume support: this book was already embedded.
                        if os.path.exists(output_path):
                            continue

                        with open(file_path, 'r', encoding='utf-8') as f:
                            book_text = f.read()

                        if not book_text.strip():
                            logging.warning(f"File {book_name} is empty. Skipping.")
                            continue

                        process_book(
                            generator,
                            book_text,
                            book_name,
                            output_path,
                            all_embeddings_path,
                            token_statistics_path
                        )
                    except Exception as e:
                        # Best effort: one bad file must not abort the whole run.
                        logging.warning(f"An error occurred while processing {file_path}: {str(e)}")
                    finally:
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
            finally:
                # Drop the model before the next one is constructed; otherwise
                # the old and the new model are alive at the same time.
                del generator
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    logging.warning(f"Cleared CUDA cache after chunk size {chunk_size}.")


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    main()

Processing category: biology_chapters


Mounted at /content/drive


---
Processing for chunk size: 4096
---
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Chunk 4096: 100%|██████████| 47/47 [03:31<00:00,  4.49s/it]
---
Processing for chunk size: 8192
---
Chunk 8192: 100%|██████████| 47/47 [04:16<00:00,  5.46s/it]
Processing category: biology_pages
---
Processing for chunk size: 4096
---
Chunk 4096: 100%|██████████| 1487/1487 [03:05<00:00,  8.03it/s]
---
Processing for chunk size: 8192
---
Chunk 8192: 100%|██████████| 1487/1487 [02:32<00:00,  9.75it/s]
Processing category: physics_chapters
---
Processing for chunk size: 4096
---
Chunk 4096: 100%|██████████| 34/34 [03:26<00:00,  6.08s/it]
---
Processing for chunk size: 8192
---
Chunk 8192: 100%|██████████| 34/34 [04:33<00:00,  8.05s/it]
Processing category: physics_pages
---
Processing for chunk size: 4096
---
Chunk 4096: 100%|██████████| 1697/1697 [04:09<00:00,  6.81it/s]
---
Processing for chunk size: 8192
---
Chunk 8192: 100%|██████████| 1697/1697 [03:27<00:00,  8.20it/s]
Processing category: chemistry_chapters
---
Processing for chunk size: 4096
---
Chunk 4096: 100%|██████████| 21/21 [

In [2]:
import os
import json
import torch
import csv
import warnings
import logging
import requests
import numpy as np
import pickle
import time
import glob
from tqdm import tqdm
import unicodedata

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

from gutenberg_cleaner import simple_cleaner
import re
from openai import OpenAI
from collections import Counter


from google.colab import drive


# Pipeline configuration for this run.
#   drive_base_path  - prefix joined in front of every input and output path.
#   input_paths      - category name -> folder of .md files (relative to
#                      drive_base_path); categories are processed in this order.
#   output_base_path - folder (relative to drive_base_path) that receives the
#                      generated embedding files.
CONFIG = dict(
    drive_base_path="/content/drive/MyDrive/",
    input_paths=dict(
        biology_pages="data-process/output/biology_pages/",
        physics_chapters="data-process/output/physics_chapters_cleaned/",
        physics_pages="data-process/output/physics_pages/",
        chemistry_chapters="data-process/output/chemistry_chapters_cleaned/",
        chemistry_pages="data-process/output/chemistry_pages/",
    ),
    output_base_path="embeddings_output/",
)


class BookEmbeddingGenerator:
    """
    Generates a single embedding for a large text (a book, chapter or page).

    The text is tokenized with the model's own tokenizer, split into
    overlapping token windows, each window is embedded separately, and the
    window embeddings are averaged into one vector.

    Only the local Qwen backend (selected with ``use_nomic=True``) can be
    constructed; selecting the OpenAI backend raises ``NotImplementedError``.
    """

    def __init__(
            self,
            use_nomic=False,
            use_openai=True,
            chunk_size: int = 8192,
            openai_model: str = "text-embedding-3-small",
            device=None
    ):
        """
        Initializes the embedding generator.

        Args:
            use_nomic: Load the local Qwen embedding model. (The flag keeps its
                historical name; the model loaded is Qwen, not Nomic.)
            use_openai: Select the OpenAI backend (not usable yet, see Raises).
            chunk_size: Window length in tokens. Consecutive windows overlap by
                20% of this value.
            openai_model: Name of the OpenAI embedding model.
            device: Device passed to SentenceTransformer. ``None`` picks
                "cuda" when available and falls back to "cpu" otherwise.

        Raises:
            ValueError: If ``chunk_size`` is not positive or no backend is selected.
            NotImplementedError: If only the OpenAI backend is selected.
        """
        # Checked first so a bad value fails before any model is downloaded.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}.")

        self.use_nomic = use_nomic
        self.use_openai = use_openai
        self.chunk_size = chunk_size

        self.overlap = int(chunk_size * 0.20)
        self.openai_model = openai_model

        if use_nomic:
            self.model_name = "Qwen/Qwen3-Embedding-0.6B"  # the 0.6B checkpoint

            if device is None:
                # Do not assume a GPU runtime: fall back to CPU instead of crashing.
                device = "cuda" if torch.cuda.is_available() else "cpu"
                if device == "cpu":
                    logging.warning("CUDA is not available; loading the model on CPU (this will be slow).")
            self.device = device

            logging.warning(f"Loading Qwen model: {self.model_name}")

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )

            self.model = SentenceTransformer(
                self.model_name,
                trust_remote_code=True,
                device=device
            )

            self.embedding_dim = self.model.get_sentence_embedding_dimension()
            logging.warning(f"Model loaded. Embedding dimension: {self.embedding_dim}")

        elif use_openai:
            self.model_name = "open_ai"
            with open("keys.txt") as f:
                api_key = f.read().strip()
            self.client = OpenAI(api_key=api_key)
            # NOTE(review): the client is created, but no tokenizer is set up
            # for this backend, so construction is still rejected here.
            raise NotImplementedError("OpenAI path needs tokenizer integration, focusing on Qwen.")

        else:
            raise ValueError("An embedding model must be selected (use_nomic or use_openai).")

    def _chunks(self, text: str):
        """
        Splits a text into overlapping chunks based on token count.

        Args:
            text (str): The input text to be chunked.

        Returns:
            tuple: ``(chunks, token_statistics)`` where ``chunks`` is a list of
            decoded chunk strings (empty when the text has no tokens) and
            ``token_statistics`` is a dict with the keys ``total_tokens``,
            ``unique_tokens``, ``most_common_tokens`` (top 20 ``(token_id,
            count)`` pairs) and ``token_counts`` (decoded token text -> count).
        """
        all_tokens = self.tokenizer.encode(text, add_special_tokens=False)

        token_counter = Counter(all_tokens)

        # Different token ids may decode to the same string (with byte-level
        # tokenizers, partial multi-byte pieces commonly do), so the counts are
        # summed per decoded string instead of letting a later id overwrite an
        # earlier one.
        decoded_counts = Counter()
        for token, count in token_counter.items():
            decoded_counts[self.tokenizer.decode([token])] += count

        token_statistics = {
            "total_tokens": len(all_tokens),
            "unique_tokens": len(token_counter),
            "most_common_tokens": token_counter.most_common(20),
            "token_counts": dict(decoded_counts)
        }

        chunks = []

        # Each window starts `step` tokens after the previous one, so adjacent
        # windows share `overlap` tokens.
        step = self.chunk_size - self.overlap

        for start in range(0, len(all_tokens), step):
            chunk_tokens = all_tokens[start: start + self.chunk_size]

            chunks.append(self.tokenizer.decode(chunk_tokens, skip_special_tokens=True))

            # This window already reaches the end of the text; a further window
            # would only repeat its tail.
            if start + self.chunk_size >= len(all_tokens):
                break

        return chunks, token_statistics

    def get_embedding(self, text: str):
        """
        Generates an embedding for the given text. For large texts,
        it splits the text into chunks and averages their embeddings.

        Args:
            text (str): The input text.

        Returns:
            tuple: ``(embedding, all_embeds, token_statistics)``. ``embedding``
            is the unweighted mean of the per-chunk embeddings, ``all_embeds``
            is the list of per-chunk embeddings and ``token_statistics`` is the
            dict built by ``_chunks``. With the local model, a text that yields
            no chunks returns a zero vector of length ``embedding_dim`` and an
            empty list. Returns ``(None, None, None)`` when no backend flag is set.
        """
        if self.use_nomic:
            all_embeds = []

            pieces, token_statistics = self._chunks(text)

            if not pieces:
                logging.warning("No text pieces found after chunking (text might be empty). Returning zero vector.")
                zero_embed = np.zeros(self.embedding_dim, dtype=np.float32)
                return zero_embed, [], token_statistics

            for piece in pieces:
                embed = self.model.encode(piece, convert_to_numpy=True)
                all_embeds.append(embed)

            return np.mean(all_embeds, axis=0), all_embeds, token_statistics

        if self.use_openai:
            # NOTE(review): __init__ raises for this backend, so this branch is
            # only reachable on an instance whose `client` and `tokenizer` were
            # assigned by hand.
            all_embeds = []
            pieces, token_statistics = self._chunks(text)
            for piece in pieces:
                resp = self.client.embeddings.create(
                    input=[piece],
                    model=self.openai_model,
                    encoding_format="float"
                )
                all_embeds.append(np.array(resp.data[0].embedding))
            # Fixed: previously returned the undefined name `all_embeddings`.
            return np.mean(all_embeds, axis=0), all_embeds, token_statistics

        return None, None, None

    def process_whole_book_from_text(self, text: str):
        """
        Generates an embedding for a book given its text content.

        Args:
            text (str): The entire text of the book.

        Returns:
            tuple: The same triple as ``get_embedding``, computed on the
            cleaned text.
        """
        text = self.clean_book_text(text)
        return self.get_embedding(text)

    def clean_book_text(self, text: str, remove_special_chars=False):
        """
        Cleans the book text.

        Only strips leading and trailing whitespace, as the .md files are
        pre-cleaned. ``remove_special_chars`` is accepted for interface
        compatibility and is not used.
        """
        return text.strip()


def save_embedding(
    output_path: str,
    all_embeddings_path: str,
    token_statistics_path: str,
    embedding,
    all_embeddings=None,
    token_statistics=None
):
    """
    Saves a single book embedding and its related files to the specified paths.

    Args:
        output_path: Destination ``.npy`` file for the pooled embedding.
        all_embeddings_path: Destination ``.npy`` file for the per-chunk
            embeddings; only written when ``all_embeddings`` is not None.
        token_statistics_path: Destination pickle file; it always receives
            ``{"token_statistics": token_statistics}``.
        embedding: Pooled embedding, stored as float32.
        all_embeddings: Optional sequence of per-chunk embeddings, stored as
            one float32 array.
        token_statistics: Optional statistics object to pickle.

    Raises:
        ValueError: If ``embedding`` is None (it would otherwise be stored as a
            NaN scalar).
    """
    if embedding is None:
        raise ValueError("embedding is None; refusing to save an empty result.")

    for path in (output_path, all_embeddings_path, token_statistics_path):
        parent = os.path.dirname(path)
        # A bare file name has no directory part and os.makedirs("") raises.
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Convert everything before writing so a conversion error leaves no files behind.
    embedding = np.array(embedding, dtype=np.float32)
    all_embeddings = np.array(all_embeddings, dtype=np.float32) if all_embeddings is not None else None

    if all_embeddings is not None:
        np.save(all_embeddings_path, all_embeddings)

    with open(token_statistics_path, 'wb') as f:
        pickle.dump({"token_statistics": token_statistics}, f)

    # The pooled embedding is written last: main() skips a book when this file
    # already exists, so it must only appear once the side files are on disk.
    np.save(output_path, embedding)



def process_book(
    generator: BookEmbeddingGenerator,
    book_text: str,
    book_name: str,
    output_path: str,
    all_embeddings_path: str,
    token_statistics_path: str
):
    """
    Embed one book and persist the results.

    Runs ``generator.process_whole_book_from_text`` on ``book_text`` and hands
    the three values it returns to ``save_embedding`` together with the three
    destination paths. Any exception is logged with ``book_name`` and is not
    propagated: the function then returns ``None`` like a successful call.
    """
    try:
        results = generator.process_whole_book_from_text(book_text)
        book_vector, chunk_vectors, stats = results

        save_embedding(
            output_path,
            all_embeddings_path,
            token_statistics_path,
            embedding=book_vector,
            all_embeddings=chunk_vectors,
            token_statistics=stats,
        )
    except Exception as err:
        logging.warning(f"An error occurred while processing {book_name}: {str(err)}")


def main():
    """
    Main function to run the .md file processing pipeline from Google Drive.

    Mounts Google Drive, then for every category in ``CONFIG["input_paths"]``
    and every chunk size (256, 512, 1024, 2048) embeds each ``*.md`` file of
    the category folder and writes the results under
    ``<drive_base_path>/<output_base_path>/<category>/<model>/chunk_<size>/``.

    Books whose pooled embedding file already exists are skipped, so an
    interrupted run can be resumed by calling ``main()`` again. A failure on a
    single file is logged and does not stop the run. Returns early (after
    logging an error) when the drive cannot be mounted.
    """
    import gc  # local import: only needed to release the model between runs

    try:
        logging.warning("Mounting Google Drive...")
        drive.mount('/content/drive')
        logging.warning("Drive mounted successfully.")
    except Exception as e:
        logging.error(f"Failed to mount Google Drive: {e}")
        return

    # Doubling sequence: 256, 512, 1024, 2048.
    chunk_sizes = [256 * (2**i) for i in range(4)]
    logging.warning(f"Will process for (safer) chunk sizes: {chunk_sizes}")

    for category, input_suffix in CONFIG["input_paths"].items():
        logging.warning(f"\nProcessing category: {category}")
        logging.warning("=" * 50)

        input_dir = os.path.join(CONFIG["drive_base_path"], input_suffix)

        output_dir_base = os.path.join(
            CONFIG["drive_base_path"],
            CONFIG["output_base_path"],
            category
        )

        # glob returns files in arbitrary order; sort for a reproducible run order.
        md_files = sorted(glob.glob(os.path.join(input_dir, "*.md")))
        if not md_files:
            logging.warning(f"No .md files found in {input_dir}. Skipping.")
            continue

        logging.warning(f"Found {len(md_files)} .md files in {input_dir}")

        for chunk_size in chunk_sizes:
            logging.warning(f"\n---\nProcessing for chunk size: {chunk_size}\n---")

            generator = BookEmbeddingGenerator(
                use_nomic=True,
                use_openai=False,
                chunk_size=chunk_size
            )

            try:
                model_name_safe = generator.model_name.replace('/', '_')

                chunk_output_dir = os.path.join(
                    output_dir_base,
                    model_name_safe,
                    f"chunk_{chunk_size}"
                )

                logging.warning(f"Outputting to: {chunk_output_dir}")

                for file_path in tqdm(md_files, desc=f"Chunk {chunk_size}"):

                    try:
                        # splitext removes only the extension; str.replace would
                        # also strip a ".md" occurring inside the file name.
                        book_name = os.path.splitext(os.path.basename(file_path))[0]

                        output_path = os.path.join(chunk_output_dir, "embeddings", f"{book_name}_embedding.npy")
                        all_embeddings_path = os.path.join(chunk_output_dir, "all_embeddings", f"{book_name}_all_embeddings.npy")
                        token_statistics_path = os.path.join(chunk_output_dir, "token_statistics", f"{book_name}_token_statistics.pkl")

                        # Resume support: this book was already embedded.
                        if os.path.exists(output_path):
                            continue

                        with open(file_path, 'r', encoding='utf-8') as f:
                            book_text = f.read()

                        if not book_text.strip():
                            logging.warning(f"File {book_name} is empty. Skipping.")
                            continue

                        process_book(
                            generator,
                            book_text,
                            book_name,
                            output_path,
                            all_embeddings_path,
                            token_statistics_path
                        )
                    except Exception as e:
                        # Best effort: one bad file must not abort the whole run.
                        logging.warning(f"An error occurred while processing {file_path}: {str(e)}")
                    finally:
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
            finally:
                # Drop the model before the next one is constructed; otherwise
                # the old and the new model are alive at the same time.
                del generator
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    logging.warning(f"Cleared CUDA cache after chunk size {chunk_size}.")


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    main()


Processing category: biology_pages


Mounted at /content/drive


---
Processing for chunk size: 256
---
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Chunk 256: 100%|██████████| 1487/1487 [00:34<00:00, 43.27it/s]
---
Processing for chunk size: 512
---
Chunk 512: 100%|██████████| 1487/1487 [00:07<00:00, 189.23it/s]
---
Processing for chunk size: 1024
---
Chunk 1024: 100%|██████████| 1487/1487 [02:34<00:00,  9.63it/s]
---
Processing for chunk size: 2048
---
Chunk 2048: 100%|██████████| 1487/1487 [02:31<00:00,  9.84it/s]
Processing category: physics_chapters
---
Processing for chunk size: 256
---
Chunk 256: 100%|██████████| 34/34 [03:55<00:00,  6.93s/it]
---
Processing for chunk size: 512
---
Chunk 512: 100%|██████████| 34/34 [02:08<00:00,  3.77s/it]
---
Processing for chunk size: 1024
---
Chunk 1024: 100%|██████████| 34/34 [02:02<00:00,  3.60s/it]
---
Processing for chunk size: 2048
---
Chunk 2048: 100%|██████████| 34/34 [02:19<00:00,  4.11s/it]
Processing category: physics_pages
---
Processing for chunk size: 256
---
Chunk 256: 100%|██████████| 1697/1697 [07:36<00:00,  3.72it/s]
---
Processing for chunk size: 512
---
Chunk 512: 100%|

In [4]:
# This cell installs the libraries we need
# scikit-learn (sklearn) is for the analysis (PCA, t-SNE, K-Means)
# plotly is for the interactive visualizations
!pip install scikit-learn plotly pandas



In [5]:
import os
import glob
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.express as px
from google.colab import drive
import random

# Loads a (seeded) random sample of the pooled embeddings written by the
# pipeline, projects them to 2-D with t-SNE and shows two interactive scatter
# plots: one coloured by subject folder, one by chunk-size folder.
try:
    print("Mounting Google Drive...")
    drive.mount('/content/drive')
except Exception as e:
    print(f"Error mounting drive: {e}")


BASE_EMBEDDING_PATH = "/content/drive/MyDrive/embeddings_output/"
SAMPLE_SIZE = 2000
RANDOM_SEED = 42        # used for the file sample and for t-SNE so reruns match
TSNE_PERPLEXITY = 30    # upper bound; lowered automatically for tiny samples


print(f"Loading all embeddings from: {BASE_EMBEDDING_PATH}")

embedding_list = []
label_list = []
subject_list = []
model_list = []
chunk_list = []

search_pattern = os.path.join(BASE_EMBEDDING_PATH, "**", "embeddings", "*_embedding.npy")
# Sorted so that the seeded sample below picks the same files on every run.
file_paths = sorted(glob.glob(search_pattern, recursive=True))

if not file_paths:
    print("\n---!! ERROR !!----")
    print(f"No .npy files found using the pattern: {search_pattern}")
else:
    print(f"Found {len(file_paths)} total embedding files.")

    was_sampled = len(file_paths) > SAMPLE_SIZE
    if was_sampled:
        print(f"File list is too large. Taking a random sample of {SAMPLE_SIZE} files.")
        # A private Random instance keeps the sample reproducible without
        # touching the global random state.
        file_paths = random.Random(RANDOM_SEED).sample(file_paths, SAMPLE_SIZE)

    print(f"Now loading {len(file_paths)} files...")

    for file_path in file_paths:
        try:
            embedding = np.load(file_path)
            if embedding.ndim != 1:
                raise ValueError(f"expected a 1-D vector, got shape {embedding.shape}")
            if embedding_list and embedding.shape != embedding_list[0].shape:
                raise ValueError(
                    f"dimension {embedding.shape} differs from {embedding_list[0].shape}"
                )

            filename = os.path.basename(file_path)
            label = filename.replace("_embedding.npy", "")

            # Layout: <subject>/<model>/<chunk_N>/embeddings/<label>_embedding.npy
            embeddings_dir = os.path.dirname(file_path)
            chunk_dir = os.path.dirname(embeddings_dir)
            model_dir = os.path.dirname(chunk_dir)
            subject_dir = os.path.dirname(model_dir)
        except Exception as e:
            print(f"Error loading or parsing {file_path}: {e}")
            continue

        # Append to all lists together, only after everything succeeded, so the
        # lists can never get out of step with each other.
        embedding_list.append(embedding)
        label_list.append(label)
        chunk_list.append(os.path.basename(chunk_dir))
        model_list.append(os.path.basename(model_dir))
        subject_list.append(os.path.basename(subject_dir))

    if len(embedding_list) < 2:
        print(f"Only {len(embedding_list)} embedding(s) could be loaded; at least 2 are needed for t-SNE.")
    else:
        X = np.vstack(embedding_list)

        print(f"Loaded {len(label_list)} embeddings.")
        print(f"Data shape (files, dimensions): {X.shape}")

        # t-SNE requires perplexity < number of samples.
        perplexity = min(TSNE_PERPLEXITY, len(label_list) - 1)

        print(f"Running t-SNE on {len(label_list)} points... This might take a minute.")
        tsne = TSNE(
            n_components=2,
            perplexity=perplexity,
            random_state=RANDOM_SEED,
            n_jobs=-1
        )
        X_tsne = tsne.fit_transform(X)

        df = pd.DataFrame({
            'label': label_list,
            'subject': subject_list,
            'model': model_list,
            'chunk_size': chunk_list,
            'tsne_x': X_tsne[:, 0],
            'tsne_y': X_tsne[:, 1],
        })

        print("DataFrame created. Ready to plot.")

        # Describe what is actually plotted instead of hard-coding "2,000".
        if was_sampled:
            sample_note = f"{len(df):,} Random Samples"
        else:
            sample_note = f"All {len(df):,} Embeddings"

        print("Generating Plot 1: Colored by Subject")
        fig1 = px.scatter(
            df,
            x='tsne_x',
            y='tsne_y',
            color='subject',
            hover_name='label',
            hover_data=['chunk_size', 'model', 'subject'],
            title=f't-SNE Visualization ({sample_note}), Colored by Subject',
            template='plotly_dark'
        )
        fig1.update_layout(height=800)
        fig1.show()

        print("Generating Plot 2: Colored by Chunk Size")
        fig2 = px.scatter(
            df,
            x='tsne_x',
            y='tsne_y',
            color='chunk_size',
            hover_name='label',
            hover_data=['chunk_size', 'model', 'subject'],
            title=f't-SNE Visualization ({sample_note}), Colored by Chunk Size',
            template='plotly_dark'
        )
        fig2.update_layout(height=800)
        fig2.show()

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading all embeddings from: /content/drive/MyDrive/embeddings_output/
Found 17744 total embedding files.
File list is too large. Taking a random sample of 2000 files.
Now loading 2000 files...
Loaded 2000 embeddings.
Data shape (files, dimensions): (2000, 1024)
Running t-SNE on 2000 points... This might take a minute.
DataFrame created. Ready to plot.
Generating Plot 1: Colored by Subject


Generating Plot 2: Colored by Chunk Size
