<a href="https://colab.research.google.com/github/WilliamShengYangHuang/AALU_Workshop_3/blob/main/Text_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title
# Required installations
# !pip install transformers scikit-learn pandas numpy torch plotly

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
from google.colab import drive

# Mount Google Drive at /content/drive so that the input folder and output CSV
# paths used later in this notebook (under '/content/drive/My Drive/') resolve.
drive.mount('/content/drive')

Mounted at /content/drive


## Enter the text folder path below, then select a dimensionality reduction method for data projection.



In [13]:
# Load pre-trained model and tokenizer (e.g., 'bert-base-uncased').
# The checkpoint name is defined once so that the tokenizer and the model can
# never be loaded from two different checkpoints when the name is changed.
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Function to extract embeddings for each token in the text (split into chunks if too long)
def get_token_embeddings(text, model, tokenizer, max_length=512):
    """Return the tokens of ``text`` and one embedding vector per token.

    The text is tokenised once without truncation; the resulting id sequence is
    cut into consecutive chunks of at most ``max_length`` ids, and every chunk
    is passed through ``model`` separately.

    Args:
        text: The string to embed.
        model: Callable accepting ``input_ids=<tensor of shape (1, n)>`` and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping with an ``"input_ids"`` tensor,
            and providing ``convert_ids_to_tokens``.
        max_length: Maximum number of token ids fed to the model at once.
            If the model advertises a smaller ``config.max_position_embeddings``
            the smaller value is used instead.

    Returns:
        ``(all_tokens, all_embeddings)``: a list of token strings and a list of
        1-D numpy arrays (one per token, in the same order).

    Raises:
        ValueError: If ``max_length`` is smaller than 1.

    NOTE(review): whatever special tokens the tokenizer adds appear only once
    for the whole text, so chunks other than the first/last are fed to the
    model without them -- confirm this is acceptable for the model in use.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length}.")

    # Never feed the model more positions than it advertises; longer chunks
    # would fail inside the model instead of producing embeddings.
    position_limit = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    if isinstance(position_limit, int) and 0 < position_limit < max_length:
        max_length = position_limit

    inputs = tokenizer(text, return_tensors="pt", truncation=False, padding=False)
    input_ids = inputs["input_ids"][0]

    # Run each chunk on whatever device the model lives on (falls back to the
    # ids' own device when the model does not expose parameters).
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration, TypeError):
        device = input_ids.device

    all_tokens = []
    all_embeddings = []

    # Stepping by max_length never yields an empty trailing chunk, even when
    # the sequence length is an exact multiple of max_length.
    for start in range(0, len(input_ids), max_length):
        chunk = input_ids[start:start + max_length]

        # Get tokens for the chunk (without decoding)
        all_tokens.extend(tokenizer.convert_ids_to_tokens(chunk.tolist()))

        with torch.no_grad():
            outputs = model(input_ids=chunk.unsqueeze(0).to(device))

        # .numpy() only works on CPU tensors that do not require grad.
        chunk_embeddings = outputs.last_hidden_state[0].detach().cpu().numpy()  # (num_tokens_in_chunk, embedding_dim)
        all_embeddings.extend(chunk_embeddings)

    return all_tokens, all_embeddings

# Read all TXT files from the folder and return their content
def read_text_files(folder_path):
    """Return the contents of every ``.txt`` file directly inside ``folder_path``.

    Files are read as UTF-8 and returned in sorted filename order, so repeated
    runs over the same folder produce the same sequence. Entries that end in
    ``.txt`` but are not regular files (e.g. directories) are skipped.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        A list of strings, one per text file.
    """
    texts = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory named "something.txt" would make open() fail.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts

# Main process: Convert each token into embeddings and save to CSV
def process_tokens_to_csv(folder_path, output_csv_path, method='pca'):
    """Embed every token of every text file, project to <= 3-D, and write a CSV.

    Reads the ``.txt`` files in ``folder_path`` with ``read_text_files``,
    embeds them with ``get_token_embeddings`` using the module-level ``model``
    and ``tokenizer``, reduces the embeddings with PCA or t-SNE, and saves the
    coordinates (columns ``x``, ``y``, ``z`` as available) together with a
    ``token`` column to ``output_csv_path``.

    Args:
        folder_path: Directory containing the input ``.txt`` files.
        output_csv_path: Destination path of the CSV file.
        method: ``'pca'`` or ``'tsne'`` (case-insensitive).

    Raises:
        ValueError: If ``method`` is not recognised, if no tokens were
            produced, if token and embedding counts differ, or if t-SNE is
            requested with fewer than two tokens.
    """
    # Validate the method first so a typo fails before the expensive embedding pass.
    method = method.lower()  # Make it case-insensitive
    if method not in ('pca', 'tsne'):
        raise ValueError(f"Invalid method '{method}'. Please choose either 'pca' or 'tsne'.")

    texts = read_text_files(folder_path)

    all_tokens = []
    all_embeddings = []

    # Get embeddings for all tokens in all texts
    for text in texts:
        tokens, embeddings = get_token_embeddings(text, model, tokenizer)
        all_tokens.extend(tokens)
        all_embeddings.extend(embeddings)

    if len(all_embeddings) == 0:
        raise ValueError(f"No tokens to embed: no .txt content found in '{folder_path}'.")

    # Check alignment before building the DataFrame; assigning a column of the
    # wrong length would otherwise fail with a less specific pandas error.
    if len(all_embeddings) != len(all_tokens):
        raise ValueError(f"Length of embeddings ({len(all_embeddings)}) does not match the number of tokens ({len(all_tokens)}).")

    all_embeddings = np.array(all_embeddings)

    # Adjust n_components based on the data
    n_samples, n_features = all_embeddings.shape
    n_components = min(3, n_samples, n_features)

    # Choose dimensionality reduction method: PCA or t-SNE
    if method == 'pca':
        # random_state only matters for the randomised/arpack solvers, but
        # fixing it keeps large runs reproducible, mirroring the t-SNE branch.
        reducer = PCA(n_components=n_components, random_state=42)
    else:
        if n_samples < 2:
            raise ValueError("t-SNE requires at least two tokens.")
        # t-SNE requires perplexity < n_samples; keep the default of 30 when
        # there is enough data and shrink it for very small inputs.
        perplexity = float(min(30, n_samples - 1))
        reducer = TSNE(n_components=n_components, perplexity=perplexity, random_state=42)

    # Perform dimensionality reduction
    reduced_embeddings = reducer.fit_transform(all_embeddings)

    # Dynamically set column names (x, y, z), adjusting if dimensions are fewer than 3
    column_names = ['x', 'y', 'z'][:n_components]

    # Create DataFrame to save tokens and embeddings
    df = pd.DataFrame(reduced_embeddings, columns=column_names)
    df['token'] = all_tokens  # Store tokens in the DataFrame
    df.to_csv(output_csv_path, index=False)
    print(f"Latent space saved to {output_csv_path}")

# Define the folder path for TXT files and the output CSV path.
# The trailing `#@param` markers are Colab form-field annotations and are kept exactly as written.
folder_path = '/content/drive/My Drive/text_test'  #@param{type:'string'}
output_csv_path = '/content/drive/My Drive/token_latent_space.csv'  # Save to Google Drive
# Dimensionality reduction method; process_tokens_to_csv lower-cases it, so 'PCA' and 'pca' are equivalent.
method = 'PCA'  #@param{type:'string'}['TSNE', 'PCA']
# Run the process and save results
process_tokens_to_csv(folder_path, output_csv_path, method)


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884

Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors


Latent space saved to /content/drive/My Drive/token_latent_space.csv


## Latent Space Preview

In [16]:
# @title
# Function to visualise the 3D latent space interactively using plotly
def visualise_latent_space_interactive(csv_path):
    """Show the token latent space stored in ``csv_path`` as a Plotly scatter.

    The CSV is expected to contain ``x``, ``y`` and ``token`` columns, plus an
    optional ``z`` column. With ``z`` present a 3-D scatter is drawn, otherwise
    a 2-D one. Markers are hidden (size 0, opacity 0) so that only the token
    text labels are visible.

    Args:
        csv_path: Path of the CSV file to read.
    """
    # Tokens are arbitrary strings: with pandas' default NA handling, literal
    # tokens such as "null", "nan" or "NA" would be read back as missing
    # values and lose their labels. Read the token column verbatim instead.
    df = pd.read_csv(csv_path, dtype={'token': str}, keep_default_na=False)

    # Check if 'z' column is available for 3D plot, otherwise do a 2D plot
    if 'z' in df.columns:
        fig = px.scatter_3d(df, x='x', y='y', z='z', text='token', title="3D Latent Space Visualisation", opacity=0)
    else:
        fig = px.scatter(df, x='x', y='y', text='token', title="2D Latent Space Visualisation", opacity=0)

    # Hide the markers themselves; only the token labels should be drawn.
    fig.update_traces(marker=dict(size=0))

    # Display the plot
    fig.show()

# Run the interactive visualisation on the CSV at `output_csv_path`
# (the variable is defined in the earlier configuration cell, which must have been run first).
visualise_latent_space_interactive(output_csv_path)
