<a href="https://colab.research.google.com/github/WilliamShengYangHuang/AALU_Workshop_3/blob/main/Text_Embeddings_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# @title
# Required installations
# !pip install transformers scikit-learn pandas numpy torch plotly

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV paths used later in this
# notebook (under '/content/drive/My Drive/') can be read and written.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Enter the input and output CSV file paths below, then select a dimensionality reduction method (PCA or t-SNE) for data projection.



In [21]:
# Load the pre-trained tokenizer and encoder from one shared checkpoint name,
# so the two can never drift apart if the checkpoint is changed.
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Function to extract a single embedding vector for an entire text
def get_cell_embedding(text, model, tokenizer, max_length=512):
    """Embed one piece of text using the first-token ([CLS]) hidden state.

    Args:
        text: The text to embed.
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object exposing ``last_hidden_state``.
        tokenizer: Callable that turns ``text`` into PyTorch tensors.
        max_length: Token limit; longer inputs are truncated.

    Returns:
        A numpy array holding the first-token hidden state with all
        size-1 axes removed (``(embedding_dim,)`` for a single input text).
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    # Inference only: skip gradient tracking during the forward pass
    with torch.no_grad():
        result = model(**encoded)

    # Position 0 of the sequence axis is the [CLS] token
    cls_state = result.last_hidden_state[:, 0, :]
    return cls_state.squeeze().numpy()

# Read every non-empty text cell below the header row of a CSV file
def read_text_from_csv(file_path):
    """Return the text of every non-empty data cell in a CSV file.

    The first row of the file is treated as the header and is not returned.
    Cells are read row by row, left to right.

    Args:
        file_path: Path to the CSV file.

    Returns:
        A list of strings, one per non-empty cell. Empty cells are skipped
        and numeric-looking cells are kept as their original text.
    """
    # dtype=str stops pandas from converting numeric-looking cells to numbers,
    # which the tokenizer cannot consume.
    try:
        df = pd.read_csv(file_path, header=0, encoding='utf-8', dtype=str)
    except UnicodeDecodeError:
        # Fall back for files that are not valid UTF-8 (e.g. legacy exports)
        df = pd.read_csv(file_path, header=0, encoding='latin1', dtype=str)

    # header=0 has already consumed the header row, so every remaining row is
    # data; slicing off another row here would drop the first real record.
    cells = df.to_numpy().ravel()

    # Missing cells come back as NaN rather than text; skip them.
    return [str(cell) for cell in cells if pd.notna(cell)]

# Main process: Convert each cell into embeddings and save to CSV
def process_csv_cells_to_embeddings(input_csv_path, output_csv_path, method='pca'):
    """Embed every text cell of a CSV, project to 2D/3D, and save the result.

    Uses the module-level ``model`` and ``tokenizer`` to embed each text
    returned by ``read_text_from_csv``, reduces the embeddings with PCA or
    t-SNE, and writes the coordinates plus the original text to a CSV.

    Args:
        input_csv_path: Path of the CSV holding the text cells.
        output_csv_path: Path the coordinate CSV is written to. Columns are
            'x', 'y' (and 'z' when three components are possible) and 'text'.
        method: 'pca' or 'tsne' (case-insensitive).

    Raises:
        ValueError: If ``method`` is unknown, fewer than two texts are
            available, the embeddings have fewer than two dimensions, or the
            embeddings contain NaN values.
    """
    # Validate the method first so a typo fails before the slow embedding pass
    method = method.lower()
    if method not in ('pca', 'tsne'):
        raise ValueError(f"Invalid method '{method}'. Please choose either 'pca' or 'tsne'.")

    # Read text content from the CSV
    texts = read_text_from_csv(input_csv_path)
    if len(texts) < 2:
        # Neither PCA nor t-SNE can produce a 2D projection from fewer than
        # two samples, and an empty list would yield a shapeless array below.
        raise ValueError(
            f"Need at least 2 text cells to build a latent space, "
            f"found {len(texts)} in {input_csv_path}."
        )

    # Get embeddings for all text cells in the CSV
    all_embeddings = np.array([get_cell_embedding(text, model, tokenizer) for text in texts])

    # Check for invalid embeddings
    if np.isnan(all_embeddings).any():
        raise ValueError("Embeddings contain NaN values. Check the input data.")

    # The number of components cannot exceed either the sample or feature count
    n_samples, n_features = all_embeddings.shape
    n_components = min(3, n_samples, n_features)
    if n_components < 2:
        raise ValueError("Embeddings need at least 2 dimensions to be visualised.")
    if n_components < 3:
        print("Reducing to 2D as data is insufficient for 3D.")

    # Choose dimensionality reduction method: PCA or t-SNE
    if method == 'pca':
        reducer = PCA(n_components=n_components)
    else:
        # t-SNE requires perplexity < n_samples, so cap the usual 5..30 range
        # at n_samples - 1 for very small inputs.
        perplexity = min(30, max(5, n_samples // 3), n_samples - 1)
        reducer = TSNE(n_components=n_components, random_state=42, perplexity=perplexity)
        print(f"Using t-SNE with perplexity={perplexity}.")

    # Perform dimensionality reduction
    reduced_embeddings = reducer.fit_transform(all_embeddings)

    # Name the coordinate columns x, y (and z when three components are kept)
    column_names = ['x', 'y', 'z'][:n_components]

    # Save the coordinates together with the original text for reference
    df = pd.DataFrame(reduced_embeddings, columns=column_names)
    df['text'] = texts
    df.to_csv(output_csv_path, index=False)
    print(f"Latent space saved to {output_csv_path}")

# Function to visualise the saved latent space interactively using plotly
def visualise_latent_space_interactive(csv_path):
    """Plot the coordinates stored in a latent-space CSV with Plotly.

    Draws a 3D scatter when the CSV has a 'z' column, otherwise a 2D scatter
    when it has a 'y' column. Each point is labelled from the 'text' column.

    Args:
        csv_path: Path to the CSV containing the coordinates and text.

    Raises:
        ValueError: If the CSV has neither a 'z' nor a 'y' column.
    """
    coords = pd.read_csv(csv_path)
    available = coords.columns

    # Guard: at least two coordinate axes are required for any plot
    if 'z' not in available and 'y' not in available:
        raise ValueError("Insufficient dimensions for visualisation. Ensure data has at least 2D.")

    if 'z' in available:
        fig = px.scatter_3d(coords, x='x', y='y', z='z', text='text', title="3D Latent Space Visualisation")
    else:
        fig = px.scatter(coords, x='x', y='y', text='text', title="2D Latent Space Visualisation")

    # Same marker size for both plot types, adjusted for better visibility
    fig.update_traces(marker=dict(size=3))

    # Display the plot
    fig.show()

# Input and output CSV paths on the mounted Drive (editable via the Colab form fields)
input_csv_path = '/content/drive/My Drive/test_text.csv'  #@param{type:'string'}
output_csv_path = '/content/drive/My Drive/cell_latent_space.csv'   #@param{type:'string'}
method = 'TSNE'  #@param{type:'string'}['PCA', 'TSNE']

# Embed every CSV cell, project the embeddings with the chosen method
# (matched case-insensitively), and save the coordinates to output_csv_path
process_csv_cells_to_embeddings(input_csv_path, output_csv_path, method)

# Reload the saved coordinates and show the interactive scatter plot
visualise_latent_space_interactive(output_csv_path)


Using t-SNE with perplexity=9.
Latent space saved to /content/drive/My Drive/cell_latent_space.csv
