This notebook generates and processes embeddings of protein sequences using the ProtT5 transformer model. The goal is to perform spatial analysis on these sequences, using Principal Component Analysis (PCA) to reduce the dimensionality of their embeddings. The process includes loading sequence data from a CSV file (given as a URL or path), preprocessing the sequences (replacing the rare or ambiguous amino acid codes U, Z, O and B with X and separating residues with spaces, as the ProtT5 tokenizer expects), converting them into numerical embeddings with the ProtT5 encoder, and finally applying PCA to produce a compressed, lower-dimensional representation of the data that retains its main directions of variance. The result is saved in a new CSV file.

In [None]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5EncoderModel
import re
import numpy as np
from sklearn.decomposition import PCA

In [None]:
def download_data(url):
    """Load a comma-separated table into a DataFrame.

    Args:
        url: Location of the CSV data; handed straight to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table.
    """
    table = pd.read_csv(url, sep=',')
    return table

def setup_model():
    """Load the ProtT5 encoder and its tokenizer onto the selected device.

    Uses ``cuda:0`` when CUDA is available and the CPU otherwise. The model is
    cast to half precision on GPU and to full precision (float32) on CPU, then
    moved to the device and put into evaluation mode.

    Returns:
        tuple: ``(model, tokenizer, device)`` where ``device`` is a
        ``torch.device``.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Using device: {}".format(device))

    transformer_link = "Rostlab/prot_t5_xl_half_uniref50-enc"
    print("Loading: {}".format(transformer_link))
    model = T5EncoderModel.from_pretrained(transformer_link)

    # Only cast to full precision if no GPU is available.
    # Test ``device.type`` rather than comparing the torch.device object to a
    # string, and use ``float()``: PyTorch modules have no ``full()`` method.
    if device.type == 'cpu':
        model = model.float()
    else:
        model = model.half()

    model = model.to(device)
    model = model.eval()
    tokenizer = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False)
    return model, tokenizer, device

def preprocess_sequences(sequence_examples, tokenizer):
    """Prepare raw sequence strings for tokenization.

    Every occurrence of the characters U, Z, O or B is replaced by X, and the
    characters of each sequence are then separated by single spaces.

    Args:
        sequence_examples: Iterable of sequence strings.
        tokenizer: Not used by this function; kept for interface compatibility.

    Returns:
        list[str]: One space-separated string per input sequence.
    """
    prepared = []
    for raw_sequence in sequence_examples:
        cleaned = re.sub(r"[UZOB]", "X", raw_sequence)
        # Joining a string directly iterates over its characters.
        prepared.append(" ".join(cleaned))
    return prepared

def generate_embeddings(sequence_examples, tokenizer, model, device, batch_size=10):
    """Run the sequences through the model in batches and collect the outputs.

    All sequences are padded to the longest tokenized length found in
    ``sequence_examples`` so that the per-batch outputs can be concatenated.

    Args:
        sequence_examples: Sequence of preprocessed sequence strings.
        tokenizer: Tokenizer providing ``encode`` and ``batch_encode_plus``.
        model: Callable accepting ``input_ids`` and ``attention_mask`` and
            returning an object with a ``last_hidden_state`` tensor.
        device: Device the token tensors are moved to before the forward pass.
        batch_size: Number of sequences processed per forward pass.

    Returns:
        torch.Tensor: The ``last_hidden_state`` of every batch, moved to the
        CPU and concatenated along the first dimension. Outputs at padded
        positions are included as-is.
    """
    # Longest tokenized length (special tokens included) over the whole input.
    longest = max(
        (len(tokenizer.encode(seq, add_special_tokens=True)) for seq in sequence_examples),
        default=0,
    )

    batch_outputs = []
    for start in range(0, len(sequence_examples), batch_size):
        chunk = sequence_examples[start:start + batch_size]

        # Pad every batch to the same global length so torch.cat works later.
        encoded = tokenizer.batch_encode_plus(
            chunk,
            add_special_tokens=True,
            padding="max_length",
            max_length=longest,
            return_tensors="pt",
        )
        token_ids = encoded['input_ids'].to(device)
        mask = encoded['attention_mask'].to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            encoder_out = model(input_ids=token_ids, attention_mask=mask)

        batch_outputs.append(encoder_out.last_hidden_state.cpu())

        # Drop the device tensors before the next batch to limit GPU memory use.
        del token_ids
        del mask
        torch.cuda.empty_cache()

    return torch.cat(batch_outputs)

def perform_pca(embeddings, n_components=20):
    """Reduce flattened embeddings with Principal Component Analysis.

    Each entry along the first dimension of ``embeddings`` is treated as one
    sample and flattened into a single feature vector before PCA is fitted.

    Args:
        embeddings: ``torch.Tensor`` whose first dimension indexes samples.
        n_components: Passed to ``sklearn.decomposition.PCA``. An integer
            larger than ``min(n_samples, n_features)`` is reduced to that
            limit (with a printed notice) instead of letting PCA raise, so
            small inputs still produce a result.

    Returns:
        pandas.DataFrame: One row per sample, one column per retained
        component.
    """
    # Flatten embeddings. reshape (unlike view) also accepts non-contiguous
    # tensors; detach/cpu make the NumPy conversion safe for any input tensor.
    flattened_embeddings = (
        embeddings.detach().cpu().reshape(embeddings.size(0), -1).numpy()
    )

    # PCA cannot extract more components than min(n_samples, n_features).
    max_components = min(flattened_embeddings.shape)
    if isinstance(n_components, int) and n_components > max_components:
        print(
            f"Requested {n_components} PCA components but the data supports "
            f"at most {max_components}; using {max_components}."
        )
        n_components = max_components

    # Perform PCA
    pca = PCA(n_components=n_components)
    pca_embeddings = pca.fit_transform(flattened_embeddings)

    return pd.DataFrame(pca_embeddings)

def save_embeddings(embeddings, output_file):
    """Write the embeddings table to CSV and report where it was saved.

    Args:
        embeddings: DataFrame to write; the index is not included in the file.
        output_file: Destination passed to ``DataFrame.to_csv``.
    """
    embeddings.to_csv(output_file, index=False)
    confirmation = f"PCA transformed embeddings saved to {output_file} successfully."
    print(confirmation)

def main():
    """Run the full pipeline interactively.

    Prompts for the input CSV location and the output file name, loads the
    table, sets up the model, embeds the sequences found in the ``seq``
    column, reduces them with PCA and writes the result to the output file.
    """
    url = input("Please enter the URL of your CSV file: ")
    output_file = input("Please enter the name of the output CSV file: ")

    table = download_data(url)
    model, tokenizer, device = setup_model()

    # Sequences are read from the 'seq' column of the input table.
    raw_sequences = table['seq'].tolist()
    spaced_sequences = preprocess_sequences(raw_sequences, tokenizer)
    hidden_states = generate_embeddings(spaced_sequences, tokenizer, model, device)

    reduced = perform_pca(hidden_states)
    save_embeddings(reduced, output_file)

# Run the pipeline only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
