In [1]:
import torch
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
import random
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Model and Tokenizer 
tokenizer = AutoTokenizer.from_pretrained("Phi-3-mini-4k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "Phi-3-mini-4k-instruct",
    # device_map="cpu",
    torch_dtype=torch.float16,  # Mixed precision for potential speed-up
    trust_remote_code=True,
).to("cpu")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:04<00:00,  2.26s/it]


In [3]:
# List of Real Gene and Protein Names
genes = ["TP53", "BRCA1", "EGFR", "KRAS", "MYC", "BCL2", "PTEN"]
proteins = ["Albumin", "IgG", "Transferrin", "Haptoglobin", "Alpha-1-antitrypsin"]

def generate_expression_data(num_samples):
    data = []
    for _ in range(num_samples):
        gene_expr = {gene: round(random.uniform(5, 30), 2) for gene in genes}
        protein_expr = {protein: round(random.uniform(10, 100), 2) for protein in random.sample(proteins, k=3)}  
        data.append((gene_expr, protein_expr))
    return data

# Generate 1000 samples
gene_value_pairs = generate_expression_data(1000)

# Convert to DataFrame for easier handling
df = pd.DataFrame(gene_value_pairs, columns=["Gene Expression", "Protein Expression"])
df.to_csv("gene_protein_expression2.csv", index=False)

# Load Dataset from CSV
from datasets import load_dataset
dataset = load_dataset("csv", data_files="gene_protein_expression2.csv")

Generating train split: 1000 examples [00:00, 27285.53 examples/s]


In [4]:
# Custom Data Collator for MLM (Modified)
class GeneProteinDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    def __call__(self, examples):
        # Convert dictionary to string representation
        examples = [
            f"Gene expression data = ({', '.join([f'{gene}: {value}' for gene, value in ex['Gene Expression'].items()])}) "
            f"Protein expression data = ({', '.join([f'{protein}: {value}' for protein, value in ex['Protein Expression'].items()])})"
            for ex in examples
        ]
        return super().__call__(examples, return_tensors="pt", padding=True, truncation=True, max_length=512)


In [5]:
# Prepare Tokenized Dataset
tokenized_dataset = dataset.map(
    lambda example: tokenizer(example["Gene Expression"], example["Protein Expression"], truncation=True, max_length=2048),
    batched=True,
    remove_columns=dataset["train"].column_names
)

# Set Up Trainer
# data_collator = GeneProteinDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)  mlm=False
data_collator = GeneProteinDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map: 100%|████████████████████████| 1000/1000 [00:00<00:00, 16719.03 examples/s]


In [6]:
training_args = TrainingArguments(
    output_dir="./phi-3-mini-omics",
    evaluation_strategy="epoch",
    learning_rate=2e-5,   
    per_device_train_batch_size=1,   # Reduced batch size
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    # fp16=True,\
    fp16=False,  
    gradient_checkpointing=False,   # Enable gradient checkpointing
    save_strategy = "epoch",
    logging_dir = "./logs",
    push_to_hub=False,
)



In [7]:
# from accelerate import Accelerator
# accelerator = Accelerator()

# trainer = Trainer(
#     model=accelerator.prepare(model),  # Prepare model with accelerator
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     data_collator=data_collator,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     # eval_dataset=tokenized_dataset["validation"],
#     data_collator=data_collator)

In [8]:
# Set Up Trainer
trainer = Trainer(
    model=model,  # No need for accelerator.prepare as we are on CPU
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)

# Train!
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 

In [None]:















trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator
)


# Train!
trainer.train()



# Tokenize, Get Embeddings, and Create Sentences
all_tokens = []
all_embeddings = []
sentences = []

for genes, proteins in gene_value_pairs:
    gene_str = ", ".join([f"{gene}: {value}" for gene, value in genes.items()])
    protein_str = ", ".join([f"{protein}: {value}" for protein, value in proteins.items()])
    input_sentence = f"Gene expression data = ({gene_str})"
    target_sentence = f"Protein expression data = ({protein_str})"
    sentences.append(input_sentence)
    sentences.append(target_sentence)

    # Get Sentence Embeddings (using average pooling)
    for sentence in [input_sentence, target_sentence]:
        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        with torch.no_grad():
            outputs = model(input_ids, output_hidden_states=True)
        token_embeddings = outputs.hidden_states[-1].squeeze(0)
        sentence_embedding = torch.mean(token_embeddings, dim=0).numpy()
        all_embeddings.append(sentence_embedding)

    # Get Token Embeddings (if needed)
    all_tokens.extend(tokenizer.tokenize(input_sentence))
    all_tokens.extend(tokenizer.tokenize(target_sentence))

    # Correctly get token ids for both input and target sentences
    input_ids = tokenizer.encode(input_sentence, return_tensors="pt")
    target_ids = tokenizer.encode(target_sentence, return_tensors="pt")
    for tokens in [input_ids, target_ids]:
        for token_id in tokens[0]:
            with torch.no_grad():
                input_ids = torch.tensor([[token_id]])  # Wrap token_id in a list and convert to tensor
                outputs = model(input_ids, output_hidden_states=True)
            embedding = outputs.hidden_states[-1].squeeze().numpy()
            all_embeddings.append(embedding)


# Convert to NumPy Array
all_embeddings = np.array(all_embeddings)

# Reduce Embeddings to 2D
tsne = TSNE(n_components=2, random_state=0, perplexity=30)
embeddings_2d = tsne.fit_transform(all_embeddings)

# Plot
plt.figure(figsize=(12, 10))

for i, (token, embedding) in enumerate(zip(all_tokens + sentences, embeddings_2d)):
    plt.scatter(embedding[0], embedding[1])
    plt.annotate(token, (embedding[0], embedding[1]), fontsize=8)

plt.title("t-SNE Visualization of Gene-Value, Protein Expression, and Sentence Embeddings")
plt.show()


In [None]:
# Tokenize, Get Embeddings, and Create Sentences
all_tokens = []
all_embeddings = []
sentences = []

for genes, proteins in gene_value_pairs:
    gene_str = ", ".join([f"{gene}: {value}" for gene, value in genes.items()])
    protein_str = ", ".join([f"{protein}: {value}" for protein, value in proteins.items()])
    input_sentence = f"Gene expression data = ({gene_str})"
    target_sentence = f"Protein expression data = ({protein_str})"
    sentences.append(input_sentence)
    sentences.append(target_sentence)

    # Get Sentence Embeddings (using average pooling)
    for sentence in [input_sentence, target_sentence]:
        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        with torch.no_grad():
            outputs = model(input_ids, output_hidden_states=True)
        token_embeddings = outputs.hidden_states[-1].squeeze(0)
        sentence_embedding = torch.mean(token_embeddings, dim=0).numpy()
        all_embeddings.append(sentence_embedding)

    # Get Token Embeddings (if needed)
    all_tokens.extend(tokenizer.tokenize(input_sentence))
    all_tokens.extend(tokenizer.tokenize(target_sentence))

    # Correctly get token ids for both input and target sentences
    input_ids = tokenizer.encode(input_sentence, return_tensors="pt")
    target_ids = tokenizer.encode(target_sentence, return_tensors="pt")
    for tokens in [input_ids, target_ids]:
        for token_id in tokens[0]:
            with torch.no_grad():
                input_ids = torch.tensor([[token_id]])  # Wrap token_id in a list and convert to tensor
                outputs = model(input_ids, output_hidden_states=True)
            embedding = outputs.hidden_states[-1].squeeze().numpy()
            all_embeddings.append(embedding)


# Convert to NumPy Array
all_embeddings = np.array(all_embeddings)

# Reduce Embeddings to 2D
tsne = TSNE(n_components=2, random_state=0, perplexity=30)
embeddings_2d = tsne.fit_transform(all_embeddings)

# Plot
plt.figure(figsize=(12, 10))

for i, (token, embedding) in enumerate(zip(all_tokens + sentences, embeddings_2d)):
    plt.scatter(embedding[0], embedding[1])
    plt.annotate(token, (embedding[0], embedding[1]), fontsize=8)

plt.title("t-SNE Visualization of Gene-Value, Protein Expression, and Sentence Embeddings")
plt.show()