In [52]:
import torch
import os
import json
from datasets import load_dataset, Dataset
# from datasets import Dataset, train_test_split
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from transformers.integrations import TensorBoardCallback
import random
import pandas as pd
from datasets import load_metric
from transformers import DataCollatorForLanguageModeling

In [2]:
# Settings
max_seq_length = 512
load_in_4bit = True  # Reduce memory usage

# Load Model and Tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./unsloth/Phi-3-mini-4k-instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    dtype=torch.float32,
    dtype=
    
)

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU. Max memory: 3.811 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████████████| 2/2 [00:06<00:00,  3.07s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [53]:
# Define the genes and proteins
genes = ["TP53", "BRCA1", "EGFR", "KRAS", "MYC", "BCL2", "PTEN"]
proteins = ["Albumin", "IgG", "Transferrin", "Haptoglobin", "Alpha-1-antitrypsin"]

# Define the function to generate expression data
def generate_expression_data(num_samples):
    data = []
    for _ in range(num_samples):
        gene_expr = {gene: round(random.uniform(5, 30), 2) for gene in genes}
        protein_expr = {protein: round(random.uniform(10, 100), 2) for protein in random.sample(proteins, k=3)}
        data.append((gene_expr, protein_expr))
    return data

# Generate expression data
num_samples = 1000
data = generate_expression_data(num_samples)

# Create a Dataset object
dataset = Dataset.from_dict({"gene_expression": gene_expression, "protein_expression": protein_expression})

# Separate gene and protein expression data
gene_expression = [d[0] for d in data]
protein_expression = [d[1] for d in data]

# Create DataFrames
gene_df = pd.DataFrame(gene_expression)
protein_df = pd.DataFrame(protein_expression).fillna(0)

def format_prompts(batch, tokenizer):
    texts = [
        tokenizer.apply_chat_template(
            [
                {"from": "human", "value": f"Given gene expression: {gene_expr}, predict the protein expression."},
                {"from": "gpt", "value": f"Protein expression: {protein_expr}{tokenizer.eos_token}"}
            ], 
            tokenize=False,
            add_generation_prompt=True
        )
        for gene_expr, protein_expr in zip(batch["gene_expression"], batch["protein_expression"])
    ]
    return {"text": texts}

# Map the formatting function to the dataset
dataset = dataset.map(lambda x: format_prompts(x, tokenizer), batched=True)



Map: 100%|████████████████████████| 1000/1000 [00:00<00:00, 29430.41 examples/s]


In [55]:
dataset

Dataset({
    features: ['gene_expression', 'protein_expression', 'text'],
    num_rows: 1000
})

In [54]:
dataset['gene_expression']

[{'BCL2': 19.76,
  'BRCA1': 8.56,
  'EGFR': 5.65,
  'KRAS': 16.34,
  'MYC': 27.88,
  'PTEN': 22.57,
  'TP53': 19.88},
 {'BCL2': 6.03,
  'BRCA1': 15.66,
  'EGFR': 26.07,
  'KRAS': 24.21,
  'MYC': 22.0,
  'PTEN': 27.69,
  'TP53': 6.84},
 {'BCL2': 25.56,
  'BRCA1': 5.54,
  'EGFR': 7.84,
  'KRAS': 12.98,
  'MYC': 19.21,
  'PTEN': 19.03,
  'TP53': 26.76},
 {'BCL2': 16.41,
  'BRCA1': 22.72,
  'EGFR': 22.56,
  'KRAS': 5.72,
  'MYC': 28.57,
  'PTEN': 5.12,
  'TP53': 21.05},
 {'BCL2': 15.51,
  'BRCA1': 10.69,
  'EGFR': 16.64,
  'KRAS': 28.09,
  'MYC': 27.91,
  'PTEN': 16.9,
  'TP53': 24.4},
 {'BCL2': 29.43,
  'BRCA1': 29.19,
  'EGFR': 12.34,
  'KRAS': 12.07,
  'MYC': 15.46,
  'PTEN': 20.11,
  'TP53': 11.93},
 {'BCL2': 14.29,
  'BRCA1': 24.83,
  'EGFR': 18.24,
  'KRAS': 21.92,
  'MYC': 15.97,
  'PTEN': 25.54,
  'TP53': 12.14},
 {'BCL2': 18.99,
  'BRCA1': 14.65,
  'EGFR': 22.04,
  'KRAS': 26.19,
  'MYC': 20.42,
  'PTEN': 14.08,
  'TP53': 11.05},
 {'BCL2': 11.49,
  'BRCA1': 7.7,
  'EGFR': 18.77,
 

In [43]:
# Create a new DataFrame with formatted chat data
chat_data = pd.DataFrame(format_prompts({"gene_expression": gene_df.values.tolist(), "protein_expression": protein_df.values.tolist()}, tokenizer))

# # Print the chat_data
# print(chat_data.head())

In [44]:
chat_data

Unnamed: 0,text
0,"<s><|user|>\nGiven gene expression: [6.51, 12...."
1,"<s><|user|>\nGiven gene expression: [19.11, 20..."
2,"<s><|user|>\nGiven gene expression: [18.93, 25..."
3,"<s><|user|>\nGiven gene expression: [23.86, 19..."
4,"<s><|user|>\nGiven gene expression: [9.43, 21...."
...,...
995,"<s><|user|>\nGiven gene expression: [18.32, 22..."
996,"<s><|user|>\nGiven gene expression: [11.05, 12..."
997,"<s><|user|>\nGiven gene expression: [13.05, 28..."
998,"<s><|user|>\nGiven gene expression: [26.76, 19..."


In [6]:
dataset = Dataset.from_pandas(df)

In [32]:
df = pd.DataFrame(gene_value_pairs, columns=["Gene Expression", "Protein Expression"])

In [33]:
df

Unnamed: 0,Gene Expression,Protein Expression
0,"{'TP53': 16.1, 'BRCA1': 14.71, 'EGFR': 14.09, ...","{'IgG': 27.51, 'Alpha-1-antitrypsin': 88.62, '..."
1,"{'TP53': 20.47, 'BRCA1': 27.51, 'EGFR': 18.71,...","{'IgG': 58.85, 'Haptoglobin': 50.71, 'Albumin'..."
2,"{'TP53': 28.89, 'BRCA1': 26.75, 'EGFR': 15.3, ...","{'Alpha-1-antitrypsin': 94.89, 'Haptoglobin': ..."
3,"{'TP53': 19.18, 'BRCA1': 9.31, 'EGFR': 29.59, ...","{'IgG': 38.48, 'Alpha-1-antitrypsin': 32.11, '..."
4,"{'TP53': 24.07, 'BRCA1': 17.17, 'EGFR': 20.26,...","{'Albumin': 11.93, 'Alpha-1-antitrypsin': 92.8..."
...,...,...
995,"{'TP53': 10.41, 'BRCA1': 13.19, 'EGFR': 8.67, ...","{'Transferrin': 63.56, 'IgG': 46.27, 'Alpha-1-..."
996,"{'TP53': 14.44, 'BRCA1': 23.47, 'EGFR': 11.12,...","{'Transferrin': 47.95, 'Haptoglobin': 52.91, '..."
997,"{'TP53': 27.55, 'BRCA1': 16.21, 'EGFR': 10.5, ...","{'Haptoglobin': 70.77, 'Alpha-1-antitrypsin': ..."
998,"{'TP53': 28.58, 'BRCA1': 16.11, 'EGFR': 12.83,...","{'Alpha-1-antitrypsin': 33.85, 'Haptoglobin': ..."


In [8]:
# Define the chat template
chat_template = get_chat_template(
    tokenizer,
    chat_template="phi-3",
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"}
)

In [10]:
# Apply formatting and remove original columns
tokenized_dataset = dataset.map(
    lambda batch: format_prompts(batch, tokenizer), 
    batched=True,
    remove_columns=dataset.column_names  # Remove original columns after formatting
)

Map: 100%|████████████████████████| 1000/1000 [00:00<00:00, 29536.94 examples/s]


In [11]:

# DataCollator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [28]:
def preprocess_function(examples):
    examples["gene_expression"] = examples["Gene Expression"]
    examples["protein_expression"] = examples["Protein Expression"]
    return examples

# Tokenize the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# tokenized_dataset = dataset.map(
#     preprocess_function, 
#     batched=True
# )

# # Apply formatting 
# tokenized_dataset = tokenized_dataset.map(
#     lambda batch: format_prompts(batch, tokenizer), 
#     batched=True,
#     remove_columns=dataset["train"].column_names  # Remove original columns *after* formatting
# )

# Define the chat template
chat_template = get_chat_template(
    tokenizer,
    chat_template="phi-3",
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"}
)

def format_prompts(batch, tokenizer):
    texts = [
        tokenizer.apply_chat_template(
            [
                {"from": "human", "value": f"Given gene expression:  {gene_expr},  predict the protein expression. "},
                {"from": "gpt", "value": f" Protein expression: {protein_expr}"}
            ], 
            tokenize=False,
            add_generation_prompt=True
        )
        for gene_expr, protein_expr in zip(batch["gene_expression"], batch["protein_expression"])
    ]
    return {"text": texts}

# Apply formatting
tokenized_dataset = tokenized_dataset.map(lambda batch: format_prompts(batch, tokenizer), batched=True)

# DataCollator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


Map: 100%|███████████████████████| 1000/1000 [00:00<00:00, 252031.25 examples/s]
Map: 100%|████████████████████████| 1000/1000 [00:00<00:00, 59614.59 examples/s]


In [30]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['gene_expression', 'protein_expression', 'text'],
        num_rows: 1000
    })
})

In [13]:
tokenized_dataset

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [14]:
# Training Arguments
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=10,  # Adjust based on your dataset size
    learning_rate=2e-4,
    fp16=False,  # Disable fp16 for debugging
    bf16=False,  # Disable bf16 for debugging
    logging_steps=1,
    optim="adamw_hf",  # Use the standard Hugging Face optimizer
    output_dir="outputs",
    logging_dir='./logs',
    report_to="tensorboard",
)

In [15]:
# Train
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    #eval_dataset=tokenized_dataset["validation"], # If you have a validation set, else remove this line.
    max_seq_length=max_seq_length,
    args=training_args,
    data_collator=data_collator,
    dataset_text_field="text",  # Specify the text field
    # formatting_func=format_prompts,  # Provide the formatting function
    packing=False
)

Map: 100%|████████████████████████| 1000/1000 [00:00<00:00, 22256.03 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [16]:
# Start training
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 10
 "-____-"     Number of trainable parameters = 29,884,416


RuntimeError: FlashAttention only support fp16 and bf16 data type

In [26]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-il40dq92/unsloth_4ae89ab4f1604923ae3f0aab6f4ef2f2
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-il40dq92/unsloth_4ae89ab4f1604923ae3f0aab6f4ef2f2
  Resolved https://github.com/unslothai/unsloth.git to commit e4c8ceacb3fca634f78e662873a01c37678fcb3e
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting transformers>=4.43.2 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
!pip install flash-atten