Before running the next block, set up a Python 3 virtual environment.

First, make sure you're using the correct version of python. In the terminal, type:

>module load python3/3.10.5

Then, create your virtual environment:

>python -m venv .venv

Then activate it:

>source .venv/bin/activate

then install the requirements:

>pip install -r requirements.txt

You are now free to run the rest of the code:

In [1]:
#for tokenizer
from sklearn.metrics import mean_squared_error, mean_absolute_error , r2_score
import morfessor
import math
import tokenizers 
from tokenizers import Tokenizer, trainers, BertWordPieceTokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import CharDelimiterSplit
import os

#for BERT
from transformers import AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments, Trainer, BigBirdTokenizerFast, AutoConfig, BertConfig, BertForSequenceClassification
import torch
from datasets import load_dataset
import numpy


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Report GPU memory figures (in GB) for device 0 when CUDA is available.
if torch.cuda.is_available():
    _bytes_per_gb = 1024 ** 3
    # (label, reader) pairs; each reader is called only when its line is printed,
    # so the queries happen in the same order as the printed lines.
    _gpu_readouts = (
        ('total gpu:', lambda: torch.cuda.get_device_properties(0).total_memory),
        ('Allocated mem:', lambda: torch.cuda.memory_allocated(0)),
        ('Cache:', lambda: torch.cuda.memory_reserved(0)),
    )
    for _label, _read_bytes in _gpu_readouts:
        print(_label, _read_bytes() / _bytes_per_gb, "GB")

# Repression Prediction via LLM
Below are the modules needed to train the LLM.

In [12]:
def train_tokenizer_BPE():
    """Train a byte-pair-encoding tokenizer and save it to ``./tok/tokenizer.json``.

    The tokenizer lowercases its input, splits it on single spaces, and is
    trained on ``data/token_data_sliding_window.txt`` with the special tokens
    ``[UNK]``, ``<s>`` and ``</s>``. Encodings are post-processed with
    ``BertProcessing`` and truncated to 512 tokens.

    Note that ``train_tokenizer_wordpiece`` writes to the same output file.
    """
    output_path = './tok/tokenizer.json'

    # Build the custom tokenizer. The unknown token is declared on the model so
    # that the "[UNK]" special token registered below is actually used.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.normalizer = Lowercase()
    tokenizer.pre_tokenizer = CharDelimiterSplit(' ')

    # Train from a list of paths to text files.
    trainer = trainers.BpeTrainer(special_tokens=["[UNK]", "<s>", "</s>"], show_progress=True)
    tokenizer.train(files=['data/token_data_sliding_window.txt'], trainer=trainer)

    # BertProcessing takes the separator pair first and the classifier pair second.
    tokenizer.post_processor = tokenizers.processors.BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    # Make sure the target directory exists before saving.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    tokenizer.save(output_path)

In [None]:
def train_tokenizer_wordpiece():
    """Train a BERT WordPiece tokenizer and save it to ``./tok/tokenizer.json``.

    The tokenizer splits its input on single spaces, is trained on
    ``data/token_data_sliding_window.txt`` with the four nucleotide letters as
    the initial alphabet, and truncates encodings to 512 tokens.

    Note that ``train_tokenizer_BPE`` writes to the same output file.
    """
    output_path = './tok/tokenizer.json'

    # Build the custom tokenizer.
    tokenizer = BertWordPieceTokenizer()
    tokenizer.pre_tokenizer = CharDelimiterSplit(' ')

    # Train from a list of paths to text files.
    # NOTE(review): BertWordPieceTokenizer presumably lowercases by default, in
    # which case this upper-case seed alphabet never matches the normalised
    # text. Confirm whether lowercase=False or a lower-case alphabet is intended.
    tokenizer.train(
        files=['data/token_data_sliding_window.txt'],
        show_progress=True,
        initial_alphabet=['A', 'G', 'C', 'T'],
    )

    tokenizer.enable_truncation(max_length=512)

    # Make sure the target directory exists before saving.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    tokenizer.save(output_path)

In [13]:
# Train the WordPiece tokenizer; the result is written to ./tok/tokenizer.json
train_tokenizer_wordpiece()

AttributeError: 'tokenizers.models.BPE' object has no attribute 'pre_tokenizer'

In [4]:
# Prepare the data
def get_tokenized_dataset(tokenizer, max_length=512, seed=None):
    """Load the sequence/expression CSV, split it, and tokenize every split.

    The CSV at ``data_dmitri/seq_er_sliding_window.csv`` is split 80/10/10 into
    train, validation and test sets. Each row's ``Token`` column is tokenized
    with padding and truncation to ``max_length``, and its ``Expression Rate``
    column becomes a float ``label``.

    Args:
        tokenizer: Callable tokenizer applied to the ``Token`` column.
        max_length: Length every sequence is padded or truncated to.
        seed: Optional seed passed to both ``train_test_split`` calls so the
            splits are reproducible. ``None`` keeps the unseeded random split.

    Returns:
        A plain ``dict`` with the keys ``'train'``, ``'test'`` and ``'valid'``,
        each holding the tokenized dataset for that split.
    """
    raw = load_dataset('csv', data_files='data_dmitri/seq_er_sliding_window.csv')
    # Hold out 20% of the rows ...
    train_testvalid = raw['train'].train_test_split(test_size=0.2, seed=seed)
    # ... then halve that hold-out: 10% test, 10% validation.
    test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=seed)
    # Gather the three splits into a single mapping.
    ds = {
        'train': train_testvalid['train'],
        'test': test_valid['test'],
        'valid': test_valid['train']}

    def preprocess_function(examples):
        # Called once per row (map is not batched), so the label is a scalar.
        label = examples["Expression Rate"]
        encoded = tokenizer(examples["Token"], truncation=True, padding="max_length", max_length=max_length)
        encoded["label"] = float(label)
        return encoded

    for split in ds:
        ds[split] = ds[split].map(preprocess_function, remove_columns=["Token", "Expression Rate"])
    return ds

In [5]:
# Choose your tokenizer!

# Pretrained tokenizer published with DNABERT-2.
# trust_remote_code=True runs Python code shipped in that Hub repository.
tokenizer_dna = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)

# Self-trained tokenizer loaded from the local "tok" directory.
# model_max_length is the current name of the legacy max_len keyword.
tokenizer_morph = BigBirdTokenizerFast.from_pretrained("tok", model_max_length=512)

ds = get_tokenized_dataset(tokenizer_morph)
print(ds)

Generating train split: 89997 examples [00:00, 207946.10 examples/s]
Map: 100%|██████████| 71997/71997 [00:19<00:00, 3679.81 examples/s]
Map: 100%|██████████| 9000/9000 [00:02<00:00, 3616.67 examples/s]
Map: 100%|██████████| 9000/9000 [00:02<00:00, 3672.93 examples/s]

{'train': Dataset({
    features: ['input_ids', 'attention_mask', 'label'],
    num_rows: 71997
}), 'test': Dataset({
    features: ['input_ids', 'attention_mask', 'label'],
    num_rows: 9000
}), 'valid': Dataset({
    features: ['input_ids', 'attention_mask', 'label'],
    num_rows: 9000
})}





In [7]:
# Training hyperparameters
LEARNING_RATE = 1e-6
# NOTE(review): MAX_LENGTH is not referenced in the visible cells, and the
# tokenization step pads/truncates to 512 — confirm which value is intended.
MAX_LENGTH = 256
BATCH_SIZE = 8
EPOCHS = 20

# Metrics
def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be a
            tuple of model outputs, in which case its first element is used.

    Returns:
        dict with the keys ``"mse"``, ``"mae"`` and ``"r2"``.
    """
    logits, labels = eval_pred

    # Some models (DNABERT, per the original note) return a tuple of outputs;
    # the first element holds the predictions.
    if isinstance(logits, tuple):
        logits = logits[0]

    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)

    return {"mse": mse, "mae": mae, "r2": r2}

# Training configuration: evaluate and checkpoint once per epoch, keep at most
# five checkpoints, and reload the best one when training finishes.
training_args = TrainingArguments(
    output_dir="models/DNABERT_MORPH",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    load_best_model_at_end=True,
    weight_decay=0.01,
    dataloader_num_workers=5,
    metric_for_best_model="mse",
    # MSE is an error: lower is better. Without this flag a metric whose name
    # does not end in "loss" is treated as higher-is-better, so the worst
    # checkpoint would be kept as the "best" model.
    greater_is_better=False,
    logging_steps=10,
)



In [8]:
# Define the trainer
class RegressionTrainer(Trainer):
    """Trainer that optimises mean-squared error on the model's first output column."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the MSE loss between the first output column and the labels.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping. Its ``"labels"`` entry is removed before the
                remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted for compatibility with transformers
                releases that pass it to ``compute_loss``; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Labels were popped, so the first output is the logits; take column 0.
        logits = outputs[0][:, 0]
        # Match the label dtype to the logits (e.g. under mixed precision).
        loss = torch.nn.functional.mse_loss(logits, labels.to(logits.dtype))
        return (loss, outputs) if return_outputs else loss

In [11]:
# Train the model
# NOTE(review): training_args uses several dataloader worker processes;
# confirm "true" is the intended tokenizers parallelism setting alongside them.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Alternative: start from the pretrained DNABERT-2 checkpoint instead.
# config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
# model = AutoModelForSequenceClassification.from_pretrained("zhihan1996/DNABERT-2-117M",  config=config, trust_remote_code=True)

# BERT built from a config (no from_pretrained call) with a single output unit.
# NOTE(review): vocab_size is left at the BertConfig default — confirm it
# covers the vocabulary of the tokenizer used to build `ds`.
config = BertConfig(num_labels=1)
model = BertForSequenceClassification(config)
model.config.num_labels = 1
# To resume from a saved checkpoint, load its weights:
#model.load_state_dict(torch.load("models/DNABERT_MORPH/checkpoint-22500/pytorch_model.bin"))


# Train on the "train" split and evaluate on the "valid" split.
trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["valid"],
    compute_metrics=compute_metrics_for_regression
)
trainer.train()


                                                      
  0%|          | 1/180000 [12:13<476:38:45,  9.53s/it] 

{'loss': 0.2173, 'grad_norm': 12.646466255187988, 'learning_rate': 9.999444444444443e-07, 'epoch': 0.0}


                                                      
  0%|          | 1/180000 [12:45<476:38:45,  9.53s/it] 

{'loss': 0.1993, 'grad_norm': 12.586045265197754, 'learning_rate': 9.99888888888889e-07, 'epoch': 0.0}


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x15df32710>
Traceback (most recent call last):
  File "/Users/samuelwu/Desktop/2024/Fall/BU Med Lab/Sequence-Expression-LMM/.venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__
    self._shutdown_workers()
  File "/Users/samuelwu/Desktop/2024/Fall/BU Med Lab/Sequence-Expression-LMM/.venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1441, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/connection.py", line 936, in wait
    re

KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out test split. It is passed to evaluate() directly so
# the trainer's own eval_dataset (the validation split) is left untouched.
trainer.evaluate(eval_dataset=ds["test"])

In [None]:
# Save the trainer's model to the final_model/ directory
trainer.save_model("final_model/")