## **Load the Mistral-DNA git**

In [1]:
!git clone https://github.com/raphaelmourad/Mistral-DNA.git
!tar -xf Mistral-DNA/data/GUE.tar.xz -C Mistral-DNA/data/

Cloning into 'Mistral-DNA'...
remote: Enumerating objects: 331, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 331 (delta 27), reused 30 (delta 5), pack-reused 235 (from 1)[K
Receiving objects: 100% (331/331), 112.69 MiB | 19.82 MiB/s, done.
Resolving deltas: 100% (100/100), done.


In [4]:
!pip install torch



In [13]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.1


## **Imports**

In [6]:
import torch
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [7]:
# Load basic modules
import os
import sys
import time
from os import path
import gc

# Load data and machine learning modules
import numpy as np
import pandas as pd
from random import randrange
from progressbar import ProgressBar

import torch
#import triton
import transformers
from torch.utils.data import TensorDataset, DataLoader
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, set_seed, BitsAndBytesConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
)

In [8]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [9]:
os.chdir("Mistral-DNA/")
!pwd

/content/Mistral-DNA


In [10]:
model_name="RaphaelMourad/Mistral-DNA-v1-17M-hg38"

In [11]:
sys.path.append("scriptPython/")
from functions import *

## **Configure trainning, bnb, fsdp, peft**

In [112]:
training_args = transformers.TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  # Adjusted learning rate for binary classification
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
    bf16=True,
    report_to="none",
    load_best_model_at_end = True,
)

import os
os.environ["WANDB_DISABLED"] = "true"

In [113]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
bnb_config

BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "fp4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [114]:
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [115]:
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )

## **Load the data from R**


In [116]:
num_labels=2 # make it binary classification beacause i dont know if it works with 3

# Load csv from R

df = pd.read_csv('/content/Mistral-DNA/protein_data.csv')

df = df.drop('Unnamed: 0', axis=1)
df

df['type'] = df['type'].map({'kinase': 2, 'nuclear': 0, 'membrane': 1})

df.value_counts('type')

df = df[396:776]
df

Unnamed: 0,coding,type
396,ATGTCTTCTAATTCAGATACTGGGGATTTACAAGAGTCTTTAAAGC...,0
397,ATGAGCACCAGCCAACCAGGGGCCTGCCCATGCCAGGGAGCTGCAA...,0
398,ATGATAGAACAGCAGAAGCGTAAGGGCCCAGAGTTGCCGCTGGTTC...,0
399,ATGGCGGTGAGCCATTCAGTGAAGGAGCGGACCATCTCTGAGAACA...,0
400,ATGAGTTCCTCGCCTGTTAATGTAAAAAAGCTGAAGGTGTCGGAGC...,0
...,...,...
771,ATGGCGGCTGAGTGGGCTTCTCGTTTCTGGCTTTGGGCTACGCTGC...,1
772,ATGTCCTCCCCACAACTTCCAGCTTTCTTATGGGACAAGGGTACAC...,1
773,ATGACGAACGTGTACTCCTTGGATGGGATTCTGGTGTTTGGTTTGC...,1
774,ATGCCACCACCAGCCTATGAGCCTCCAGCCCCTGCCCCATTGCCTC...,1


In [126]:
# load tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=1000,
    padding_side="left",
    use_fast=True,
    trust_remote_code=True,
)
tokenizer.eos_token='[EOS]'
tokenizer.pad_token = '[PAD]'

In [127]:
from sklearn.model_selection import train_test_split

# Split the dataframe into training, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

import os

# Define temporary file paths
temp_train_path = "temp_train.csv"
temp_val_path = "temp_val.csv"
temp_test_path = "temp_test.csv"

# Save DataFrames to CSV files
train_df.to_csv(temp_train_path, index=False)
val_df.to_csv(temp_val_path, index=False)
test_df.to_csv(temp_test_path, index=False)

print(f"Training data saved to: {temp_train_path}")
print(f"Validation data saved to: {temp_val_path}")
print(f"Test data saved to: {temp_test_path}")

# Define datasets using the temporary file paths
train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=temp_train_path, kmer=-1)
val_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=temp_val_path, kmer=-1)
test_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=temp_test_path, kmer=-1)

data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print("Test dataset size:", len(test_dataset))



Training data saved to: temp_train.csv
Validation data saved to: temp_val.csv
Test data saved to: temp_test.csv




Training dataset size: 266
Validation dataset size: 57
Test dataset size: 57


In [128]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


model.config.pad_token_id = tokenizer.pad_token_id

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['type']),
    y=train_df['type']
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(model.device)

trainer = transformers.Trainer(model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
  )


trainer.local_rank=training_args.local_rank
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Matthews Correlation,Precision,Recall
1,No log,0.261434,0.894737,0.894704,0.80947,0.90625,0.903226
2,No log,0.223592,0.912281,0.911847,0.82426,0.911111,0.913151
3,No log,0.226024,0.929825,0.92963,0.861563,0.929187,0.932382
4,No log,0.22271,0.912281,0.911847,0.82426,0.911111,0.913151
5,No log,0.221131,0.929825,0.92963,0.861563,0.929187,0.932382
6,No log,0.204058,0.912281,0.911847,0.82426,0.911111,0.913151
7,No log,0.210466,0.929825,0.92963,0.861563,0.929187,0.932382
8,No log,0.213571,0.912281,0.911847,0.82426,0.911111,0.913151
9,No log,0.20531,0.912281,0.911847,0.82426,0.911111,0.913151


TrainOutput(global_step=153, training_loss=0.030486664740867864, metrics={'train_runtime': 138.1151, 'train_samples_per_second': 38.519, 'train_steps_per_second': 2.462, 'total_flos': 211169645568000.0, 'train_loss': 0.030486664740867864, 'epoch': 9.0})

In [129]:
results_path = training_args.output_dir+"/metrics"
results = trainer.evaluate(eval_dataset=test_dataset)
os.makedirs(results_path, exist_ok=True)
with open(os.path.join(results_path, "test_results.json"), "w") as f:
    json.dump(results, f)

file_metric="results/metrics/test_results.json"
data_expe = pd.read_json(file_metric, typ='series')
print(data_expe)


eval_loss                     0.291745
eval_accuracy                 0.859649
eval_f1                       0.856061
eval_matthews_correlation     0.721000
eval_precision                0.868831
eval_recall                   0.852357
eval_runtime                  1.270300
eval_samples_per_second      44.871000
eval_steps_per_second         3.149000
epoch                         9.000000
dtype: float64


In [141]:
def classify_coding_sequences(sequence):

  # Check if there is input
  if not isinstance(sequence, str) or not sequence:
    return "Invalid input: Please provide a non-empty DNA sequence."

  # Check if length of input is valid
  if len(sequence) > tokenizer.model_max_length:
    return f'Input sequence is too long. Maximum length is {tokenizer.model_max_length}.'


  # Tokenize the input seq
  inputs = tokenizer(sequence, return_tensors='pt', padding=True, truncation=True)


  if 'token_type_ids' in inputs:
    del inputs['token_type_ids']


  # Make sure variables correspond to the device
  device = model.device
  inputs = {key: value.to(device) for key, value in inputs.items()}

  # Use the model to make a prediction
  with torch.no_grad():
    outputs = model(**inputs)

  # Get the output score from the model and use the highest class score
  logits = outputs.logits
  predicted_class_id = torch.argmax(logits, dim=1).item()


  class_labels = ['nuclear', 'membrane']
  if predicted_class_id < len(class_labels):
    return class_labels[predicted_class_id]
  else:
    return f'Unknown class ID: {predicted_class_id}'


In [142]:
classify_coding_sequences('GTCAGAATGGCGGCAGCGGAGCATCGTCATTCTTCAGGATTGCCCTGCTGGCCCTACCTCACAGCTGAAGCTTTAAAAAACAGGATGGGCCGCCAGCCACCTCCTCCAACTCAACAACATTCTATAACTGATAACTCCCTGAGCCTCAAGACACCTCCCGAATGTCTCCTTCATCCCCTTCCACCCTCAGTGGATGATAATATCAAGGAGTGTCCTCTTGCTCCTCTTCCACCCTCAGTGGATGATAATCTGAAGGAGTATCTCCTGG')

'membrane'