## **Clone the Mistral-DNA repository**

In [1]:
# Clone the Mistral-DNA repository into the current working directory.
# NOTE(review): `git clone` fails if the "Mistral-DNA" directory already exists,
# so this cell is meant to run once per fresh runtime.
!git clone https://github.com/raphaelmourad/Mistral-DNA.git
# Extract the GUE archive shipped in the repository's data folder, in place.
!tar -xf Mistral-DNA/data/GUE.tar.xz -C Mistral-DNA/data/

Cloning into 'Mistral-DNA'...
remote: Enumerating objects: 331, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 331 (delta 27), reused 30 (delta 5), pack-reused 235 (from 1)[K
Receiving objects: 100% (331/331), 112.69 MiB | 18.63 MiB/s, done.
Resolving deltas: 100% (100/100), done.


## **Imports**

In [2]:
import torch
# Show which device is available: CUDA when torch can see a GPU, else CPU.
# The device object is only displayed as the cell output; it is not stored.
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [3]:
# Load basic modules
import os
import sys
import time
#import flash_attn   # cannot install and import
from os import path
import gc

# Load data and machine learning modules
import numpy as np
import pandas as pd
from random import randrange
from progressbar import ProgressBar

import torch
#import triton
import transformers
from torch.utils.data import TensorDataset, DataLoader
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from transformers import AutoTokenizer, AutoModel, EarlyStoppingCallback, set_seed, BitsAndBytesConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
)

In [4]:
# Set the PyTorch CUDA allocator option `max_split_size_mb` to 512 via the
# environment.
# NOTE(review): presumably intended to reduce memory fragmentation — confirm it
# takes effect, since torch is already imported when this cell runs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [5]:
os.chdir("Mistral-DNA/")
!pwd

/content/Mistral-DNA


In [6]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and
# the sequence-classification model in later cells.
model_name="RaphaelMourad/Mistral-DNA-v1-17M-hg38"

In [7]:
# Make the repository's helper scripts importable (path is relative to the
# repository root, which is the working directory at this point).
sys.path.append("scriptPython/")
# NOTE(review): the wildcard import hides where names come from. Later cells use
# SupervisedDataset, DataCollatorForSupervisedDataset and compute_metrics, which
# presumably come from this module — confirm and import them explicitly.
from functions import *

## **Configure training, bnb, fsdp, peft**

In [8]:
# Install bitsandbytes; presumably required for the BitsAndBytesConfig defined
# in the next cell — confirm.
# NOTE(review): the version is not pinned, and the install happens after the
# main import cell has already run.
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [26]:
# bitsandbytes settings: 4-bit loading, double quantization switched on, and
# bfloat16 as the compute dtype.
# NOTE(review): the model-loading cell has `quantization_config=bnb_config`
# commented out, so this config is not applied there.
_bnb_options = dict(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**_bnb_options)
# Last expression of the cell, so the notebook displays the resolved config.
bnb_config

BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "fp4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

In [27]:
# Model and optimizer state-dict configs share the same options: offload to CPU,
# and do not restrict the state dict to rank 0.
_state_dict_options = dict(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(**_state_dict_options),
    optim_state_dict_config=FullOptimStateDictConfig(**_state_dict_options),
)

# NOTE(review): `accelerator` is not referenced by any later cell visible here.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [28]:
# LoRA settings for a sequence-classification task.
# NOTE(review): no later cell visible here passes `peft_config` to
# `get_peft_model`, so the model below appears to be fine-tuned without LoRA.
_lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]

peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=16,
    lora_alpha=16,
    lora_dropout=0.5,
    bias="none",
    target_modules=_lora_target_modules,
)

In [36]:
import os

# Hyper-parameters gathered in one mapping so they can be read at a glance.
_training_options = dict(
    # Output location and reporting
    output_dir="./results",
    report_to="none",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation
    learning_rate=2e-5,  # adjusted learning rate for binary classification
    weight_decay=0.1,
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # TODO(review): the original author asked whether bf16=False performs
    # better on GPU; this is unresolved.
    bf16=True,
)
training_args = transformers.TrainingArguments(**_training_options)

# Disable Weights & Biases through the environment as well.
os.environ["WANDB_DISABLED"] = "true"

In [37]:

# Tokenizer matching the pretrained checkpoint: fast implementation, padding on
# the left, and a maximum length of 1000.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=True,
    padding_side="left",
    model_max_length=1000,
)

# Set the special tokens explicitly; the model-loading cell later copies
# `tokenizer.pad_token_id` into the model config.
tokenizer.eos_token = '[EOS]'
tokenizer.pad_token = '[PAD]'

## **Load the data from R**


In [49]:
# Binary classification: receptor (0) vs. kinase (1).
# TODO(review): a 3-class setup has not been tried with this pipeline.
num_labels = 2

# Mapping from the class names in the CSV to integer label ids.
LABEL_TO_ID = {'receptor': 0, 'kinase': 1}
# Number of rows removed from the end of the table before splitting.
# TODO(review): the reason for dropping exactly these rows is not recorded.
N_TRAILING_ROWS_DROPPED = 1000

# Load the CSV exported from R. 'Unnamed: 0' is presumably the row index that
# was written out with the export; it is not a feature.
df = pd.read_csv('/content/Mistral-DNA/rk.csv')
df = df.drop('Unnamed: 0', axis=1)

# Encode the class names as integers. `Series.map` turns anything missing from
# the mapping into NaN, which would only surface later as an obscure failure
# when the labels are parsed, so fail here with an explicit message instead.
encoded_type = df['type'].map(LABEL_TO_ID)
if encoded_type.isna().any():
    unexpected = df.loc[encoded_type.isna(), 'type'].unique().tolist()
    raise ValueError(f"Unexpected values in 'type' column: {unexpected}")
df['type'] = encoded_type.astype(int)

# Longest coding sequence, in characters.
print(int(df['coding'].str.len().max()))

df = df[:-N_TRAILING_ROWS_DROPPED]

# Class balance of the rows that will be split into train/val/test.
print(df['type'].value_counts())

18921
type
1    3688
0    3449
Name: count, dtype: int64


In [50]:
from sklearn.model_selection import train_test_split
import os

# 80/10/10 split: hold out 20% of the rows first, then cut that hold-out in half
# to obtain the validation and test sets. Both splits use a fixed random state.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# SupervisedDataset is built from a file path, so each split is first written
# to a temporary CSV in the working directory.
temp_train_path = "temp_train.csv"
temp_val_path = "temp_val.csv"
temp_test_path = "temp_test.csv"

for _split_frame, _split_path in (
    (train_df, temp_train_path),
    (val_df, temp_val_path),
    (test_df, temp_test_path),
):
    _split_frame.to_csv(_split_path, index=False)

print(f"Training data saved to: {temp_train_path}")
print(f"Validation data saved to: {temp_val_path}")
print(f"Test data saved to: {temp_test_path}")


def _dataset_from_csv(csv_path):
    """Build a SupervisedDataset from one split CSV using the shared tokenizer."""
    return SupervisedDataset(tokenizer=tokenizer, data_path=csv_path, kmer=-1)


train_dataset = _dataset_from_csv(temp_train_path)
val_dataset = _dataset_from_csv(temp_val_path)
test_dataset = _dataset_from_csv(temp_test_path)

# Collator handed to the Trainer together with the datasets above.
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print("Test dataset size:", len(test_dataset))



Training data saved to: temp_train.csv
Validation data saved to: temp_val.csv
Test data saved to: temp_test.csv




Training dataset size: 5709
Validation dataset size: 714
Test dataset size: 714


In [51]:
# Seed every RNG before instantiating the model. The classification head
# ('score.weight') is not part of the checkpoint and is newly initialised on
# load, so without a seed set beforehand each run starts from different head
# weights. Reusing the TrainingArguments seed keeps one seed for the whole run.
set_seed(training_args.seed)

# Load the pretrained backbone with a fresh `num_labels`-way classification head.
model=transformers.AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    output_hidden_states=False,
    #quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
# Copy the tokenizer's padding token id into the model config so the two agree.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of MixtralForSequenceClassification were not initialized from the model checkpoint at RaphaelMourad/Mistral-DNA-v1-17M-hg38 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Early stopping with a patience of 3 evaluations; evaluation runs once per
# epoch according to `training_args`.
_early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[_early_stopping],
)

# Mirror the local rank from the training arguments onto the trainer object.
trainer.local_rank = training_args.local_rank

# Last expression of the cell: the notebook displays the returned TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Matthews Correlation,Precision,Recall
1,0.5943,0.558595,0.703081,0.702848,0.406403,0.702946,0.703457
2,0.4751,0.451839,0.780112,0.77383,0.569939,0.79673,0.773676
3,0.3184,0.401969,0.816527,0.814897,0.63292,0.819077,0.813864
4,0.2315,0.462482,0.815126,0.81512,0.633522,0.816602,0.81692
5,0.1804,0.500932,0.857143,0.856127,0.714216,0.859148,0.85508
6,0.1672,0.592427,0.852941,0.852187,0.705134,0.853632,0.851504


TrainOutput(global_step=4284, training_loss=0.34451972940166237, metrics={'train_runtime': 2017.6904, 'train_samples_per_second': 56.589, 'train_steps_per_second': 7.077, 'total_flos': 3021472447488000.0, 'train_loss': 0.34451972940166237, 'epoch': 6.0})

In [53]:
# `json` is imported here explicitly; the notebook otherwise only has it in
# scope if the wildcard import from `functions` happens to expose it.
import json

# Evaluate the trained model on the held-out test split and store the metrics
# as JSON under "<output_dir>/metrics".
results_path = training_args.output_dir+"/metrics"
results = trainer.evaluate(eval_dataset=test_dataset)
os.makedirs(results_path, exist_ok=True)

# A single path is used for writing and reading back, so the two cannot drift
# apart if `output_dir` is changed.
file_metric = os.path.join(results_path, "test_results.json")
with open(file_metric, "w") as f:
    json.dump(results, f)

# Read the metrics back as a pandas Series for a compact display.
data_expe = pd.read_json(file_metric, typ='series')
print(data_expe)


eval_loss                     0.412643
eval_accuracy                 0.815126
eval_f1                       0.812141
eval_matthews_correlation     0.630654
eval_precision                0.820364
eval_recall                   0.810369
eval_runtime                 16.814000
eval_samples_per_second      42.465000
eval_steps_per_second         5.353000
epoch                         6.000000
dtype: float64


In [43]:
def classify_coding_sequences(sequence):
  """Classify a DNA coding sequence as 'receptor' or 'kinase'.

  Uses the notebook-level `tokenizer` and fine-tuned `model`.

  Args:
    sequence: DNA sequence as a non-empty string.

  Returns:
    The predicted class name ('receptor' or 'kinase'), or a human-readable
    message when the input is invalid, when it tokenizes to more tokens than
    `tokenizer.model_max_length`, or when the model predicts an unknown class id.
  """
  # Check if there is input
  if not isinstance(sequence, str) or not sequence:
    return "Invalid input: Please provide a non-empty DNA sequence."

  # Tokenize WITHOUT truncation so the real token count is known.
  # `tokenizer.model_max_length` is a budget in tokens, not characters, and one
  # token can span several nucleotides. Comparing `len(sequence)` against it
  # rejected sequences that fit comfortably within the model's input size.
  inputs = tokenizer(sequence, return_tensors='pt', truncation=False)

  # Check if length of input is valid (measured in tokens)
  if inputs['input_ids'].shape[-1] > tokenizer.model_max_length:
    return f'Input sequence is too long. Maximum length is {tokenizer.model_max_length}.'

  # Drop `token_type_ids` (not passed to the model) and move every tensor to
  # the device the model lives on.
  device = model.device
  inputs = {
    key: value.to(device)
    for key, value in inputs.items()
    if key != 'token_type_ids'
  }

  # Switch to inference mode so layers such as dropout are disabled, then
  # predict without tracking gradients.
  model.eval()
  with torch.no_grad():
    outputs = model(**inputs)

  # Get the output scores from the model and use the highest class score
  logits = outputs.logits
  predicted_class_id = torch.argmax(logits, dim=1).item()

  # Index order matches the label encoding used for training
  # (receptor -> 0, kinase -> 1).
  class_labels = ['receptor', 'kinase']
  if predicted_class_id < len(class_labels):
    return class_labels[predicted_class_id]
  else:
    return f'Unknown class ID: {predicted_class_id}'


In [None]:
# Example call on a single sequence; the returned class name is displayed as the cell output.
classify_coding_sequences('GTCAGAATGGCGGCAGCGGAGCATCGTCATTCTTCAGGATTGCCCTGCTGGCCCTACCTCACAGCTGAAGCTTTAAAAAACAGGATGGGCCGCCAGCCACCTCCTCCAACTCAACAACATTCTATAACTGATAACTCCCTGAGCCTCAAGACACCTCCCGAATGTCTCCTTCATCCCCTTCCACCCTCAGTGGATGATAATATCAAGGAGTGTCCTCTTGCTCCTCTTCCACCCTCAGTGGATGATAATCTGAAGGAGTATCTCCTGG')

'receptor'

In [None]:
#@title Convert ipynb to HTML in Colab
# Upload ipynb
from google.colab import files
f = files.upload()

# Nothing to convert if the upload dialog was dismissed without a file.
if not f:
    raise ValueError("No notebook was uploaded.")

# Convert ipynb to html
import subprocess
from pathlib import Path
file0 = list(f.keys())[0]
# check=True makes a failed install or conversion stop the cell here, instead
# of surfacing later as a confusing "file not found" at the download step.
_ = subprocess.run(["pip", "install", "nbconvert"], check=True)
_ = subprocess.run(["jupyter", "nbconvert", file0, "--to", "html"], check=True)

# download the html
# Swap the extension rather than slicing a fixed number of characters, so the
# name is right even when the upload does not end in exactly ".ipynb".
files.download(str(Path(file0).with_suffix(".html")))