# Getting the Data

If you are running this notebook in Google Colab, uncomment and run the cell below to mount Google Drive and change into the project directory.

In [2]:
# Colab only: mount Google Drive and switch to the project directory.
# The code is kept as real comments (not wrapped in a string literal) so the
# cell is a true no-op when left disabled and does not echo the disabled code
# back as its output. To enable, uncomment the three lines below and adjust
# the path to wherever your project lives.
#
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/VSCode/Genetic_Frame/Final_Proj

Mounted at /content/drive


In [3]:
!wget "https://rest.uniprot.org/uniprotkb/stream?fields=accession,reviewed,id,protein_name,gene_names,organism_name,length,sequence,cc_function,cc_subcellular_location,cc_domain,protein_families&format=tsv&query=reviewed:true" -O uniprot_data.tsv

--2024-12-17 02:48:39--  https://rest.uniprot.org/uniprotkb/stream?fields=accession,reviewed,id,protein_name,gene_names,organism_name,length,sequence,cc_function,cc_subcellular_location,cc_domain,protein_families&format=tsv&query=reviewed:true
Resolving rest.uniprot.org (rest.uniprot.org)... 193.62.193.81
Connecting to rest.uniprot.org (rest.uniprot.org)|193.62.193.81|:443... connected.
HTTP request sent, awaiting response... 200 
Length: unspecified [text/plain]
Saving to: ‘uniprot_data.tsv’

uniprot_data.tsv        [   <=>              ] 530.54M  1.62MB/s    in 4m 27s  

2024-12-17 02:53:07 (1.99 MB/s) - ‘uniprot_data.tsv’ saved [556312950]



In [4]:
import pandas as pd

# Read the tab-delimited UniProt export into a DataFrame and preview it.
df = pd.read_csv("uniprot_data.tsv", delimiter="\t")
df.head()

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Sequence,Function [CC],Subcellular location [CC],Domain [CC],Protein families
0,A0A009IHW8,reviewed,ABTIR_ACIB9,2' cyclic ADP-D-ribose synthase AbTIR (2'cADPR...,J512_3302,Acinetobacter baumannii (strain 1295743),269,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,FUNCTION: NAD(+) hydrolase (NADase) that catal...,,DOMAIN: The TIR domain mediates NAD(+) hydrola...,
1,A0A023I7E1,reviewed,ENG1_RHIMI,"Glucan endo-1,3-beta-D-glucosidase 1 (Endo-1,3...",ENG1 LAM81A,Rhizomucor miehei,796,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,"FUNCTION: Cleaves internal linkages in 1,3-bet...","SUBCELLULAR LOCATION: Secreted, cell wall {ECO...",,Glycosyl hydrolase 81 family
2,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,FUNCTION: [Capsid protein C]: Plays a role in ...,SUBCELLULAR LOCATION: [Capsid protein C]: Viri...,DOMAIN: [Small envelope protein M]: The transm...,Class I-like SAM-binding methyltransferase sup...
3,A0A024SC78,reviewed,CUTI1_HYPJR,Cutinase (EC 3.1.1.74),M419DRAFT_76732,Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,248,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,FUNCTION: Catalyzes the hydrolysis of complex ...,SUBCELLULAR LOCATION: Secreted {ECO:0000255|Ru...,"DOMAIN: In contract to classical cutinases, po...",Cutinase family
4,A0A024SH76,reviewed,GUX2_HYPJR,"Exoglucanase 2 (EC 3.2.1.91) (1,4-beta-cellobi...",cbh2 M419DRAFT_122470,Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,471,MIVGILTTLATLATLAASVPLEERQACSSVWGQCGGQNWSGPTCCA...,FUNCTION: Exocellobiohydrolases (CBH) that cat...,SUBCELLULAR LOCATION: Secreted {ECO:0000250|Un...,DOMAIN: The enzyme consists of two functional ...,Glycosyl hydrolase 6 (cellulase B) family


In [5]:
# Drop the Reviewed / Entry Name / Protein names / Gene Names columns.
# errors='ignore' keeps this cell idempotent: because it rebinds `df`, running
# it a second time would otherwise raise KeyError for the already-removed columns.
df = df.drop(columns=['Reviewed', 'Entry Name', 'Protein names', 'Gene Names'], errors='ignore')

# Defining our Llama Model

In [6]:
%%capture
# %%capture hides the installer output of this cell.
# Step 1 installs the released unsloth package; step 2 removes it again and
# installs the current GitHub version with --no-deps, so the dependencies that
# are already installed are left as they are.
# NOTE(review): nothing here is version-pinned, so a rerun may pick up a
# different unsloth than the one recorded in the outputs below.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [7]:
from unsloth import FastLanguageModel
import torch
# Maximum sequence length handed to the model and, later, to the trainer.
# Author's note: either use 35213, or use 5200 together with packing = True.
max_seq_length = 5200
# NOTE(review): None presumably lets unsloth choose the dtype itself — confirm.
dtype = None
# Load the weights in 4-bit form.
load_in_4bit = True

# Reference list of checkpoint names only; nothing in this cell reads it.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",

    "unsloth/Llama-3.2-1B-bnb-4bit",
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"]

# Load the Llama-3.2-3B-Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,

)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.9. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [8]:
# Attach LoRA adapters to the loaded model; the result replaces `model`.
# NOTE(review): because `model` is rebound to the wrapped model, re-running this
# cell without first reloading the base model would wrap an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # LoRA rank
    # Adapters go on the attention projections (q/k/v/o) and the MLP projections.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,  # same seed value as the TrainingArguments further down
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.12.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


# Setting up data for Llama

In [40]:
from helper_funcs.cleaner import cleaner
# cleaner is a project helper that is not shown in this notebook. Judging by the
# preview below, it returns one [user message, assistant message] pair per
# protein — TODO confirm against helper_funcs/cleaner.py.
master_lst = cleaner(df)
# Preview the first conversation.
master_lst[0]

[{'content': 'What information can you tell me about the protein sequence: MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR',
  'role': 'user'},
 {'content': "This sequence NAD hydrolase that catalyzes cleavage of NAD into ADP-D-ribose and nicotinamide. In addition to ADPR, also generates a cyclization variant of cyclic ADPR, termed 2'cADPR. Cleaves NADP, but does not cyclize the product. The TIR domain mediates NAD hydrolase activity. Self-association of TIR domains is required for NADase activity. The TIR domain alone is active and produces cADPR.",
  'role': 'assistant'}]

In [44]:
from datasets import Dataset

# Put every conversation into a single-column ("conversations") Hugging Face Dataset.
ans = dict(conversations=master_lst)
df_h = Dataset.from_dict(ans)
df_h

Dataset({
    features: ['conversations'],
    num_rows: 572619
})

In [45]:
# Spot-check one record to confirm the conversations survived the conversion.
df_h[5]['conversations']

[{'content': 'What information can you tell me about the protein sequence: MMKMKQQGLVADLLPNIRVMKTFGHFVFNYYNDNSSKYLHKVYCCVNLFMLLLQFGLCAVNLIVESADVDDLTANTITLLFFTHSIVKICYFAIRSKYFYRTWAIWNNPNSHPLFAESNARYHAIALKKMRLLLFLVGGTTMLAAVAWTVLTFFEHPIRKIVDPVTNETEIIELPQLLIRSFYPFDAGKGITHVLVLVYQFYWVLFMLIDANSLDVLFCSWLLFACEQLQHLKQIMKPLMELSATLDTVVPNSSELFKAGSADHLRDGDNPPPPPPPQSDNMLDLDLRNIYSNRQDFTATFRPTAGMTFNGGVGPNGLTKKQEALVRSAIKYWVERHKHIVRLVTAVGDAYGFALLLHMLTTTITLTLLAYQATKVNGINVYAASTIGYILYTFGQVFLFCIFGNRLIEESTSVMEAAYSCHWYDGSEEAKTFVQIVCQQCQKAMSISGAKFFTVSLDLFASVLGAVVTYFMVLVQLK',
  'role': 'user'},
 {'content': 'This sequence Odorant coreceptor which complexes with conventional odorant receptors to form odorant-sensing units, providing sensitive and prolonged odorant signaling and calcium permeability. Obligate coreceptor of all odorant receptors. Orco is a universal and integral part of the functional odorant receptor, involved in the dendritic localization of other olfactory receptors. Can form functional ion chan

In [46]:
from helper_funcs.format_LM import formatter_LLM
# formatter_LLM is a project helper that is not shown here. It receives the
# tokenizer and the 20 one-letter amino-acid codes, presumably so that every
# residue is tokenized as its own token — TODO confirm against helper_funcs/format_LM.py.
tokenizer = formatter_LLM(tokenizer, list("ACDEFGHIKLMNPQRSTVWY"))
# Size the model's token embeddings to the tokenizer's current vocabulary length.
model.resize_token_embeddings(len(tokenizer))

Embedding(128256, 3072, padding_idx=128004)

In [47]:
# Sanity check: print how a few sequences are tokenized and what they decode back to.
sample_sequences = ["MSLEQKKGADIISKIL", "ACDEFGHIKLMNPQRSTVWY", "GTTSPSTLKTKLSEISR"]
for seq in sample_sequences:
    tokens = tokenizer.tokenize(seq)
    print(
        f"Original: {seq}",
        f"Tokens: {tokens}",
        f"Decoded: {tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens))}",
        "-" * 20,
        sep="\n",
    )

Original: MSLEQKKGADIISKIL
Tokens: ['M', 'S', 'L', 'E', 'Q', 'K', 'K', 'G', 'A', 'D', 'I', 'I', 'S', 'K', 'I', 'L']
Decoded: MSLEQKKGADIISKIL
--------------------
Original: ACDEFGHIKLMNPQRSTVWY
Tokens: ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
Decoded: ACDEFGHIKLMNPQRSTVWY
--------------------
Original: GTTSPSTLKTKLSEISR
Tokens: ['G', 'T', 'T', 'S', 'P', 'S', 'T', 'L', 'K', 'T', 'K', 'L', 'S', 'E', 'I', 'S', 'R']
Decoded: GTTSPSTLKTKLSEISR
--------------------


In [48]:
from unsloth.chat_templates import get_chat_template

# Configure the tokenizer with unsloth's "llama-3.1" chat template; the returned
# tokenizer replaces the previous one.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    '''
    Render each conversation of a batch into one chat-template string.

    examples: a batch whose "conversations" entry is a list of conversations
              (each conversation being a list of role/content messages).
    Returns a dict {"text": [...]} holding one rendered, untokenized string per
    conversation, in the same order; no generation prompt is appended.
    '''
    rendered = []
    for conversation in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize = False, add_generation_prompt = False)
        )
    return {"text": rendered}

In [49]:
# Add a "text" column holding every conversation rendered through the chat
# template. batched = True hands formatting_prompts_func a whole batch per call,
# which is what its examples["conversations"] list handling expects.
dataset = df_h.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/572619 [00:00<?, ? examples/s]

In [50]:
# Spot-check one rendered training string, including its chat-template headers.
dataset[10000]['text']

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 July 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat information can you tell me about the protein sequence: MYGIEYTTVLTFLISIILLNYILKSLTRIMDCIIYRLLFIIVILSPFLRAQNYGINLPITGSMDTAYANSTQEETFLTSTLCLYYPTEAATEINDNSWKDTLSQLFLTKGWPTGSVYFKEYTNIASFSVDPQLYCDYNVVLMKYDATLQLDMSELADLILNEWLCNPMDITLYYYQQTDEANKWISMGSSCTIKVCPLNTQTLGIGCLTTDATTFEEVATAEKLVITDVVDGVNHKLDVTTATCTIRNCKKLGPRENVAVIQVGGSDILDITADPTTAPQTERMMRINWKKWWQVFYTVVDYVDQIIQVMSKRSRSLNSAAFYYRV<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThis sequence Calcium-binding protein that interacts with rotavirus cell receptors once the initial attachment by VP4 has been achieved. Rotavirus attachment and entry into the host cell probably involves multiple sequential contacts between the outer capsid proteins VP4 and VP7, and the cell receptors. Following entry into the host cell, low intracellular or intrave

In [51]:
# Hold out 1% of the examples for testing.
# The split is seeded (3407, the same value used for the LoRA random_state and
# the training seed) so it is reproducible: without a seed, every rerun or
# kernel restart reshuffles the data and can move previously held-out examples
# into the training set.
train_test_split = dataset.train_test_split(test_size=0.01, seed=3407)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Training the Model

In [26]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from transformers import EarlyStoppingCallback

# NOTE(review): this callback is created but never passed to the trainer (the
# `callbacks` argument at the bottom is commented out), so it has no effect.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

# Supervised fine-tuning on the rendered "text" column of the training split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    # Evaluation is disabled; `val_dataset` is not defined in the cells above.
    #eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    # Packing goes together with the 5200-token max_seq_length chosen at load time.
    packing = True,
    args = TrainingArguments(
        # 2 sequences per step x 4 accumulation steps = total batch size 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        #num_train_epochs = 1, # Set this for 1 full training run.
        # Short run; when max_steps is set it overrides num_train_epochs.
        max_steps = 100, # alternative noted by the author: 60
        # Evaluation-related options, disabled together with eval_dataset:
        #eval_strategy="no",
        #eval_steps = 25,
        #load_best_model_at_end = True,
        learning_rate = 1e-3, # alternative noted by the author: 2e-4
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        #log_level = "info",
        report_to = "none", # no experiment tracker; change this to use WandB etc.
    ),
    # Re-enable together with an eval dataset to use the early-stopping callback.
    #callbacks = [early_stopping]
)

Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [27]:
from unsloth.chat_templates import train_on_responses_only
# Train on the assistant responses only. The two strings are the chat-template
# headers that open a user turn and an assistant turn; they tell the helper
# where the instruction ends and the response begins.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/56466 [00:00<?, ? examples/s]

In [28]:
# Decode the input ids of one processed training example to see the model input.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'YGLIYHASLVGQTSPKHKGKISRMLAAKTVLAIRYDAFGEDSSSAMGAENRAKLEARLRILEDRGIRKISGTGKALAKAEKYEHKSEVKTYDPSGDSTLPTCSKKRKIEEVDKEDEITEKKAKKAKIKIKAEVEEEMEEAEEEQVVEEEPTVKKKKKKDKKKHIKEEPLSEEEPCTSTAVPSPEKKKKKKKKKDAED<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThis sequence Required for 60S ribosomal subunit biogenesis. Core component of box C/D small nucleolar ribonucleoprotein particles. Required for the biogenesis of box C/D snoRNAs such as U3, U8 and U14 snoRNAs. Part of the small subunit processome, first precursor of the small eukaryotic ribosomal subunit. During the assembly of the SSU processome in the nucleolus, many ribosome biogenesis factors, an RNA chaperone and ribosomal proteins associate with the nascent pre-rRNA and work in concert to generate RNA folding, modifications, rearrangements and cleavage as well as targeted degradation of pre-ribosomal RNA by the RNA exosome. Nucleus, nucleolus. Nucleus, nucleoplasm. Note=Localizes to the nucleolus with a minor part present in t

In [29]:
# Decode the labels of the same example. Positions labelled -100 are swapped
# for the id of a space before decoding, so the masked-out part shows up as
# blanks and only the text that is actually trained on stays readable.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                                                                                                                                                                         \n\nThis sequence Required for 60S ribosomal subunit biogenesis. Core component of box C/D small nucleolar ribonucleoprotein particles. Required for the biogenesis of box C/D snoRNAs such as U3, U8 and U14 snoRNAs. Part of the small subunit processome, first precursor of the small eukaryotic ribosomal subunit. During the assembly of the SSU processome in the nucleolus, many ribosome biogenesis factors, an RNA chaperone and ribosomal proteins associate with the nascent pre-rRNA and work in concert to generate RNA folding, modifications, rearrangements and cleavage as well as targeted degradation of pre-ribosomal RNA by the RNA exosome. Nucleus, nucleolus. Nucleus, nucleoplasm. Note=Localizes to the nucleolus with a minor part present in the nucleoplas. NOP5/NOP56 famil.<|eot_id|><|eot_id|

In [30]:
# Run the fine-tuning loop and keep the returned training statistics.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 56,466 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,3.9864
2,4.2508
3,3.7084
4,2.8276
5,2.0899
6,2.0392
7,2.0415
8,1.8639
9,1.6915
10,1.6536


# Saving Model

In [31]:
# Save the fine-tuned model and the tokenizer files into ./lora_model.
# NOTE(review): `model` is the LoRA-wrapped model, so this presumably stores the
# adapter weights rather than a full merged checkpoint — confirm before sharing.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [32]:
import shutil
from google.colab import files
# Colab only: zip the lora_model directory into lora_model.zip and hand the
# archive to the browser as a download.
shutil.make_archive('lora_model', 'zip', 'lora_model')
files.download('lora_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Testing

In [None]:
# Reload the model and tokenizer from the saved ./lora_model directory, using
# the same max_seq_length / dtype / load_in_4bit settings as the original load.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
# Switch the model to unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

In [78]:
import numpy as np
from transformers import TextStreamer

# Qualitative check: generate an answer for a few random held-out proteins and
# print it next to the reference answer.
np.random.seed(101)
# Sample from the whole test split instead of a hard-coded index range, which
# would silently skip examples (or raise IndexError) whenever the split size
# differs. Sampling without replacement keeps the same protein from showing twice.
sample_indices = np.random.choice(
    len(test_dataset), size = min(6, len(test_dataset)), replace = False
)
for i in sample_indices:
  # Fetch the record once; int() turns the numpy integer into a plain Python index.
  conversation = test_dataset[int(i)]['conversations']
  question = conversation[0]['content']
  reference = conversation[1]['content']
  messages = [
  {"role": "user", "content": question},
  ]
  # Tokenize the prompt with the assistant header appended so the model answers.
  inputs = tokenizer.apply_chat_template(
      messages,
      tokenize = True,
      add_generation_prompt = True,
      return_tensors = "pt",
  ).to("cuda")
  print(f"Question: {question}")
  print("Predicted Answer:")
  # Stream the generated tokens to stdout as they are produced (prompt omitted).
  text_streamer = TextStreamer(tokenizer, skip_prompt = True)
  _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
                    use_cache = True, temperature = 1.5, min_p = 0.1)
  print(f"Actual Answer: {reference}")
  print()


Question: What information can you tell me about the protein sequence: MINSLLTRLFGSRNERQLRQLNSIVAKINALETELQKLSDTALQAKTTEFKQSIQDGKSLDKLLPEAFAVCREASRRVLGMRHYDVQLIGGMVLHLGKIAEMRTGEGKTLVATLPVYLNALAGNGVHVVTVNDYLARRDAAHMGRLYNWLGLSVGVVYPGMPHSDKHAAYGADITYGTNNEFGFDYLRDNMALSKADRYQRGLHYAIVDEVDSILIDEARTPLIISGPADESPDLYIRVNRIIPHLTRQENEEAEGDYWVDEKGKQVHLSEVGMERAEELLHQAGILGEEDDSLYAAQNLSVVHHLNAALRAHALYQRDVDYIVRDGEVVIVDEFTGRTLAGRRWSDGLHQAIEAKEGVPVQRENQTLASITFQNLFRIYKKLSGMTGTADTEAYEFQSIYGLEVMVIPTNRPTVRKDYPDQVFLNRSSKFNAVLEDIKDCAQRGQPVLVGTTSIEISEMLSEHLRKAKVKHEVLNAKQHEREATIVANAGLPGAVTIATNMAGRGTDIVLGGSLDTVLAELDPDATEEDRFRVKTAWNRRHEAVKAAGGLHIIGTERHESRRIDNQLRGRAGRQGDPGSSRFYLSLEDSLMRIFASEWVQKVMRLMGMKEGDVIEDRRVTRQIERAQRKVEAHNFDIRKNLLDYDDVNNEQRKVVYAQRDELLDAESIKENIDSIRHEVIDALVTRFVPEHSIDEQWDLPGLQATLQSEWGLHLPLIEMLKGREEVDAERIAFLVQDAVDKHCAEREASIGAETMRALEKHVMLTVLDQGWKEHLATMDYLRQGIHLRGYAQKQPKQEYKREAFELFSEMLEHVKREVIASLARVRIRSEEEMAALEEQERRQVDTLLRQSQFQHQEAGGYGTGDEAVSLQRQLAGQGAAIAQVIRDTPKVGRNDPCPCGSGKKYKHCHGLVT
Predicted Answ