In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# `%%capture` has to remain the first line of the cell; it hides the pip output.
# NOTE(review): Unsloth is installed from the repository's default branch and
# peft / accelerate / bitsandbytes are unpinned, so a later re-run may pull
# versions newer than the "2024.7" release shown in the outputs below —
# consider pinning a commit / versions for reproducibility.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# `--no-deps`: install exactly the listed packages without resolving their dependencies.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
from unsloth import FastLanguageModel
import torch

# Notebook-wide settings. `max_seq_length`, `dtype` and `load_in_4bit` are read
# again by the trainer cell and by the adapter-reload cell further down, so this
# cell must have been run before either of them.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# fourbit_models = [
#     "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
#     "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
#     "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
#     "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
#     "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
#     "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
#     "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
#     "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
#     "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!
#     "unsloth/Phi-3-medium-4k-instruct",
#     "unsloth/gemma-2-9b-bnb-4bit",
#     "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
# ] # More models at https://huggingface.co/unsloth

# Load the base (non-instruct) Llama-3.1-8B checkpoint and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [3]:
# Wrap the loaded base model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down). `model` is rebound to the
# adapter-wrapped model, so re-running this cell without reloading the base
# model would wrap an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed passed to the adapter setup
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
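The original template for this notebook uses the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of the 52K-example original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). In this notebook we instead load a custom gene-gene relation extraction dataset from Google Drive that follows the same Alpaca layout (`instruction`, `input`, `output` fields). You can replace this code section with your own data prep.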



In [4]:
# Mount Google Drive: the training data, the inference inputs/outputs and the
# saved adapters used below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import json

# Load your custom dataset (adjust the path as necessary)
file_path = '/content/drive/MyDrive/llama3_8b_finetune_data/dl_train_data.json'

# Read the file as UTF-8 explicitly instead of relying on the platform default
# encoding, so non-ASCII characters in the text are decoded the same way on
# every machine.
with open(file_path, 'r', encoding='utf-8') as f:
    custom_dataset = json.load(f)


In [6]:
from datasets import Dataset
import pandas as pd

# Convert the JSON data to a pandas DataFrame.
# The formatting cell below reads the `instruction`, `input` and `output`
# columns, so every record is expected to provide those three fields.
custom_dataset_df = pd.DataFrame(custom_dataset)

# Convert the pandas DataFrame to a Hugging Face dataset
hf_dataset = Dataset.from_pandas(custom_dataset_df)

In [7]:
import transformers


# Define the EOS token. It is appended to every training example by
# `formatting_prompts_func` so the model learns where a response ends.
# NOTE(review): the fallback literal is only used when the tokenizer defines no
# EOS token; confirm that '<|endoftext|>' actually exists in this tokenizer's
# vocabulary, otherwise it would be tokenized as ordinary text.
EOS_TOKEN = tokenizer.eos_token if tokenizer.eos_token else '<|endoftext|>'

# Training prompt template with three positional slots: instruction, input and
# response. Inference prompts should use exactly the same text, leaving the
# response slot empty.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. Only return the triplet in the response.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    """Render a batch of records into complete training prompts.

    ``examples`` is a column-oriented batch: a mapping whose ``"instruction"``,
    ``"input"`` and ``"output"`` entries are parallel sequences. Each record is
    substituted into ``alpaca_prompt`` and terminated with ``EOS_TOKEN``.

    Returns a dict ``{"text": [...]}`` holding one prompt string per record.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in columns
        ]
    }

# Apply the formatting function to the dataset.
# With batched=True the function receives a batch of columns (lists) rather
# than a single record, and the returned "text" list becomes a new column.
formatted_dataset = hf_dataset.map(formatting_prompts_func, batched=True)

# Verify the formatting by printing the first rendered training example
print(formatted_dataset['text'][0])


Map:   0%|          | 0/5036 [00:00<?, ? examples/s]

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. Only return the triplet in the response.

### Instruction:
In this Gene-Gene relation extraction task,you need to extract the (geneA, relation, geneB) triplet from the text, such as: (AKT, inhibit, TSC). The second element in the triple means the realation that the geneA has with geneB. Types of relations are: inhibit, be inhibited by, activate, be activated by. Please return all the relations extracted from the text in ternary format (GENE, RELATION, GENE). If there are more than one triplet, please write in this form: '(GENE, RELATION, GENE),(GENE, RELATION, GENE),......'. You will be provided with a text consists of a sentence and a gene pair in the format of (geneA, geneB). You need to classify the relation between the gene pair from the sentence and return me a (geneA, relation, geneB) triplet. For example, the text inpput is

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but for a full run you can set `num_train_epochs=1` and remove the `max_steps` setting. We also support TRL's `DPOTrainer`!

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the "text" column produced by
# the formatting cell.
# One optimizer step consumes per_device_train_batch_size *
# gradient_accumulation_steps = 2 * 4 = 8 examples, so max_steps = 60 covers
# 480 of the 5,036 examples reported by the progress bar: this is a short
# demonstration run, not a full pass over the data.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60, # overrides num_train_epochs when both are given
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1, # log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/5036 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
#@title Show current memory stats
# Baseline taken before training. `start_gpu_memory` and `max_memory` are read
# again by the "Show final memory and time stats" cell, so run this cell first.
gpu_stats = torch.cuda.get_device_properties(0)
# Bytes -> GB via three divisions by 1024, rounded to three decimals.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.984 GB of memory reserved.


In [9]:
# Run fine-tuning. The returned object's `.metrics['train_runtime']` is read by
# the final memory/time stats cell below.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 5,036 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.3427
2,2.3651
3,2.3048
4,2.1913
5,1.9922
6,1.7637
7,1.4397
8,1.2983
9,0.9584
10,0.7244


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

441.382 seconds used for training.
7.36 minutes used for training.
Peak reserved memory = 7.902 GB.
Peak reserved memory for training = 1.918 GB.
Peak reserved memory % of max memory = 53.58 %.
Peak reserved memory for training % of max memory = 13.005 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!



 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [6]:
# alpaca_prompt = Copied from above
# NOTE(review): this cell uses whichever `alpaca_prompt` is currently defined in
# the kernel; make sure it is the same template the model was trained with.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Build a single prompt with an empty response slot and move the tensors to the GPU.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "In this Gene-Gene relation extraction task, you need to follow 3 steps. You need to extract the (gene, relation, gene) triplet from the text. The second element in the triple means the relation between the two genes. Types of relations are 'activate', 'be activated by', 'inhibit', and 'be inhibited by'. Please return all the relations extracted from the text in ternary format (GENE, RELATION, GENE). If there are more than one triple, please write in this form: '(GENE, RELATION, GENE),(GENE, RELATION, GENE),......'. Please return me 'None' if there's no gene-gene relationship in the text.", # instruction
        "Atopic dermatitis (AD) is characterized by a defective skin barrier which allows increased allergen and pathogen penetration. Loricrin (LOR) and involucrin (IVL) are proteins important for skin barrier formation and integrity. In this study, we demonstrate that the gene and protein expression of LOR and IVL is significantly decreased in involved (LOR: p<0.001; IVL: p<0.001) and uninvolved (LOR: p<0.001; IVL: p<0.001) skin of AD subjects, as compared to skin from healthy subjects. Using primary keratinocytes, we further demonstrate the down-regulatory effect of IL-4 and IL-13--which are over-expressed in the skin of AD patients--on LOR and IVL expression in keratinocytes. Additionally, skin biopsies from signal transducer and activator of transcription (STAT)-6 transgenic mice were deficient in the expression and production of LOR and IVL. This study suggests that Th2 cytokines inhibit expression of LOR and IVL through a STAT-6 dependent mechanism.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Stream tokens to the cell output as they are generated; the return value of
# generate() is discarded because the streamer already prints the text.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
In this Gene-Gene relation extraction task, you need to follow 3 steps. You need to extract the (gene, relation, gene) triplet from the text. The second element in the triple means the relation between the two genes. Types of relations are 'activate', 'be activated by', 'inhibit', and 'be inhibited by'. Please return all the relations extracted from the text in ternary format (GENE, RELATION, GENE). If there are more than one triple, please write in this form: '(GENE, RELATION, GENE),(GENE, RELATION, GENE),......'. Please return me 'None' if there's no gene-gene relationship in the text.

### Input:
Atopic dermatitis (AD) is characterized by a defective skin barrier which allows increased allergen and pathogen penetration. Loricrin (LOR) and involucrin (IVL) are proteins important for skin barrier formation and i

In [10]:
import json

# Alpaca-style prompt template. This is kept identical to the template used to
# build the training examples (including the final "Only return the triplet in
# the response." sentence): prompting a fine-tuned model with a different
# template than the one it was trained on changes its input distribution.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. Only return the triplet in the response.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Input / output JSONL files (one JSON object per line).
input_file = "/content/drive/MyDrive/llama3_8b_finetune_data/IL13_1970-2010_20small.jsonl"
output_file = "/content/drive/MyDrive/llama3_8b_finetune_data/IL13_1970-2010_20small_output.jsonl"

# The instruction is the same for every record, so it is built once.
# NOTE(review): the training example printed earlier in the notebook uses a
# different instruction wording (sentence + gene pair); confirm that this
# abstract-level instruction is the intended inference setup.
instruction = "In this Gene-Gene relation extraction task, you need to follow 3 steps. You need to extract the (gene, relation, gene) triplet from the text. The second element in the triple means the relation between the two genes. Types of relations are 'activate', 'be activated by', 'inhibit', and 'be inhibited by'. Please return all the relations extracted from the text in ternary format (GENE, RELATION, GENE). If there are more than one triple, please write in this form: '(GENE, RELATION, GENE),(GENE, RELATION, GENE),......'."

# Read the JSONL file, skipping blank lines (which json.loads would reject).
with open(input_file, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

for entry in data:
    input_text = entry["Abstract"]
    prompt = alpaca_prompt.format(instruction, input_text, "")

    # Tokenize the prompt and move it to the GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate the continuation.
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # The returned sequence starts with the prompt tokens. Decode only the newly
    # generated tokens instead of splitting the decoded text on
    # "### Response:", which would pick the wrong segment if that marker also
    # appeared in the abstract or in the generated text.
    prompt_length = inputs["input_ids"].shape[1]
    response_text = tokenizer.batch_decode(
        outputs[:, prompt_length:], skip_special_tokens=True
    )[0].strip()

    # Store the prediction on the record.
    entry["predict"] = response_text

# Write the processed records back as JSONL.
with open(output_file, "w", encoding="utf-8") as f:
    for entry in data:
        f.write(json.dumps(entry) + "\n")

print(f"Processed data saved to {output_file}")

Processed data saved to /content/drive/MyDrive/llama3_8b_finetune_data/IL13_1970-2010_20small_output.jsonl


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.


In [3]:
# Save the LoRA adapters and the tokenizer to Google Drive; the reload cell
# below loads the model from this same directory.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt —
# confirm the directory was written completely before relying on it.
model.save_pretrained("/content/drive/MyDrive/lora_model3.1") # Local saving
tokenizer.save_pretrained("/content/drive/MyDrive/lora_model3.1")


KeyboardInterrupt: 

Now, if you want to load the LoRA adapters we just saved for inference, change `False` to `True` in the guard of the cell below (in this copy it is already set to `True`):

In [6]:
if True:
    # Reload the model from the directory the adapters were saved to.
    # NOTE(review): `max_seq_length`, `dtype` and `load_in_4bit` are not defined
    # in this cell; they must already exist in the kernel (they are set in the
    # model-loading cell at the top of the notebook).
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/lora_model3.1", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference


# Prompt template, kept character-for-character identical to the one used to
# build the training examples (note the space after "request."), so the
# fine-tuned model sees the same prefix it was trained on.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. Only return the triplet in the response.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
# Build a single prompt with an empty response slot and move the tensors to the GPU.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "In this Gene-Gene relation extraction task, you need to follow 3 steps. You need to extract the (gene, relation, gene) triplet from the text. The second element in the triple means the relation between the two genes. Types of relations are 'activate', 'be activated by', 'inhibit', and 'be inhibited by'. Please return all the relations extracted from the text in ternary format (GENE, RELATION, GENE). If there are more than one triple, please write in this form: '(GENE, RELATION, GENE),(GENE, RELATION, GENE),......'. Please return me 'None' if there's no gene-gene relationship in the text.", # instruction
        "Atopic dermatitis (AD) is characterized by a defective skin barrier which allows increased allergen and pathogen penetration. Loricrin (LOR) and involucrin (IVL) are proteins important for skin barrier formation and integrity. In this study, we demonstrate that the gene and protein expression of LOR and IVL is significantly decreased in involved (LOR: p<0.001; IVL: p<0.001) and uninvolved (LOR: p<0.001; IVL: p<0.001) skin of AD subjects, as compared to skin from healthy subjects. Using primary keratinocytes, we further demonstrate the down-regulatory effect of IL-4 and IL-13--which are over-expressed in the skin of AD patients--on LOR and IVL expression in keratinocytes. Additionally, skin biopsies from signal transducer and activator of transcription (STAT)-6 transgenic mice were deficient in the expression and production of LOR and IVL. This study suggests that Th2 cytokines inhibit expression of LOR and IVL through a STAT-6 dependent mechanism.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate and display the decoded sequences (prompt + continuation, special
# tokens included) as the cell's output.
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)


==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


["Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.Only return the triplet in the response.\n\n### Instruction:\nIn this Gene-Gene relation extraction task, you need to follow 3 steps. You need to extract the (gene, relation, gene) triplet from the text. The second element in the triple means the relation between the two genes. Types of relations are 'activate', 'be activated by', 'inhibit', and 'be inhibited by'. Please return all the relations extracted from the text in ternary format (GENE, RELATION, GENE). If there are more than one triple, please write in this form: '(GENE, RELATION, GENE),(GENE, RELATION, GENE),......'. Please return me 'None' if there's no gene-gene relationship in the text.\n\n### Input:\nAtopic dermatitis (AD) is characterized by a defective skin barrier which allows increased allergen and pathogen penetration. Loricrin (LOR) and involucrin (IVL) are pro

In [8]:
import json

import torch  # used below for torch.no_grad(); imported here so the cell does not depend on an earlier cell

# Prompt template, kept identical to the one used to build the training
# examples: prompting a fine-tuned model with a different template than the one
# it was trained on changes its input distribution.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. Only return the triplet in the response.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Input / output JSONL files (one JSON object per line).
input_file = "/content/drive/MyDrive/llama3_8b_finetune_data/IL13_1970-2010_abstract.jsonl"
output_file = "/content/drive/MyDrive/llama3_8b_finetune_data/IL13_1970-2010_0-500output3.1.jsonl"

# Only the first MAX_ENTRIES records are processed.
MAX_ENTRIES = 500

# The instruction is the same for every record, so it is built once.
instruction = "In this Gene-Gene relation extraction task, you need to follow 3 steps. You need to extract the (gene, relation, gene) triplet from the text. The second element in the triple means the relation between the two genes. Types of relations are 'activate', 'be activated by', 'inhibit', and 'be inhibited by'. Please return all the relations extracted from the text in ternary format (GENE, RELATION, GENE). If there are more than one triple, please write in this form: '(GENE, RELATION, GENE),(GENE, RELATION, GENE),......'."

# Read the JSONL file, skipping blank lines (which json.loads would reject).
with open(input_file, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

data = data[:MAX_ENTRIES]

# Each record is written as soon as its prediction is available, so an
# interrupted run (e.g. a Colab disconnect) keeps the records finished so far
# instead of losing all of them. The output file is truncated when the run
# starts.
with open(output_file, "w", encoding="utf-8") as out_f:
    for idx, entry in enumerate(data):
        print(f"Processing entry {idx + 1} / {len(data)}")
        input_text = entry["Abstract"]
        prompt = alpaca_prompt.format(instruction, input_text, "")

        # Tokenize the prompt and move it to the GPU.
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Generate the continuation without tracking gradients.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)

        # The returned sequence starts with the prompt tokens. Decode only the
        # newly generated tokens instead of splitting the decoded text on
        # "### Response:", which would pick the wrong segment if that marker
        # also appeared in the abstract or in the generated text.
        prompt_length = inputs["input_ids"].shape[1]
        response_text = tokenizer.batch_decode(
            outputs[:, prompt_length:], skip_special_tokens=True
        )[0].strip()

        # Store the prediction on the record and persist it immediately.
        entry["predict"] = response_text
        out_f.write(json.dumps(entry) + "\n")
        out_f.flush()

print(f"Processed data saved to {output_file}")

Processing entry 1 / 500
Processing entry 2 / 500
Processing entry 3 / 500
Processing entry 4 / 500
Processing entry 5 / 500
Processing entry 6 / 500
Processing entry 7 / 500
Processing entry 8 / 500
Processing entry 9 / 500
Processing entry 10 / 500
Processing entry 11 / 500
Processing entry 12 / 500
Processing entry 13 / 500
Processing entry 14 / 500
Processing entry 15 / 500
Processing entry 16 / 500
Processing entry 17 / 500
Processing entry 18 / 500
Processing entry 19 / 500
Processing entry 20 / 500
Processing entry 21 / 500
Processing entry 22 / 500
Processing entry 23 / 500
Processing entry 24 / 500
Processing entry 25 / 500
Processing entry 26 / 500
Processing entry 27 / 500
Processing entry 28 / 500
Processing entry 29 / 500
Processing entry 30 / 500
Processing entry 31 / 500
Processing entry 32 / 500
Processing entry 33 / 500
Processing entry 34 / 500
Processing entry 35 / 500
Processing entry 36 / 500
Processing entry 37 / 500
Processing entry 38 / 500
Processing entry 39 /