In [None]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [None]:
from datasets import load_from_disk, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
# To force CPU execution, replace this branch with a plain `device = "cpu"`.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## 1. Load Dataset and Prepare Data for Weak Supervision

In [None]:
# Load the dataset previously saved with `save_to_disk` from the notebook's input directory.
dataset = load_from_disk("../input/chat-weak/chat_weak")

In [None]:
# Write a copy of the dataset into the working directory.
dataset.save_to_disk("/kaggle/working/chat_weak")

In [None]:
# Rebind `dataset` to the working-directory copy; every later cell reads from this one.
dataset = load_from_disk("/kaggle/working/chat_weak")

## 2. Load Model

In [None]:
# Checkpoint identifier shared by the model and its tokenizer.
model_name = 'google/flan-t5-base'

# Load the seq2seq weights in bfloat16, then move the model onto the selected device.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
original_model = original_model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name)

## 2. Instruction Fine-tuning

In [None]:
def tokenize_function(example):
    """Tokenize a batch of messages and sentiment labels for seq2seq training.

    Meant to be used with ``dataset.map(..., batched=True)``: ``example`` is a
    batch mapping whose ``"message"`` and ``"sentiment"`` entries are lists of
    strings. Each message is wrapped in an instruction prompt before it is
    tokenized with the module-level ``tokenizer``.

    The following columns are written onto the batch, which is then returned:

    * ``input_ids``      - prompt token ids, padded/truncated to the maximum length.
    * ``attention_mask`` - 1 for real prompt tokens, 0 for padding, so the
      encoder does not attend to the padding added by ``padding="max_length"``.
    * ``labels``         - target token ids with every padding position replaced
      by -100, the index the cross-entropy loss ignores. Without this the loss
      is computed mostly over padding, because the sentiment targets are far
      shorter than the padded length.
    """
    start_prompt = 'Evaluate the sentiment of the following sentence.\n\n'
    end_prompt = '\n\nSentiment: '
    prompts = [start_prompt + message + end_prompt for message in example["message"]]

    # Plain Python lists are returned (no return_tensors): `map` stores the
    # result in Arrow anyway, and the data collator builds tensors at batch time.
    encoded_prompts = tokenizer(prompts, padding="max_length", truncation=True)
    example['input_ids'] = encoded_prompts["input_ids"]
    example['attention_mask'] = encoded_prompts["attention_mask"]

    encoded_targets = tokenizer(example["sentiment"], padding="max_length", truncation=True)
    pad_id = tokenizer.pad_token_id
    example['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in target_ids]
        for target_ids in encoded_targets["input_ids"]
    ]

    return example

# `dataset` holds three splits (accessed elsewhere in this notebook as 'train', 'val' and 'test').
# map() applies tokenize_function to every split in batches; the listed raw
# columns are then dropped from the result.
columns_to_drop = ['conversation_id', 'message', 'sentiment', '__index_level_0__', 'speaker_id']
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(columns_to_drop)


In [None]:
# Assemble the per-split shape report first, then emit it in one print call.
shape_report = "\n".join([
    f"Shapes of the datasets:",
    f"Training: {tokenized_datasets['train'].shape}",
    f"Validation: {tokenized_datasets['val'].shape}",
    f"Test: {tokenized_datasets['test'].shape}",
])
print(shape_report)

# Blank line, then the full repr of the tokenized dataset container.
print()
print(tokenized_datasets)


### 2.1 Fine-tune Model

In [None]:
# The directory name ends with the current Unix time in seconds, so each run gets its own folder.
output_dir = f'/kaggle/working/model/instruction-{str(int(time.time()))}'

# Hyperparameters for the full (all-weights) instruction fine-tuning run.
instruction_hparams = dict(
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=500,
    evaluation_strategy="steps",
    save_total_limit=1,
    load_best_model_at_end=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)
training_args = TrainingArguments(output_dir=output_dir, **instruction_hparams)

# Train on the 'train' split and evaluate on the 'val' split of the tokenized data.
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
)

In [None]:
# Ask PyTorch to release cached CUDA memory it is not currently using before training starts.
torch.cuda.empty_cache()

In [None]:
# Run the full fine-tuning loop on `original_model` with the Trainer configured above.
trainer.train()

In [None]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory = os.curdir, file_name = 'directory.zip'):
    """
    Zip all the files under a directory and return a download link for the archive.

    The process working directory is left untouched. Archive members are
    stored with paths relative to `directory`.

    Parameters
    _____
    directory: str
        directory that needs to be zipped, default is the current working directory

    file_name: str
        the name of the zipped file (including .zip), default is 'directory.zip';
        the archive is created inside `directory`

    Returns
    _____
    An IPython FileLink to the archive, which renders as a hyperlink that can
    be used to download the zip file.
    """
    root = os.path.abspath(directory)
    zip_path = os.path.abspath(os.path.join(root, file_name))

    # The context manager guarantees the central directory is written and the
    # handle is released even if adding a file fails part-way through.
    with zipfile.ZipFile(zip_path, mode='w') as zip_ref:
        for folder, _, files in os.walk(root):
            for file in files:
                file_path = os.path.join(folder, file)
                # Skip only the archive being written, not every file whose
                # name happens to contain the archive's name.
                if file_path == zip_path:
                    continue
                zip_ref.write(file_path, arcname=os.path.relpath(file_path, root))

    # Link relative to the (unchanged) working directory.
    return FileLink(os.path.relpath(zip_path))

In [None]:
# Zip the current working directory with the default archive name; the returned link is the cell output.
zip_dir()

## 3. LoRA

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings, collected in one mapping and then unpacked into the config object.
lora_settings = dict(
    r=32,  # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
)
lora_config = LoraConfig(**lora_settings)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite the name, nothing is printed here: the summary is returned as a
    string and the caller decides how to display it.

    Parameters
    ----------
    model
        Any object whose ``named_parameters()`` yields ``(name, param)`` pairs
        where ``param`` provides ``numel()`` and ``requires_grad``.

    Returns
    -------
    str
        Three lines: the trainable parameter count, the total parameter count,
        and the trainable share as a percentage with two decimals. A model
        without any parameters reports 0.00% instead of raising
        ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count

    # Guard the ratio so a parameter-free model does not divide by zero.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0

    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Report the parameter counts of `original_model` before any LoRA adapters are attached to it.
print(print_number_of_trainable_model_parameters(original_model))

In [None]:
# NOTE(review): `original_model` is the same object that was handed to `trainer`
# earlier, so by the time this cell runs it has presumably already been updated
# by `trainer.train()`. If LoRA is meant to start from the pristine checkpoint,
# reload the base model first - confirm intent.
lora_model = get_peft_model(original_model, lora_config)

# Same parameter summary as for the base model, now for the LoRA-wrapped model.
lora_param_summary = print_number_of_trainable_model_parameters(lora_model)
print(lora_param_summary)

In [None]:
# The directory name ends with the current Unix time in seconds, so each run gets its own folder.
output_dir = f'/kaggle/working/model/lora-{str(int(time.time()))}'

# Hyperparameters for the LoRA run; the learning rate here is 1e-3.
lora_hparams = dict(
    learning_rate=1e-3,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=500,
    evaluation_strategy="steps",
    save_total_limit=1,
    load_best_model_at_end=True,
    # auto_find_batch_size=True is left disabled; batch sizes are fixed explicitly below.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)
lora_training_args = TrainingArguments(output_dir=output_dir, **lora_hparams)

# Train the LoRA-wrapped model on the 'train' split and evaluate on the 'val' split.
lora_trainer = Trainer(
    model=lora_model,
    args=lora_training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
)

In [None]:
# Run the LoRA fine-tuning loop with the trainer configured above.
lora_trainer.train()

# Save the model held by the trainer and the tokenizer side by side in one directory.
lora_model_path="/kaggle/working/model/lora-checkpoint"
for artifact in (lora_trainer.model, tokenizer):
    artifact.save_pretrained(lora_model_path)