In [None]:
!pip install peft huggingface_hub
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes

In [3]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [35]:
from datasets import load_dataset, DatasetDict
from datasets import load_dataset

from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments, pipeline
from peft import LoraConfig
from trl import SFTTrainer

import torch

import re
from tqdm import tqdm

In [None]:
# Load Dataset

raw_train_sets = load_dataset("glue", "sst2", split="train")
raw_val_sets = load_dataset('glue', 'sst2', split='validation')

In [6]:
def create_label_text(label):
    label_map = {
        0 : 'negative',
        1 : 'positive',
    }

    return {'label_text': label_map[label]}

# Data pro-pocessing
We need to modify sentence's labels to {0 : 'negative', 1 : 'positive'}. Because in prompt-tuning, the vocabulary is easier for model to learn.


In [7]:
# Convert digital labels to text labels
train_sets = raw_train_sets.map(create_label_text, input_columns=['label'])
val_sets = raw_val_sets.map(create_label_text, input_columns=['label'])
train_sets

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'label', 'idx', 'label_text'],
    num_rows: 67349
})

In [8]:
train_sets[0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'label_text': 'negative'}

In [9]:
def convert_into_prompt_template(system_prompt, user_message, train=True, label_text=""):
    # Convert the dataset into the prompt template format as follows:
    # <s>[INST] <<SYS>>
    # {{ system_prompt }}
    # <</SYS>>

    # {{ user_message }} [/INST]
    # Sentiment: {{ label }} </s>      ## label is neccessary for training, but not for inference
    if train:
        text = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\nSentence: {user_message} [/INST]\nSentiment: {label_text} </s>"
    else:
        text = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\nSentence: {user_message} [/INST]\nSentiment: "

    return text

Create the system prompt and user prompt for prompt-tuning.

In [29]:
system_prompt = "You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the user's message into one of the following categories: 'positive' or 'negative'."
user_prompt = "Classify the sentiment of the following sentence into one of the following categories: positive or negative."

In [11]:
def map_dataset(system_prompt, dataset, train=True):
    # Convert the dataset into the format required by the model
    def convert(sentence, label_text):
        converted_inputs = convert_into_prompt_template(system_prompt, sentence, train, label_text)
        return {'text': converted_inputs, 'label_text': label_text}

    return dataset.map(convert, input_columns=['sentence', 'label_text'], batched=False, remove_columns=['sentence', 'label', 'idx', 'label_text'])

In [12]:
processed_train_sets = map_dataset(system_prompt, train_sets, train=True)
processed_val_sets = map_dataset(system_prompt, val_sets, train=False)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [13]:
processed_train_sets[0]

{'label_text': 'negative',
 'text': "<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the user's message into one of the following categories: 'positive' or 'negative'.\n<</SYS>>\n\nSentence: hide new secretions from the parental units  [/INST]\nSentiment: negative </s>"}

In [14]:
processed_val_sets[0]

{'label_text': 'positive',
 'text': "<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the user's message into one of the following categories: 'positive' or 'negative'.\n<</SYS>>\n\nSentence: it 's a charming and often affecting journey .  [/INST]\nSentiment: "}

In [15]:
processed_datasets = DatasetDict({
    'train': processed_train_sets,
    'validation': processed_val_sets
})

In [16]:
processed_datasets

DatasetDict({
    train: Dataset({
        features: ['label_text', 'text'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['label_text', 'text'],
        num_rows: 872
    })
})

In [17]:
train_datasets = processed_datasets["train"]

# Model fine-tuning
Use llama-2 model with 7b parameters. And also use Lora to reduce the GPU memory.

In [23]:
base_model_name = "meta-llama/Llama-2-7b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [24]:
device_map = {"": 0}
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

output_dir = "./results"

In [26]:
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=10,
    max_steps=50
)

max_seq_length = 256

trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_datasets,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
)

In [27]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.9321
20,1.6112
30,0.7113
40,0.5926
50,0.5333


TrainOutput(global_step=50, training_loss=1.2761288642883302, metrics={'train_runtime': 1788.4419, 'train_samples_per_second': 3.579, 'train_steps_per_second': 0.028, 'total_flos': 1.562239314296832e+16, 'train_loss': 1.2761288642883302, 'epoch': 0.1})

# Model evaluation
The accuracy of eval dataset is 93%, better than prompt-tuning of Bloomz-560b model.


In [30]:
def evaluate(dataset):
    label_map = {
        0 : 'negative',
        1 : 'positive',
    }

    compared_result = []

    for i, val in enumerate(tqdm(dataset)):
        label_text = label_map[val['label']]
        sentence = val['sentence']

        # Make input
        text = "[INST] " + system_prompt + "\n<>\n\n" + "Sentence: " + sentence + "[/INST]\n" + "Sentiment: "
        inputs = tokenizer(text, return_tensors="pt").to("cuda")

        # Generate
        outputs = base_model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"], max_new_tokens=80, pad_token_id=tokenizer.eos_token_id)
        outputs_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        matches = re.findall(r"Sentiment: (.+)", outputs_text)
        selected_sentiment = matches[0].strip()
        selected_sentiment = selected_sentiment.split()[0]

        # Compare prediction and label
        if selected_sentiment == label_text:
            compared_result.append(1)
        else:
            compared_result.append(0)

    return compared_result

In [33]:
res_list = evaluate(raw_val_sets.select(range(100)))

  0%|          | 0/100 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
100%|██████████| 100/100 [25:11<00:00, 15.12s/it]


In [34]:
print("Accuracy:", res_list.count(1)/len(res_list))

Accuracy: 0.93


# Example of the emotional classification by llama-2 model
From "Sentiment: positive", we know the classification is correct.


In [49]:
prompt = "<> \n" + system_prompt + "\n<>\n\nSentence: " + raw_val_sets[2]["sentence"]

pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_length=150)
result = pipe(f"[INST] {prompt} [/INST]\nSentiment:")
print(result[0]['generated_text'])



[INST] <> 
You are a helpful, respectful and honest sentiment analysis assistant. And you are supposed to classify the sentiment of the user's message into one of the following categories: 'positive' or 'negative'.
<>

Sentence: allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker .  [/INST]
Sentiment: positive  [/SENT]

</file>
"""


def is_sentence(sentence: str) -> bool:
    """
    :param sentence: 
    :return: 
    """
    return'sentiment'


In [44]:
raw_val_sets[2]

{'sentence': 'allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . ',
 'label': 1,
 'idx': 2}