# Install

In [2]:
pip install accelerate peft bitsandbytes transformers trl

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: flatbuffers 1.12.1-git20200711.33e2d80-dfsg1-0.6 has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of flatbuffers or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Import

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer
from transformers import BitsAndBytesConfig
import torch

2024-06-07 08:05:38.142461: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Dataset

In [2]:
df = pd.read_csv('empref_df_cleaned.csv')

def check_and_prepare_data(df):
    if isinstance(df['text'].iloc[0], list):
        df['text'] = df['text'].apply(lambda x: ' '.join(x))
    return df

df = check_and_prepare_data(df)

df['er'] = df['er'].fillna(0).astype(int)
df['in'] = df['in'].fillna(0).astype(int)
df['ex'] = df['ex'].fillna(0).astype(int)

dialact_dict = {
    0: "acknowledging",
    1: "agreeing",
    2: "consoling",
    3: "encouraging",
    4: "questioning",
    5: "sympathizing",
    6: "wishing",
    7: "neutral/suggesting"
}
emotion_dict = {
    0: "admiration/love/pride/gratitude",
    1: "anger/annoyance/disgust/disapproval",
    2: "approval/optimism",
    3: "caring/desire",
    4: "fear/nervousness",
    5: "joy/amusement/excitement/relief",
    6: "sadness/disappointment/embarrassment/grief/remorse",
    7: "surprise/confusion/curiosity/realization",
    8: "neutral"
}

def preprocess_for_llama2_chat(df):
    training_examples = []

    for dialog_id in df['dialog_id'].unique():
        dialog_df = df[df['dialog_id'] == dialog_id]

        sys_rows = dialog_df[dialog_df['con/res'] == 'sys']

        for index, sys_row in sys_rows.iterrows():
            context_df = dialog_df.loc[:index-1]
            
            context = ' '.join(
                f"<{row['author'].capitalize()}>: (emotion: {emotion_dict[row['emotion']]}, intent: {dialact_dict[row['dialact']]}) {row['text']}"
                for _, row in context_df.iterrows()
            )
            
            response = (
                f"<{sys_row['author'].capitalize()}>: (emotion: {emotion_dict[sys_row['emotion']]}, intent: {dialact_dict[sys_row['dialact']]}, "
                f"er: {sys_row['er']}, in: {sys_row['in']}, ex: {sys_row['ex']}) {sys_row['text']}"
            )
            
            formatted_input = f"<s>[INST]{context}[/INST] {response}</s>"
            training_examples.append(formatted_input)

    return training_examples

formatted_training_data = preprocess_for_llama2_chat(df)

In [3]:
formatted_training_data[:3]

["<s>[INST]<Speaker>: (emotion: approval/optimism, intent: questioning) I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone? <Listener>: (emotion: admiration/love/pride/gratitude, intent: sympathizing) Hello, and thank you for your question and seeking advice on this. <Listener>: (emotion: sadness/disappointment/embarrassment/grief/remorse, intent: agreeing) Feelings of worthlessness is unfortunately common. <Listener>: (emotion: neutral, intent: agreeing) In fact, most people, if not all, have felt this to some degree at some point in their life. <Listener>: (emotion: neutral, intent: agreeing) You are not alone. <Listener>: (emotion: neutral, intent: neutral/suggesting) Changing our feelings is like changing ou

In [4]:
data_dict = {"text": formatted_training_data}
full_dataset = Dataset.from_dict(data_dict)

train_dataset, temp_dataset = full_dataset.train_test_split(test_size=0.15, seed=42).values()
valid_dataset, test_dataset = temp_dataset.train_test_split(test_size=1/3, seed=42).values()

datasets = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset
})

# Model

## login

In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## loading

In [5]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

base_model_name = "meta-llama/Llama-2-7b-chat-hf"
refined_model = "llama-2-7b-reflection-prepend" #You can give it your own name

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# # Quantization Config
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_use_double_quant=False
# )

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    # quantization_config=quant_config,
    device_map="auto"
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
train_params = TrainingArguments(
    output_dir="./results_prepend",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # optim="paged_adamw_32bit",
    # save_steps=200,
    # logging_steps=100,
    save_strategy="epoch",
    evaluation_strategy = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model="loss",
    greater_is_better=False,
    learning_rate=5e-4, #2e-4,
    weight_decay= 0.01, #0.001,*
    fp16=True,
    # bf16=False,
    # per_device_eval_batch_size=8,
    # max_grad_norm=0.3,
    # max_steps=-1,
    warmup_ratio=0.01,
    group_by_length=True,
    lr_scheduler_type= "linear", # "constant",
    report_to="tensorboard",
    # group_by_length=True,  # Group by length for efficiency
)

# Trainer
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=datasets['train'],
    eval_dataset=datasets['valid'],
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params,)

fine_tuning.train()

fine_tuning.model.save_pretrained(refined_model)



Map:   0%|          | 0/1572 [00:00<?, ? examples/s]

Map:   0%|          | 0/185 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.747095
2,0.993700,0.613119
3,0.672100,0.50777
4,0.410300,0.464325
5,0.410300,0.447785
