# Dataset

In [None]:
import pandas as pd
# Load the raw dataset CSV; the path is relative to the notebook's working directory.
file_path = 'empref_df_final_full.csv'
dataset = pd.read_csv(file_path)
# Preview the first rows to confirm the file parsed as expected.
dataset.head()

In [None]:
# (rows, columns) of the raw frame, for comparison with the cleaned frame below.
dataset.shape

In [None]:
# Drop every column that the rest of the notebook does not read, in one pass.
columns_to_drop = ['Unnamed: 0', 'index', 'speaker_name', 'segment_id']
dataset_cleaned = dataset.drop(columns=columns_to_drop)

dataset_cleaned.head()

In [None]:
# The column count should be four lower than the raw frame after the drops above.
dataset_cleaned.shape

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the CSV
# so that re-reading it does not produce another 'Unnamed: 0' column.
save_path = 'empref_df_cleaned.csv'
dataset_cleaned.to_csv(save_path, index=False)

In [None]:
# 1. Number of distinct dialogs
total_dialogs = dataset_cleaned['dialog_id'].nunique()
print(total_dialogs)

# 2. Maximum and average turn count
turn_counts = dataset_cleaned['turn_count']
max_turn_count, avg_turn_count = turn_counts.max(), turn_counts.mean()
print(max_turn_count, avg_turn_count)

# 3. Maximum and average turn number among 'sys' rows
sys_turns = dataset_cleaned.loc[dataset_cleaned['con/res'] == 'sys', 'turn']
max_turn_sys = sys_turns.max()
avg_turn_sys = sys_turns.mean()
print(max_turn_sys, avg_turn_sys)

In [None]:
# Re-read the cleaned CSV written above so the analysis below can start from the saved file.
dataset_cleaned = pd.read_csv('empref_df_cleaned.csv')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Lookup tables from the integer codes stored in the `dialact` and `emotion`
# columns to human-readable names (used with Series.map below).

# Dialogue-act code -> name (8 classes, codes 0-7).
dialact_dict = {
    0: "acknowledging",
    1: "agreeing",
    2: "consoling",
    3: "encouraging",
    4: "questioning",
    5: "sympathizing",
    6: "wishing",
    7: "neutral/suggesting"
}

# Emotion code -> name (9 classes, codes 0-8); several fine-grained emotions
# share one code, hence the slash-separated names.
emotion_dict = {
    0: "admiration/love/pride/gratitude",
    1: "anger/annoyance/disgust/disapproval",
    2: "approval/optimism",
    3: "caring/desire",
    4: "fear/nervousness",
    5: "joy/amusement/excitement/relief",
    6: "sadness/disappointment/embarrassment/grief/remorse",
    7: "surprise/confusion/curiosity/realization",
    8: "neutral"
}

# Attach readable label names; codes missing from the dictionaries become NaN.
dataset_cleaned['dialact_text'] = dataset_cleaned['dialact'].map(dialact_dict)
dataset_cleaned['emotion_text'] = dataset_cleaned['emotion'].map(emotion_dict)

# Derive a three-way role per row without a Python-level row loop:
#   'sys'      -> rows marked as the system response (takes precedence),
#   'listener' -> remaining rows authored by the listener,
#   'speaker'  -> everything else.
is_sys = dataset_cleaned['con/res'] == 'sys'
is_listener = dataset_cleaned['author'] == 'listener'
dataset_cleaned['role'] = (
    is_listener.map({True: 'listener', False: 'speaker'}).mask(is_sys, 'sys')
)

def compute_normalized_counts(df, category):
    """Count rows per (role, category) pair and normalise within each role.

    Args:
        df: Frame containing a 'role' column and the column named by `category`.
        category: Name of the column whose values are counted.

    Returns:
        A frame with one row per observed (role, category) pair and the columns
        'role', <category>, 'count', 'total_count' (rows for that role) and
        'proportion' (count / total_count).
    """
    per_pair = df.groupby(['role', category]).size().reset_index(name='count')
    # Broadcast each role's total back onto its rows instead of merging a lookup table.
    per_pair['total_count'] = per_pair.groupby('role')['count'].transform('sum')
    per_pair['proportion'] = per_pair['count'] / per_pair['total_count']
    return per_pair

def plot_role_distribution(normalized_counts, category_col, label):
    """Draw a horizontal grouped bar chart of per-role proportions.

    Args:
        normalized_counts: Output of `compute_normalized_counts` (one row per
            role/category pair with a 'proportion' column).
        category_col: Column holding the category names (y axis).
        label: Human-readable category name used in the title and y-axis label.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    # The data is already aggregated (one value per bar), so error bars are
    # switched off; `errorbar=None` is the replacement for the deprecated `ci=None`.
    sns.barplot(
        y=category_col,
        x='proportion',
        hue='role',
        data=normalized_counts,
        errorbar=None,
        ax=ax,
    )
    ax.set_title(f'Normalized Distribution of {label} Values by Role')
    ax.set_xlabel('Proportion')
    ax.set_ylabel(label)
    ax.legend(title='Role')
    plt.show()


normalized_dialact = compute_normalized_counts(dataset_cleaned, 'dialact_text')
plot_role_distribution(normalized_dialact, 'dialact_text', 'Dialact')

normalized_emotion = compute_normalized_counts(dataset_cleaned, 'emotion_text')
plot_role_distribution(normalized_emotion, 'emotion_text', 'Emotion')

# Requirements

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    AutoConfig
)
from peft import LoraConfig
from trl import SFTTrainer
import pandas as pd
import torch.nn as nn
from datasets import Dataset, DatasetDict
from transformers.models.llama.modeling_llama import LlamaForCausalLM

# data preprocessing

In [None]:
import pandas as pd
# Start preprocessing from the cleaned CSV saved in the "Dataset" section.
df = pd.read_csv('empref_df_cleaned.csv')

In [None]:
# Bare expression: the notebook renders the frame as the cell output.
df

In [None]:
# Row counts per author value.
author_counts = df['author'].value_counts()

# Row counts per 'con/res' value; 'sys' rows are the target responses used below.
con_res_counts = df['con/res'].value_counts()

# Display both count tables as the cell output.
author_counts, con_res_counts

## base

In [None]:
def preprocess_for_llama2_chat(df):
    """Turn dialog rows into LLaMA-2-chat style training strings.

    One example is produced for every row whose 'con/res' value is 'sys'. The
    prompt is made of all rows that precede it *within the same dialog* (in
    frame order) and the completion is the 'sys' row itself.

    Args:
        df: Frame with the columns 'dialog_id', 'con/res', 'author' and 'text'.
            Rows of a dialog are expected to appear in conversational order.

    Returns:
        list[str]: Strings of the form
        "<s>[INST]{context}[/INST] {response}</s>", ordered by first appearance
        of each dialog and then by row order inside the dialog.
    """

    def tag_turn(author, text):
        # "<Speaker>: ..." / "<Listener>: ..." prefix for a single utterance.
        return f"<{author.capitalize()}>: {text}"

    training_examples = []

    # groupby(sort=False) yields dialogs in first-appearance order (the same
    # order as iterating df['dialog_id'].unique()) without rescanning the whole
    # frame once per dialog.
    for _, dialog_df in df.groupby('dialog_id', sort=False):
        authors = dialog_df['author'].tolist()
        texts = dialog_df['text'].tolist()
        markers = dialog_df['con/res'].tolist()

        for position, marker in enumerate(markers):
            if marker != 'sys':
                continue

            # Context is selected by position, not by index label, so the
            # function no longer depends on a monotonic integer index.
            context = ' '.join(
                tag_turn(author, text)
                for author, text in zip(authors[:position], texts[:position])
            )
            response = tag_turn(authors[position], texts[position])

            # Format according to the LLaMA-2-chat expected input
            training_examples.append(f"<s>[INST]{context}[/INST] {response}</s>")

    return training_examples


# One formatted string per 'sys' row in the dataset.
formatted_training_data = preprocess_for_llama2_chat(df)

In [None]:
# Inspect the first formatted example to check the prompt/response layout.
formatted_training_data[0]

In [None]:
# Number of training examples (equals the number of 'sys' rows).
len(formatted_training_data)

In [None]:
from datasets import Dataset, DatasetDict

# Wrap the formatted strings in a single-column Hugging Face dataset.
data_dict = {
    "text": formatted_training_data  # Ensure this is a list of strings
}
full_dataset = Dataset.from_dict(data_dict)

# First split: 85% train / 15% held out. `.values()` unpacks the returned
# DatasetDict, relying on its 'train', 'test' key order.
train_dataset, temp_dataset = full_dataset.train_test_split(test_size=0.15, seed=42).values()

# Second split of the held-out 15%: 2/3 validation (10% overall), 1/3 test (5% overall).
valid_dataset, test_dataset = temp_dataset.train_test_split(test_size=1/3, seed=42).values()

# NOTE(review): the split is over individual examples, while examples built from
# the same dialog share context turns, so one dialog can contribute to more than
# one split. Consider splitting by dialog_id before formatting.
datasets = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset
})

In [None]:
# Summary (features, row count) of the training split.
datasets['train']

In [None]:
# Summary (features, row count) of the validation split.
datasets['valid']

In [None]:
# Summary (features, row count) of the test split.
datasets['test']

## EmpRef

In [None]:
from transformers import AutoModelForCausalLM
# Checkpoint name and tokenizer used to pre-tokenize the EmpRef examples below
# (no model is created in this cell).
base_model_name = "meta-llama/Llama-2-7b-chat-hf"

llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# The EOS token is reused as the padding token.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
# Pad on the right so real tokens always start at position 0.
llama_tokenizer.padding_side = "right"

In [None]:
# Reload the cleaned CSV so the EmpRef preprocessing starts from the saved file.
df = pd.read_csv('empref_df_cleaned.csv')

def check_and_prepare_data(df):
    """Make sure every value in the 'text' column is a single string.

    Values that are lists (e.g. pre-split tokens) are joined with single
    spaces; every other value is left untouched. The column is updated on the
    frame that was passed in.

    Args:
        df: Frame with a 'text' column. May be empty.

    Returns:
        The same frame object, for call chaining.
    """

    def join_if_list(value):
        return ' '.join(value) if isinstance(value, list) else value

    # Inspect every row instead of only the first one: this is safe on an empty
    # frame and never joins the characters of a value that is already a string.
    if df['text'].map(lambda value: isinstance(value, list)).any():
        df['text'] = df['text'].apply(join_if_list)
    return df

df = check_and_prepare_data(df)
# Missing 'er' / 'in' / 'ex' values are treated as 0, then all three columns
# are stored as integers.
label_columns = ['er', 'in', 'ex']
df = df.fillna(dict.fromkeys(label_columns, 0)).astype(dict.fromkeys(label_columns, int))

# Dialogue-act code -> name (8 classes, codes 0-7). Same table as in the
# "Dataset" section, redefined so this section can run on its own.
dialact_dict = {
    0: "acknowledging",
    1: "agreeing",
    2: "consoling",
    3: "encouraging",
    4: "questioning",
    5: "sympathizing",
    6: "wishing",
    7: "neutral/suggesting"
}
# Emotion code -> name (9 classes, codes 0-8).
emotion_dict = {
    0: "admiration/love/pride/gratitude",
    1: "anger/annoyance/disgust/disapproval",
    2: "approval/optimism",
    3: "caring/desire",
    4: "fear/nervousness",
    5: "joy/amusement/excitement/relief",
    6: "sadness/disappointment/embarrassment/grief/remorse",
    7: "surprise/confusion/curiosity/realization",
    8: "neutral"
}

# Preprocess the data to include all necessary labels
def preprocess_for_custom_model(df, tokenizer):
    """Build the tokenized, label-annotated dataset for the EmpRef model.

    One example is produced for every row whose 'con/res' value is 'sys'. The
    prompt contains all preceding rows of the same dialog, each annotated with
    its emotion and intent name; the response is the 'sys' row annotated with
    emotion, intent and its 'er' / 'in' / 'ex' values.

    Args:
        df: Frame with the columns 'dialog_id', 'con/res', 'author', 'text',
            'emotion', 'dialact', 'er', 'in' and 'ex'. Rows of a dialog are
            expected to appear in conversational order.
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors for a single string.

    Returns:
        datasets.Dataset with the columns 'input_ids', 'attention_mask',
        'text', 'intent_ids', 'emotion_ids', 'er_ids', 'in_ids' and 'ex_ids'.

    Relies on the module-level `emotion_dict` and `dialact_dict` lookup tables.
    """

    def describe_turn(row, with_levels=False):
        # "<Author>: (emotion: ..., intent: ...[, er/in/ex]) text"
        tags = f"emotion: {emotion_dict[row['emotion']]}, intent: {dialact_dict[row['dialact']]}"
        if with_levels:
            tags += f", er: {row['er']}, in: {row['in']}, ex: {row['ex']}"
        return f"<{row['author'].capitalize()}>: ({tags}) {row['text']}"

    columns = {
        "input_ids": [],
        "attention_mask": [],
        "text": [],
        "intent_ids": [],
        "emotion_ids": [],
        "er_ids": [],
        "in_ids": [],
        "ex_ids": [],
    }

    # groupby(sort=False) keeps dialogs in first-appearance order without
    # rescanning the whole frame once per dialog.
    for _, dialog_df in df.groupby('dialog_id', sort=False):
        rows = [row for _, row in dialog_df.iterrows()]

        for position, sys_row in enumerate(rows):
            if sys_row['con/res'] != 'sys':
                continue

            # Context is selected by position inside the dialog, so the result
            # does not depend on the frame having a monotonic integer index.
            context = ' '.join(describe_turn(row) for row in rows[:position])
            response = describe_turn(sys_row, with_levels=True)

            formatted_input = f"<s>[INST]{context}[/INST] {response}</s>"
            tokenized_input = tokenizer(formatted_input, return_tensors='pt', padding=True, truncation=True)

            # squeeze(0) removes the batch dimension added by return_tensors='pt'.
            columns["input_ids"].append(tokenized_input["input_ids"].squeeze(0))
            columns["attention_mask"].append(tokenized_input["attention_mask"].squeeze(0))
            columns["text"].append(formatted_input)
            columns["intent_ids"].append(sys_row['dialact'])
            columns["emotion_ids"].append(sys_row['emotion'])
            columns["er_ids"].append(sys_row['er'])
            columns["in_ids"].append(sys_row['in'])
            columns["ex_ids"].append(sys_row['ex'])

    return Dataset.from_dict(columns)

formatted_training_data = preprocess_for_custom_model(df, llama_tokenizer)

# Splitting the dataset: 85% train, then the held-out 15% is divided into
# 2/3 validation (10% overall) and 1/3 test (5% overall).
# NOTE(review): the split is over individual examples, while examples built from
# the same dialog share context turns, so one dialog can land in several splits.
train_dataset, temp_dataset = formatted_training_data.train_test_split(test_size=0.15, seed=42).values()
valid_dataset, test_dataset = temp_dataset.train_test_split(test_size=1/3, seed=42).values()

datasets = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset
})

# loading base model and tokenizer from HF

## base model loading

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login for this notebook session.
notebook_login()

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

base_model_name = "meta-llama/Llama-2-7b-chat-hf"
# Name of the fine-tuned adapter directory / Hub repository. It is assigned
# here because the "push to hub" cells read it before the "reload" section
# assigns the same value.
refined_model = "llama-2-7b-reflection-finetuned"  # You can give it your own name

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Optional 4-bit quantization (disabled). To enable it, uncomment this config
# and the `quantization_config` argument below.
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_use_double_quant=False
# )

# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    # quantization_config=quant_config,
    device_map="auto"
)
# The key/value cache is only useful for generation, so it is disabled for training.
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [None]:
# Keep a local copy of the base weights and tokenizer files in ./llama_local.
base_model.save_pretrained('./llama_local')
llama_tokenizer.save_pretrained('./llama_local')

# push to hub

In [None]:
from peft import PeftModel
# Reload model in FP16 and merge it with LoRA weights
load_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach the adapter stored under `refined_model`, then fold its weights into
# the base model so the result is a plain causal LM.
model = PeftModel.from_pretrained(load_model, refined_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# EOS is reused for padding, as during training. No '[PAD]' token is added:
# it would enlarge the tokenizer vocabulary beyond the model's embedding matrix.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Upload the merged model and its tokenizer to the Hub repository named by `refined_model`.
model.push_to_hub(refined_model, use_temp_dir=False)
tokenizer.push_to_hub(refined_model, use_temp_dir=False)

# reload

In [None]:
# Directory / repository name of the fine-tuned model to reload.
refined_model = "llama-2-7b-reflection-finetuned"

In [None]:
# Load the fine-tuned weights stored under `refined_model`.
fine_tuned_model = AutoModelForCausalLM.from_pretrained(refined_model)

In [None]:
# Load the tokenizer stored alongside the fine-tuned model.
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(refined_model)

# Training with LoRA

## Base model fine-tuning

In [None]:
# LoRA Config: rank-16 adapters with scaling alpha=32 and 10% dropout; bias
# terms are left untouched.
peft_parameters = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params: evaluate and checkpoint once per epoch, then restore the
# checkpoint with the lowest evaluation loss at the end of training.
train_params = TrainingArguments(
    output_dir="./results_finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # optim="paged_adamw_32bit",
    # save_steps=200,
    # logging_steps=100,
    save_strategy="epoch",
    evaluation_strategy = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model="loss",
    greater_is_better=False,
    learning_rate=5e-4, #2e-4,
    weight_decay= 0.01, #0.001,*
    fp16=True,
    # bf16=False,
    # per_device_eval_batch_size=8,
    # max_grad_norm=0.3,
    # max_steps=-1,
    warmup_ratio=0.01,
    group_by_length=True,
    lr_scheduler_type= "linear", # "constant",
    report_to="tensorboard",
    # group_by_length=True,  # Group by length for efficiency
)

# Trainer
# NOTE(review): `datasets` is assigned twice in this notebook (base strings and
# EmpRef-annotated strings); this cell trains on whichever assignment ran last.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=datasets['train'],
    eval_dataset=datasets['valid'],
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params,)

# Training
fine_tuning.train()

# Save Model (written to the directory named by `refined_model`)
fine_tuning.model.save_pretrained(refined_model)

## EmpRef fine-tuning

In [None]:
class CustomLLaMA(LlamaForCausalLM):
    """LLaMA causal LM with auxiliary per-example classification heads.

    On top of the language-modelling head the model owns:
      * embeddings of the gold intent and emotion ids,
      * an emotion head and an intent head that read the pooled hidden state
        concatenated with both label embeddings,
      * three 2-way heads ('er', 'ex', 'in') that read the pooled hidden state.

    Args:
        config: LLaMA configuration (provides `hidden_size`).
        num_intents: Number of intent classes.
        num_emotions: Number of emotion classes.
        embedding_dim: Size of the intent / emotion label embeddings.
    """

    def __init__(self, config, num_intents, num_emotions, embedding_dim):
        super().__init__(config)
        self.intent_embeddings = nn.Embedding(num_intents, embedding_dim)
        self.emotion_embeddings = nn.Embedding(num_emotions, embedding_dim)
        enhanced_dim = config.hidden_size + 2 * embedding_dim
        self.emotion_head = nn.Linear(enhanced_dim, num_emotions)
        self.intent_head = nn.Linear(enhanced_dim, num_intents)
        # NOTE(review): these heads are 2-way, so 'er' / 'in' / 'ex' targets must
        # be 0 or 1 — confirm against the value range in the dataset.
        self.er_head = nn.Linear(config.hidden_size, 2)
        self.ex_head = nn.Linear(config.hidden_size, 2)
        self.in_head = nn.Linear(config.hidden_size, 2)

        self.criterion_nll = nn.CrossEntropyLoss()
        self.criterion_aux = nn.CrossEntropyLoss()

    @staticmethod
    def _last_token_states(hidden_states, attention_mask):
        """Return the hidden state of the last non-padded token of each sequence.

        Assumes right padding, i.e. the real tokens occupy the first
        `attention_mask.sum()` positions of every row.
        """
        batch_size, seq_len = hidden_states.shape[:2]
        device = hidden_states.device
        if attention_mask is None:
            last_positions = torch.full((batch_size,), seq_len - 1, dtype=torch.long, device=device)
        else:
            # clamp keeps the index valid for an all-padding row.
            last_positions = attention_mask.to(device).long().sum(dim=1).clamp(min=1) - 1
        return hidden_states[torch.arange(batch_size, device=device), last_positions]

    def forward(self, input_ids=None, attention_mask=None, labels=None, intent_ids=None,
                emotion_ids=None, er_ids=None, in_ids=None, ex_ids=None, **kwargs):
        """Run the language model and, when targets are given, the auxiliary heads.

        Args:
            input_ids, attention_mask: Usual causal-LM inputs.
            labels: Token-level targets for the language-modelling loss.
            intent_ids, emotion_ids, er_ids, in_ids, ex_ids: One class id per
                example for the corresponding auxiliary head.
            **kwargs: Forwarded to `LlamaForCausalLM.forward` (wrappers may pass
                arguments such as `inputs_embeds` or `position_ids`).

        Returns:
            dict. When `labels` and all five auxiliary targets are given:
            'loss' (language-modelling loss plus the five auxiliary losses) and
            the five '*_logits' tensors of shape (batch, classes). Otherwise:
            'logits', 'hidden_states' (last layer) and, if `labels` was given,
            'loss' (language-modelling loss only).
        """
        # These two are fixed below; drop caller-supplied values to avoid
        # passing the same keyword twice.
        kwargs.pop("output_hidden_states", None)
        kwargs.pop("return_dict", None)

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True,
            return_dict=True,
            **kwargs,
        )
        hidden_states = outputs.hidden_states[-1]

        aux_targets = (intent_ids, emotion_ids, er_ids, in_ids, ex_ids)
        if labels is None or any(target is None for target in aux_targets):
            result = {"logits": outputs.logits, "hidden_states": hidden_states}
            if outputs.loss is not None:
                result["loss"] = outputs.loss
            return result

        # The auxiliary targets are per example, so the (batch, seq, hidden)
        # states are reduced to one vector per example before classification.
        pooled = self._last_token_states(hidden_states, attention_mask)
        intent_ids, emotion_ids, er_ids, in_ids, ex_ids = (
            target.to(pooled.device).view(-1).long() for target in aux_targets
        )

        intent_embeds = self.intent_embeddings(intent_ids)
        emotion_embeds = self.emotion_embeddings(emotion_ids)
        # NOTE(review): the emotion and intent heads receive embeddings of the
        # very ids they are trained to predict, so those two losses can be
        # minimised without using the text — confirm this is intended.
        enhanced_states = torch.cat((pooled, intent_embeds, emotion_embeds), dim=-1)

        emotion_logits = self.emotion_head(enhanced_states)
        intent_logits = self.intent_head(enhanced_states)
        er_logits = self.er_head(pooled)
        in_logits = self.in_head(pooled)
        ex_logits = self.ex_head(pooled)

        aux_loss = (
            self.criterion_aux(emotion_logits, emotion_ids) +
            self.criterion_aux(intent_logits, intent_ids) +
            self.criterion_aux(er_logits, er_ids) +
            self.criterion_aux(in_logits, in_ids) +
            self.criterion_aux(ex_logits, ex_ids)
        )
        # Keep the generation objective: language-modelling loss plus auxiliary losses.
        total_loss = outputs.loss + aux_loss
        return {
            "loss": total_loss,
            "emotion_logits": emotion_logits,
            "intent_logits": intent_logits,
            "er_logits": er_logits,
            "in_logits": in_logits,
            "ex_logits": ex_logits
        }

In [None]:
# Architecture configuration of the base checkpoint (no weights are loaded here).
config = AutoConfig.from_pretrained(base_model_name)

In [None]:
# Load the pretrained LLaMA weights into the custom architecture. Calling the
# constructor directly would create a randomly initialised network. With an
# explicit `config`, the remaining keyword arguments are handed to
# CustomLLaMA.__init__; the auxiliary heads are newly initialised.
# NOTE(review): `dialact_dict` lists 8 intents (codes 0-7) while num_intents is
# 9 — confirm whether a ninth class is intended.
custom_model = CustomLLaMA.from_pretrained(
    base_model_name,
    config=config,
    num_intents=9,
    num_emotions=9,
    embedding_dim=50,
).to("cuda")

In [None]:
# Trainer
class CustomSFTTrainer(SFTTrainer):
    """SFTTrainer whose loss is the 'loss' entry of the model's output dict.

    The constructor is inherited unchanged: re-assigning `self.model` to the
    object passed by the caller would replace the model that
    `SFTTrainer.__init__` prepared (e.g. wrapped according to `peft_config`).
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Forward the whole batch (labels and auxiliary ids included).

        Args:
            model: The model being trained.
            inputs: Batch dict; every key is passed to the model as a keyword
                argument. The dict itself is left unmodified.
            return_outputs: When True, return `(loss, outputs)`.
            **kwargs: Extra arguments some Trainer versions pass (for example
                `num_items_in_batch`); ignored here.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.
        """
        outputs = model(**inputs)
        loss = outputs["loss"]

        return (loss, outputs) if return_outputs else loss

In [None]:
# Define LoRA Config
# LoRA freezes every parameter of the wrapped model except the adapters. The
# auxiliary heads and label embeddings of CustomLLaMA are newly added layers,
# so they are listed in `modules_to_save` to keep them trainable and to store
# them together with the adapter.
peft_parameters = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    modules_to_save=[
        "intent_embeddings",
        "emotion_embeddings",
        "emotion_head",
        "intent_head",
        "er_head",
        "ex_head",
        "in_head",
    ],
)

# Define training arguments: same schedule as the base fine-tuning run
# (5 epochs, batch size 4, lr 5e-4, linear decay with 1% warm-up, fp16).
# Evaluation and checkpointing happen once per epoch and the checkpoint with
# the lowest evaluation loss is restored at the end.
training_args = TrainingArguments(
    output_dir="./results_empref",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    learning_rate=5e-4,
    weight_decay=0.01,
    fp16=True,
    warmup_ratio=0.01,
    group_by_length=True,
    lr_scheduler_type="linear",
    report_to="tensorboard",
)

# Trainer
# NOTE(review): `datasets` must be the EmpRef split (the one that carries the
# intent / emotion / er / in / ex id columns) when this cell runs.
fine_tuning = CustomSFTTrainer(
    model=custom_model,
    train_dataset=datasets['train'],
    eval_dataset=datasets['valid'],
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=training_args,
)

# Training
fine_tuning.train()

# Save Model (the name is re-assigned here, so it differs from the base run)
refined_model = "llama-2-7b-reflection-empref"
fine_tuning.model.save_pretrained(refined_model)