# Install

In [None]:
pip install --upgrade typing_extensions

In [None]:
pip install --upgrade datasets transformers torch

# Data Prep

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
import torch

In [None]:
# Load Data
# Read the cleaned dialogue table into `df`. Later cells in this notebook read
# the columns dialog_id, con/res, author, text, emotion, dialact, er, in and ex.
df = pd.read_csv('empref_df_cleaned.csv')

def check_and_prepare_data(df):
    """Flatten list-valued ``text`` cells into space-joined strings.

    The first ``text`` value decides whether the column is treated as
    list-valued. An empty frame is returned untouched instead of raising
    ``IndexError`` on ``.iloc[0]``.

    Args:
        df: DataFrame with a ``text`` column. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    if df.empty:
        # Nothing to inspect; `.iloc[0]` would raise on a frame with no rows.
        return df

    if isinstance(df['text'].iloc[0], list):
        # Join only real lists so that a stray non-list cell (e.g. NaN) does
        # not make ' '.join raise part-way through the column.
        df['text'] = df['text'].apply(
            lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else tokens
        )
    return df

# Normalise the text column before any prompt formatting happens.
df = check_and_prepare_data(df)

# Each of the three label columns gets 0 for missing entries and is then
# cast to int, so the values print as plain integers in the prompts.
for label_column in ('er', 'in', 'ex'):
    df[label_column] = df[label_column].fillna(0).astype(int)

# Integer code -> human-readable dialogue-act (intent) label. The position in
# the list is the code, so the order must not change.
dialact_dict = dict(enumerate([
    "acknowledging",
    "agreeing",
    "consoling",
    "encouraging",
    "questioning",
    "sympathizing",
    "wishing",
    "neutral/suggesting",
]))

# Integer code -> human-readable emotion-group label (same convention).
emotion_dict = dict(enumerate([
    "admiration/love/pride/gratitude",
    "anger/annoyance/disgust/disapproval",
    "approval/optimism",
    "caring/desire",
    "fear/nervousness",
    "joy/amusement/excitement/relief",
    "sadness/disappointment/embarrassment/grief/remorse",
    "surprise/confusion/curiosity/realization",
    "neutral",
]))

def preprocess_for_llama2_chat(df):
    """Build label-annotated Llama-2 chat training strings, one per 'sys' turn.

    For every row whose ``con/res`` value is ``'sys'`` the preceding rows of
    the same dialog form the context and the row itself is the response:

        <s>[INST]{context}[/INST] {response}</s>

    Every turn is rendered as ``<Author>: (emotion: ..., intent: ...) text``.
    The response additionally carries the row's ``er``, ``in`` and ``ex``
    values.

    Dialogs are visited in order of first appearance via a single ``groupby``
    rather than re-filtering the whole frame per dialog. The context is taken
    by position inside the dialog, so the result does not depend on ``df``
    having a sorted integer index.

    Args:
        df: DataFrame with columns ``dialog_id``, ``con/res``, ``author``,
            ``text``, ``emotion``, ``dialact``, ``er``, ``in`` and ``ex``.

    Returns:
        list[str]: formatted examples in dialog order, then turn order.

    Raises:
        KeyError: if an ``emotion`` / ``dialact`` code of a used turn is not
            a key of ``emotion_dict`` / ``dialact_dict``.
    """
    def _labels(row):
        # Shared "(emotion: ..., intent: ..." fragment for context and response.
        return (
            f"emotion: {emotion_dict[row['emotion']]}, "
            f"intent: {dialact_dict[row['dialact']]}"
        )

    training_examples = []

    # sort=False keeps dialogs in order of first appearance.
    for _, dialog_df in df.groupby('dialog_id', sort=False):
        turns = [row for _, row in dialog_df.iterrows()]

        for position, sys_row in enumerate(turns):
            if sys_row['con/res'] != 'sys':
                continue

            # Everything before the response inside this dialog is context.
            context = ' '.join(
                f"<{row['author'].capitalize()}>: ({_labels(row)}) {row['text']}"
                for row in turns[:position]
            )

            response = (
                f"<{sys_row['author'].capitalize()}>: ({_labels(sys_row)}, "
                f"er: {sys_row['er']}, in: {sys_row['in']}, ex: {sys_row['ex']}) {sys_row['text']}"
            )

            training_examples.append(f"<s>[INST]{context}[/INST] {response}</s>")

    return training_examples

# Label-annotated examples (emotion / intent / er / in / ex in the prompt text).
formatted_training_data = preprocess_for_llama2_chat(df)

In [None]:
# Wrap the label-annotated examples in a single-column Dataset.
data_dict = {"text": formatted_training_data}
full_dataset = Dataset.from_dict(data_dict)

# First cut: hold out 15% of the examples.
holdout_split = full_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = holdout_split['train']
temp_dataset = holdout_split['test']

# Second cut: one third of the held-out part becomes the test set, the
# remaining two thirds the validation set.
eval_split = temp_dataset.train_test_split(test_size=1/3, seed=42)
valid_dataset = eval_split['train']
test_dataset = eval_split['test']

prepend_datasets = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset,
})

In [None]:
# Show the number of examples in the train / valid / test splits.
len(prepend_datasets['train']), len(prepend_datasets['valid']), len(prepend_datasets['test'])

In [None]:
def preprocess_for_llama2_chat_aug(df):
    """Build plain (label-free) Llama-2 chat training strings, one per 'sys' turn.

    For every row whose ``con/res`` value is ``'sys'`` the preceding rows of
    the same dialog form the context and the row itself is the response:

        <s>[INST]{context}[/INST] {response}</s>

    Every turn is rendered as ``<Author>: text`` with the author capitalised.

    Dialogs are visited in order of first appearance via a single ``groupby``
    rather than re-filtering the whole frame per dialog. The context is taken
    by position inside the dialog, so the result does not depend on ``df``
    having a sorted integer index.

    Args:
        df: DataFrame with columns ``dialog_id``, ``con/res``, ``author``
            and ``text``.

    Returns:
        list[str]: formatted examples in dialog order, then turn order.
    """
    def _render(row):
        # One dialogue turn: "<Author>: text".
        return f"<{row['author'].capitalize()}>: {row['text']}"

    training_examples = []

    # sort=False keeps dialogs in order of first appearance.
    for _, dialog_df in df.groupby('dialog_id', sort=False):
        turns = [row for _, row in dialog_df.iterrows()]

        for position, sys_row in enumerate(turns):
            if sys_row['con/res'] != 'sys':
                continue

            # Everything before the response inside this dialog is context.
            context = ' '.join(_render(row) for row in turns[:position])
            response = _render(sys_row)

            training_examples.append(f"<s>[INST]{context}[/INST] {response}</s>")

    return training_examples

# Plain examples: the same dialogs rendered without any label annotations.
formatted_training_data_aug = preprocess_for_llama2_chat_aug(df)

In [None]:
from datasets import Dataset, DatasetDict

# Build the plain (label-free) dataset. This must use
# `formatted_training_data_aug`; re-using `formatted_training_data` would make
# `datasets` an exact copy of `prepend_datasets`.
data_dict = {
    "text": formatted_training_data_aug
}
full_dataset = Dataset.from_dict(data_dict)

# Same split sizes and seed as for `prepend_datasets`, so index i of
# either test split is drawn from the same position of its source list.
train_dataset, temp_dataset = full_dataset.train_test_split(test_size=0.15, seed=42).values()

# One third of the held-out 15% becomes the test set, the rest validation.
valid_dataset, test_dataset = temp_dataset.train_test_split(test_size=1/3, seed=42).values()

datasets = DatasetDict({
    'train': train_dataset,
    'valid': valid_dataset,
    'test': test_dataset
})

# Functions

In [None]:
# Sanity check: the evaluation cell indexes both test splits with the same
# sample indices, so the two lengths shown here should match.
len(datasets['test']), len(prepend_datasets['test'])

In [None]:
def extract_conversation_context(sample):
    """Return the text between ``[INST]`` and the following ``[/INST]``.

    Args:
        sample: A formatted example such as
            ``"<s>[INST]context[/INST] response</s>"``.

    Returns:
        The context with surrounding whitespace stripped, or ``""`` when
        either marker is missing. This avoids slicing at offsets derived
        from ``find`` returning -1.
    """
    open_tag, close_tag = "[INST]", "[/INST]"

    open_at = sample.find(open_tag)
    if open_at == -1:
        return ""

    start = open_at + len(open_tag)
    # Look for the closing marker only after the opening one.
    end = sample.find(close_tag, start)
    if end == -1:
        return ""

    return sample[start:end].strip()


def extract_target_response(sample):
    """Return the reference response that follows ``[/INST]`` in ``sample``.

    The text is cut at the first ``</s>`` so the end-of-sequence marker does
    not stay glued to the last word of the reference when it is compared
    with generated text. This mirrors what ``extract_response`` does for
    model output.

    Args:
        sample: A formatted example such as
            ``"<s>[INST]context[/INST] response</s>"``.

    Returns:
        The stripped response, or ``""`` when ``[/INST]`` is missing.
    """
    close_tag = "[/INST]"

    close_at = sample.find(close_tag)
    if close_at == -1:
        return ""

    response = sample[close_at + len(close_tag):]

    eos_at = response.find("</s>")
    if eos_at != -1:
        response = response[:eos_at]

    return response.strip()


def generate_response(model, tokenizer, context, max_new_tokens=100):
    """Generate one continuation for ``context`` wrapped in ``[INST]`` tags.

    Args:
        model: Model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        context: Conversation context placed between ``[INST]`` and ``[/INST]``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first returned sequence decoded with special tokens skipped.
        Callers in this notebook pass it through ``extract_response`` to cut
        out the part after ``[/INST]``.
    """
    input_text = f"<s>[INST]{context}[/INST]"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Forward the attention mask together with the ids; passing the ids alone
    # leaves `generate` to guess which positions are padding.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract the response part correctly after [/INST] and truncate after </s>
def extract_response(generated_text):
    """Return the model's reply from a decoded generation.

    Args:
        generated_text: Decoded model output that is expected to contain the
            prompt up to ``[/INST]`` followed by the generated reply.

    Returns:
        The stripped text after the first ``[/INST]``, cut at the first
        ``</s>`` if one is present. Returns ``""`` when ``[/INST]`` does
        not occur.
    """
    close_tag = "[/INST]"

    # Test the raw `find` result: once the tag length has been added the
    # value can never be -1 and the "marker missing" case goes unnoticed.
    close_at = generated_text.find(close_tag)
    if close_at == -1:
        return ""

    response = generated_text[close_at + len(close_tag):].strip()

    eos_at = response.find("</s>")
    if eos_at != -1:
        response = response[:eos_at].strip()
    return response

# Models

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load base model and tokenizer
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load fine-tuned model
fine_tuned_model_name = "llama-2-7b-reflection-finetuned"
fine_tuned_model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_name, device_map="auto")

# Load prepend model
prepend_model_name = "llama-2-7b-reflection-prepend"
prepend_model = AutoModelForCausalLM.from_pretrained(prepend_model_name, device_map="auto")

# Load EmpRef model. It gets its own variables: re-using `prepend_model`
# here would discard the prepend model and leave `empref_model`, which the
# evaluation cell calls, undefined.
empref_model_name = "llama-2-7b-reflection-empref"
empref_model = AutoModelForCausalLM.from_pretrained(empref_model_name, device_map="auto")

# Evaluation

In [None]:
import random
import nltk
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import single_meteor_score

# Download NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')

# Randomly sample up to 60 test examples. The size is clamped because
# random.sample raises ValueError when asked for more items than exist.
random.seed(42)  # For reproducibility
num_eval_samples = min(60, len(datasets['test']))
sample_indices = random.sample(range(len(datasets['test'])), num_eval_samples)

# Generated responses, one list per evaluated model. The keys are the method
# names used by the report and metric cells below.
generated_responses = {
    "base": [],
    "fine_tuned": [],
    "prepend": [],
    "empref": []
}

# Reference responses: plain ones for the base / fine-tuned models and
# label-annotated ones for the prepend / EmpRef models.
target_responses = {
    "base_and_fine_tuned": [],
    "prepend_and_empref": []
}

# Iterate over the sampled examples. The same index is used for both test
# splits, which were produced with identical split sizes and seed.
for idx in sample_indices:
    datasets_test_sample = datasets['test'][idx]['text']
    prepend_datasets_test_sample = prepend_datasets['test'][idx]['text']

    # Extract conversation contexts
    context_base_and_finetuned = extract_conversation_context(datasets_test_sample)
    context_prepend_and_empref = extract_conversation_context(prepend_datasets_test_sample)

    # Extract and store the target responses
    target_responses["base_and_fine_tuned"].append(extract_target_response(datasets_test_sample))
    target_responses["prepend_and_empref"].append(extract_target_response(prepend_datasets_test_sample))

    # Generate one response per model; each result keeps its own variable so
    # no model's output overwrites another's.
    base_response = generate_response(base_model, tokenizer, context_base_and_finetuned)
    fine_tuned_response = generate_response(fine_tuned_model, tokenizer, context_base_and_finetuned)
    prepend_response = generate_response(prepend_model, tokenizer, context_prepend_and_empref)
    empref_response = generate_response(empref_model, tokenizer, context_prepend_and_empref)

    # Keep only the generated part and store it under the matching method key
    generated_responses["base"].append(extract_response(base_response))
    generated_responses["fine_tuned"].append(extract_response(fine_tuned_response))
    generated_responses["prepend"].append(extract_response(prepend_response))
    generated_responses["empref"].append(extract_response(empref_response))

In [None]:
# Define the file path
output_file_path = 'model_responses_comparison.txt'

# Open the file for writing. The encoding is set explicitly so non-ASCII
# dialogue text is written the same way on every platform.
with open(output_file_path, 'w', encoding='utf-8') as file:
    # One report section per sampled example; `idx` is the position in the
    # response lists and `sample_index` the row in the test split.
    for idx, sample_index in enumerate(sample_indices):
        datasets_test_sample = datasets['test'][sample_index]['text']
        context_base_and_finetuned = extract_conversation_context(datasets_test_sample)

        target_response_base_and_finetuned = target_responses["base_and_fine_tuned"][idx]
        target_response_prepend_and_empref = target_responses["prepend_and_empref"][idx]

        base_response_extracted = generated_responses["base"][idx]
        fine_tuned_response_extracted = generated_responses["fine_tuned"][idx]
        prepend_response_extracted = generated_responses["prepend"][idx]
        empref_response_extracted = generated_responses["empref"][idx]

        # Write to file
        file.write(f"Sample {idx + 1}:\n")
        file.write("Conversation Context:\n")
        file.write(context_base_and_finetuned + "\n\n")

        file.write("Base Model Response:\n")
        file.write(base_response_extracted + "\n\n")

        file.write("Fine-tuned Model Response:\n")
        file.write(fine_tuned_response_extracted + "\n\n")

        file.write("Prepend Input Response:\n")
        file.write(prepend_response_extracted + "\n\n")

        file.write("EmpRef Model Response:\n")
        file.write(empref_response_extracted + "\n\n")

        # Every section lists all four models, so both references are
        # written: the plain one first, then the label-annotated one used
        # for the prepend / EmpRef models. There is no per-method choice here.
        file.write("Target Response:\n")
        file.write(target_response_base_and_finetuned + "\n\n")

        file.write("Target Response (Prepend/EmpRef):\n")
        file.write(target_response_prepend_and_empref + "\n\n")

        file.write("="*80 + "\n\n")

print(f"Responses have been written to {output_file_path}")

In [None]:
import random
import nltk
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import single_meteor_score

# Initialize metrics: metric name -> method name -> list of per-sample scores.
metric_names = (
    "bleu_1",
    "bleu_2",
    "bleu_3",
    "bleu_4",
    "rouge_l",
    "meteor",
    "dist_1",
    "dist_2",
)
metrics = {
    metric_name: {method: [] for method in generated_responses}
    for metric_name in metric_names
}

# Shared scorer objects used by the metric helper functions below.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
smoothing_function = SmoothingFunction().method1

# Function to calculate BLEU scores
def calculate_bleu_scores(ref, cand):
    """Compute smoothed sentence-level BLEU-1 to BLEU-4 for one response.

    Both strings are tokenised on whitespace. BLEU-k puts a uniform weight of
    exactly ``1/k`` on the n-gram orders ``1..k`` and zero on the rest, so
    the weights always sum to 1. For BLEU-3 this replaces the rounded 0.33
    values, which summed to 0.99.

    Args:
        ref: Reference response text.
        cand: Candidate (generated) response text.

    Returns:
        tuple: ``(bleu_1, bleu_2, bleu_3, bleu_4)``.
    """
    ref_tokens, cand_tokens = ref.split(), cand.split()

    scores = []
    for order in range(1, 5):
        weights = tuple(1.0 / order if k < order else 0.0 for k in range(4))
        scores.append(
            sentence_bleu(
                [ref_tokens],
                cand_tokens,
                weights=weights,
                smoothing_function=smoothing_function,
            )
        )
    return tuple(scores)

# Function to calculate ROUGE-L score
def calculate_rouge_l_score(ref, cand):
    """Return the ROUGE-L F-measure of ``cand`` against ``ref``.

    Uses the module-level ``scorer``; ``ref`` is passed as the target and
    ``cand`` as the prediction.
    """
    rouge_l = scorer.score(ref, cand)['rougeL']
    return rouge_l.fmeasure

# Function to calculate METEOR score
def calculate_meteor_score(ref, cand):
    """Return the METEOR score of ``cand`` against the single reference ``ref``.

    Both texts are tokenised on whitespace and passed as token lists.
    Current NLTK releases expect pre-tokenised input for
    ``single_meteor_score`` and raise ``TypeError`` when given a plain string.

    Args:
        ref: Reference response text.
        cand: Candidate (generated) response text.

    Returns:
        float: the METEOR score.
    """
    return single_meteor_score(ref.split(), cand.split())

# Function to calculate Distinct-1 and Distinct-2 scores
def calculate_distinct_scores(cand):
    """Return ``(dist_1, dist_2)`` for a single whitespace-tokenised response.

    Each value is the number of unique n-grams divided by the total number of
    n-grams (unigrams for dist_1, adjacent pairs for dist_2). A value is 0
    when the response has no n-grams of that size.
    """
    def _unique_ratio(items):
        if not items:
            return 0
        return len(set(items)) / len(items)

    tokens = cand.split()
    adjacent_pairs = [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]
    return _unique_ratio(tokens), _unique_ratio(adjacent_pairs)

In [None]:
# Evaluate each method
for method, responses in generated_responses.items():
    # The base and fine-tuned models are scored against the plain references,
    # every other method against the label-annotated ones.
    target_key = "base_and_fine_tuned" if method in ("base", "fine_tuned") else "prepend_and_empref"
    target_list = target_responses[target_key]

    for ref, cand in zip(target_list, responses):
        # BLEU-1 .. BLEU-4
        for order, bleu_value in enumerate(calculate_bleu_scores(ref, cand), start=1):
            metrics[f"bleu_{order}"][method].append(bleu_value)

        # ROUGE-L
        rouge_l_value = calculate_rouge_l_score(ref, cand)
        metrics["rouge_l"][method].append(rouge_l_value)

        # METEOR
        meteor_value = calculate_meteor_score(ref, cand)
        metrics["meteor"][method].append(meteor_value)

        # Distinct-1 and Distinct-2 (computed on the candidate only)
        for distinct_key, distinct_value in zip(("dist_1", "dist_2"), calculate_distinct_scores(cand)):
            metrics[distinct_key][method].append(distinct_value)

# Calculate mean and error bars: per metric and method, the mean score and
# the standard error (population std divided by sqrt of the sample count).
final_metrics = {
    metric: {
        method: {
            "mean": np.mean(scores),
            "std_err": np.std(scores) / np.sqrt(len(scores)),
        }
        for method, scores in values.items()
    }
    for metric, values in metrics.items()
}

In [None]:
# Print results: one section per method, one "label: mean ± std_err" line per
# metric, in a fixed order.
report_rows = (
    ("BLEU-1", "bleu_1"),
    ("BLEU-2", "bleu_2"),
    ("BLEU-3", "bleu_3"),
    ("BLEU-4", "bleu_4"),
    ("ROUGE-L", "rouge_l"),
    ("METEOR", "meteor"),
    ("Distinct-1", "dist_1"),
    ("Distinct-2", "dist_2"),
)
for method in generated_responses:
    print(f"Method: {method}")
    for label, metric_key in report_rows:
        stats = final_metrics[metric_key][method]
        print(f"{label}: {stats['mean']} ± {stats['std_err']}")
    print("="*80)