In [None]:
!git config --global user.name "nishanth-6"

In [None]:
!git clone https://github.com/Sschittala/421_Project_UIC

Cloning into '421_Project_UIC'...
remote: Enumerating objects: 168, done.[K
remote: Counting objects: 100% (168/168), done.[K
remote: Compressing objects: 100% (138/138), done.[K
remote: Total 168 (delta 66), reused 102 (delta 24), pack-reused 0 (from 0)[K
Receiving objects: 100% (168/168), 26.40 MiB | 16.98 MiB/s, done.
Resolving deltas: 100% (66/66), done.
Updating files: 100% (44/44), done.


In [None]:
!pip install sentence-transformers rouge-score sacrebleu bert-score scikit-learn tqdm

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

In [None]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
from bert_score import score as bert_score
import warnings
import os
from tqdm import tqdm

# Silence all library warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Directory that later cells write metrics and generations into.
output_dir = '/content/421_Project_UIC/Part_2_Project/outputs'

print("Libraries imported and paths set")

# Report whether CUDA is usable and, if so, the name of device 0.
gpu_ready = torch.cuda.is_available()
print("GPU Available:", gpu_ready)
if gpu_ready:
    print("GPU Device:", torch.cuda.get_device_name(0))

Libraries imported and paths set
GPU Available: True
GPU Device: Tesla T4


In [None]:
pwd


'/content'

In [None]:
def load_data(train_path, dev_path, test_path):
    """Read the train, dev and test CSV files and report their sizes.

    Every file is parsed with pandas' Python engine; lines that cannot be
    parsed are skipped instead of raising.

    Args:
        train_path: Path of the training CSV.
        dev_path: Path of the development CSV.
        test_path: Path of the test CSV.

    Returns:
        Tuple ``(train_df, dev_df, test_df)`` of DataFrames.
    """
    print("Loading datasets...")
    # Files are read in the order train, dev, test.
    train_df, dev_df, test_df = (
        pd.read_csv(csv_path, on_bad_lines='skip', engine='python')
        for csv_path in (train_path, dev_path, test_path)
    )

    print(
        f"Train set: {len(train_df)} samples",
        f"Dev set: {len(dev_df)} samples",
        f"Test set: {len(test_df)} samples",
        f"Train columns: {train_df.columns.tolist()}",
        sep="\n",
    )

    return train_df, dev_df, test_df

# Locations of the three splits inside the cloned repository.
TRAIN_PATH, DEV_PATH, TEST_PATH = (
    '/content/421_Project_UIC/P1_DATA/trac2_CONVT_train.csv',
    '/content/421_Project_UIC/P1_DATA/trac2_CONVT_dev.csv',
    '/content/421_Project_UIC/P1_DATA/trac2_CONVT_test.csv',
)

# Load all splits and preview the first training rows.
train_df, dev_df, test_df = load_data(TRAIN_PATH, DEV_PATH, TEST_PATH)
print("\nFirst few rows of training data:", train_df.head(), sep="\n")

Loading datasets...
Train set: 11090 samples
Dev set: 965 samples
Test set: 2294 samples
Train columns: ['id', 'article_id', 'conversation_id', 'turn_id', 'speaker', 'text', 'person_id_1', 'person_id_2', 'Emotion', 'EmotionalPolarity', 'Empathy', 'SelfDisclosure']

First few rows of training data:
   id  article_id  conversation_id  turn_id   speaker  \
0   0          35                1        0  Person 1   
1   1          35                1        1  Person 2   
2   2          35                1        2  Person 1   
3   3          35                1        3  Person 2   
4   4          35                1        4  Person 1   

                                                text person_id_1 person_id_2  \
0              what did you think about this article        p019        p012   
1  It's definitely really sad to read, considerin...        p019        p012   
2  I think it's super sad... they seem to never c...        p019        p012   
3  I can't imagine just living in an a

In [None]:
# Show the column names of each split side by side (the schemas differ).
for split_label, split_frame in (
    ("Train columns:", train_df),
    ("Dev columns:", dev_df),
    ("Test columns:", test_df),
):
    print(split_label, split_frame.columns.tolist())

Train columns: ['id', 'article_id', 'conversation_id', 'turn_id', 'speaker', 'text', 'person_id_1', 'person_id_2', 'Emotion', 'EmotionalPolarity', 'Empathy', 'SelfDisclosure']
Dev columns: ['id', 'article_id', 'conversation_id', 'turn_id', 'speaker_id', 'text', 'person_id', 'person_id_1', 'person_id_2', 'Emotion', 'EmotionalPolarity', 'Empathy', 'SelfDisclosure']
Test columns: ['id', 'article_id', 'conversation_id', 'turn_id', 'speaker_id', 'text', 'person_id', 'person_id_1', 'person_id_2']


In [None]:
def normalize_scores(df):
    """
    Min-max scale the 'Emotion' and 'Empathy' columns into [0, 1].

    The input frame is copied, never modified. For each of the two columns
    that is present, an '<name>_normalized' column is added, scaled with the
    minimum and maximum of that same frame. Frames without the columns (the
    test split) are returned as an unchanged copy.

    A column whose values are all identical has a zero range; it is mapped to
    0.0 instead of dividing by zero, which would fill the column with NaN.

    Args:
        df: DataFrame that may contain 'Emotion' and/or 'Empathy' columns.

    Returns:
        A copy of ``df`` with the normalized columns appended.
    """
    df_processed = df.copy()

    # Only normalize if columns exist (train/dev have them, test doesn't)
    for column in ('Emotion', 'Empathy'):
        if column not in df_processed.columns:
            continue

        values = df_processed[column]
        low = values.min()
        span = values.max() - low

        if span == 0:
            # Constant column: (x - min) is already 0 for every non-missing
            # value, so skip the 0/0 division that would yield NaN.
            scaled = (values - low).astype(float)
        else:
            scaled = (values - low) / span

        df_processed[f'{column}_normalized'] = scaled

    print(f"Normalized - Columns available: {df_processed.columns.tolist()}")

    return df_processed

# Normalize every split; each call scales with the range of the frame it receives.
train_processed, dev_processed, test_processed = (
    normalize_scores(split_frame) for split_frame in (train_df, dev_df, test_df)
)

print("\n✓ All datasets processed")

Normalized - Columns available: ['id', 'article_id', 'conversation_id', 'turn_id', 'speaker', 'text', 'person_id_1', 'person_id_2', 'Emotion', 'EmotionalPolarity', 'Empathy', 'SelfDisclosure', 'Emotion_normalized', 'Empathy_normalized']
Normalized - Columns available: ['id', 'article_id', 'conversation_id', 'turn_id', 'speaker_id', 'text', 'person_id', 'person_id_1', 'person_id_2', 'Emotion', 'EmotionalPolarity', 'Empathy', 'SelfDisclosure', 'Emotion_normalized', 'Empathy_normalized']
Normalized - Columns available: ['id', 'article_id', 'conversation_id', 'turn_id', 'speaker_id', 'text', 'person_id', 'person_id_1', 'person_id_2']

✓ All datasets processed


In [None]:
def generate_embeddings(texts, model_name='all-MiniLM-L6-v2', batch_size=64):
    """
    Encode utterances with a SentenceTransformer model.

    Args:
        texts: Iterable of utterances (pandas Series, list, array, ...).
            Missing values (None/NaN) are encoded as the empty string and
            every other entry is converted with ``str`` so the encoder only
            ever receives text.
        model_name: Name of the SentenceTransformer checkpoint to load.
        batch_size: Number of utterances encoded per batch.

    Returns:
        Tuple ``(embeddings, model)``: the NumPy array returned by
        ``model.encode`` and the loaded model, so callers can reuse it to
        encode queries.
    """
    # Materialise once: works for any iterable and guarantees str inputs.
    sentences = ["" if pd.isna(item) else str(item) for item in texts]

    print(f"\nGenerating embeddings for {len(sentences)} utterances...")
    print(f"Using model: {model_name}")

    model = SentenceTransformer(model_name)
    # Use the GPU when one is available, otherwise stay on the CPU.
    model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

    embeddings = model.encode(
        sentences,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    print(f"Embeddings shape: {embeddings.shape}")
    return embeddings, model

# Generate embeddings for training corpus
# The returned `model` is kept because later cells pass it to
# generate_conversation, which uses it to encode the conversation history.
print("This may take a few minutes on first run...")
train_embeddings, model = generate_embeddings(train_processed['text'])

This may take a few minutes on first run...

Generating embeddings for 11090 utterances...
Using model: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/174 [00:00<?, ?it/s]

Embeddings shape: (11090, 384)


In [None]:
# Extract and prepare corpus data
train_emotions = train_processed['Emotion_normalized'].values
train_empathies = train_processed['Empathy_normalized'].values
train_polarities = train_processed['EmotionalPolarity'].values
train_texts = train_processed['text']

# Get ALL unique polarity values from train + dev (+ test when it has the column).
# The test column is appended only if present, instead of concatenating a
# dtype-less empty Series placeholder.
polarity_columns = [
    train_processed['EmotionalPolarity'],
    dev_processed['EmotionalPolarity'],
]
if 'EmotionalPolarity' in test_processed.columns:
    polarity_columns.append(test_processed['EmotionalPolarity'])

# Drop missing values before sorting: NaN has no defined sort position and
# could never be matched by polarity_to_onehot anyway.
all_polarities = pd.concat(polarity_columns).dropna().unique()
unique_polarities = sorted(all_polarities)
print(f"All unique polarity values: {unique_polarities}")

# Create mapping from polarity value to one-hot vector
def polarity_to_onehot(polarity_value, unique_vals):
    """Convert a single polarity value to a one-hot vector.

    Args:
        polarity_value: Polarity to encode; it is converted with ``float``
            before being looked up.
        unique_vals: Ordered sequence of the known polarity values. The
            position of a value in this sequence is its one-hot index.

    Returns:
        1-D NumPy array of length ``len(unique_vals)`` with a single 1.
        If the value cannot be converted to a float or is not one of
        ``unique_vals`` (this includes NaN), index 0 is set as the default.

    Only conversion and lookup failures trigger the default; any other
    exception propagates to the caller.
    """
    onehot = np.zeros(len(unique_vals))
    try:
        idx = list(unique_vals).index(float(polarity_value))
    except (TypeError, ValueError, OverflowError):
        # Not convertible or not a known polarity: use first polarity as default
        idx = 0
    onehot[idx] = 1
    return onehot

# Encode the polarity of every training utterance once, up front.
train_polarities_onehot = np.array(
    list(map(lambda value: polarity_to_onehot(value, unique_polarities), train_polarities))
)

# Summary of the retrieval corpus.
print(
    f"Corpus size: {len(train_embeddings)}",
    f"Embedding dimension: {train_embeddings.shape[1]}",
    f"One-hot polarity shape: {train_polarities_onehot.shape}",
    f"Sample one-hot polarities:\n{train_polarities_onehot[:5]}",
    sep="\n",
)

All unique polarity values: [np.float64(0.0), np.float64(0.5), np.float64(1.0), np.float64(1.5), np.float64(2.0), np.float64(3.0)]
Corpus size: 11090
Embedding dimension: 384
One-hot polarity shape: (11090, 6)
Sample one-hot polarities:
[[0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]]


In [None]:
def calculate_similarity(
    query_embedding,
    corpus_embeddings,
    query_emotion,
    corpus_emotions,
    query_empathy,
    corpus_empathies,
    query_polarity_onehot,
    corpus_polarities_onehot,
    w1=0.4,
    w2=0.2,
    w3=0.2,
    w4=0.2
):
    """
    Calculate combined similarity score using four components:
    - Text similarity (cosine) - equation (1)
    - Emotion intensity similarity - equation (2)
    - Empathy similarity - equation (3)
    - Polarity match - equation (4)

    Combined using equation (5) with weights w1, w2, w3, w4

    Args:
        query_embedding: Embedding of the query; reshaped to a single row.
        corpus_embeddings: 2-D array with one embedding per corpus entry.
        query_emotion: Emotion score of the query.
        corpus_emotions: Array of emotion scores, one per corpus entry.
        query_empathy: Empathy score of the query.
        corpus_empathies: Array of empathy scores, one per corpus entry.
        query_polarity_onehot: One-hot polarity vector of the query.
        corpus_polarities_onehot: 2-D array of one-hot polarity vectors,
            one row per corpus entry.
        w1, w2, w3, w4: Weights of the text, emotion, empathy and polarity
            components respectively.

    Returns:
        Tuple ``(s_total, s_text, s_emotion, s_empathy, s_polarity)`` of
        per-corpus-entry score arrays.
    """

    # (1) Text similarity: cos(e1, e2)
    query_emb_reshaped = np.asarray(query_embedding).reshape(1, -1)
    s_text = cosine_similarity(query_emb_reshaped, corpus_embeddings)[0]

    # (2) Emotion similarity: 1 - |EI1 - EI2|
    s_emotion = 1 - np.abs(query_emotion - corpus_emotions)

    # (3) Empathy similarity: 1 - |Emp1 - Emp2|
    s_empathy = 1 - np.abs(query_empathy - corpus_empathies)

    # (4) Polarity similarity: 1 if P1 = P2, else 0
    # A single matrix-vector product yields the dot product of the query
    # one-hot vector with every corpus row (1 on a match, 0 otherwise),
    # replacing a Python-level loop over the whole corpus.
    s_polarity = np.asarray(corpus_polarities_onehot) @ np.asarray(query_polarity_onehot)

    # (5) Total weighted similarity
    s_total = (w1 * s_text +
               w2 * s_emotion +
               w3 * s_empathy +
               w4 * s_polarity)

    return s_total, s_text, s_emotion, s_empathy, s_polarity

print("Similarity calculation function defined")

Similarity calculation function defined


In [None]:
def retrieve_best_response(
    query_embedding,
    query_emotion,
    query_empathy,
    query_polarity,
    corpus_embeddings,
    corpus_texts,
    corpus_emotions,
    corpus_empathies,
    corpus_polarities_onehot,
    unique_polarities,
    weights=(0.4, 0.2, 0.2, 0.2)
):
    """
    Return the corpus utterance whose combined similarity to the query is highest.

    The query polarity is one-hot encoded against ``unique_polarities`` and
    the combined score from ``calculate_similarity`` is computed for every
    corpus entry using ``weights`` as (text, emotion, empathy, polarity).

    Returns:
        Tuple ``(best_response, best_score, best_idx)``: the text taken from
        ``corpus_texts`` by position, its combined score, and that position.
    """
    encoded_polarity = polarity_to_onehot(query_polarity, unique_polarities)

    # Only the combined score (first element) is needed for ranking.
    combined_scores = calculate_similarity(
        query_embedding,
        corpus_embeddings,
        query_emotion,
        corpus_emotions,
        query_empathy,
        corpus_empathies,
        encoded_polarity,
        corpus_polarities_onehot,
        w1=weights[0],
        w2=weights[1],
        w3=weights[2],
        w4=weights[3]
    )[0]

    top_position = np.argmax(combined_scores)
    return corpus_texts.iloc[top_position], combined_scores[top_position], top_position

print("Retrieval function defined")

Retrieval function defined


In [None]:
def generate_conversation(
    conversation_data,
    corpus_embeddings,
    corpus_texts,
    corpus_emotions,
    corpus_empathies,
    corpus_polarities_onehot,
    model,
    unique_polarities,
    num_turns_to_generate=5,
    start_turn=6,
    weights=(0.4, 0.2, 0.2, 0.2)
):
    """
    Generate next num_turns_to_generate utterances starting from start_turn
    for a given conversation

    The rows at positions ``0 .. start_turn-1`` of ``conversation_data`` seed
    the history. For every generated turn the whole history is joined with
    spaces, encoded with ``model.encode`` and used to retrieve the best
    corpus utterance, which is then appended to the history.

    Target scores for a turn are read from the row at that position when the
    frame has an 'Emotion_normalized' column and the row exists. Otherwise,
    or when an individual score is missing (NaN), defaults are used: 0.5 for
    emotion and empathy and ``unique_polarities[0]`` for polarity. Without
    this guard a NaN score would make every combined similarity NaN and the
    retrieval would silently return the first corpus entry.

    Returns:
        List of dicts with keys 'turn', 'generated_text', 'similarity_score',
        one per generated turn.
    """
    generated_responses = []

    # Get initial history (turns 0 to start_turn-1)
    history_length = min(start_turn, len(conversation_data))
    conversation_history = [
        conversation_data.iloc[idx]['text'] for idx in range(history_length)
    ]

    # Check once whether normalized columns exist (dev has them, test doesn't)
    has_target_scores = 'Emotion_normalized' in conversation_data.columns

    # Generate next turns
    for turn_num in range(start_turn, start_turn + num_turns_to_generate):
        # Concatenate history as query
        history_text = " ".join(conversation_history)

        # Generate embedding for query
        query_embedding = model.encode(history_text, convert_to_numpy=True)

        # Get desired scores for this turn
        if has_target_scores and turn_num < len(conversation_data):
            target_row = conversation_data.iloc[turn_num]
            query_emotion = target_row['Emotion_normalized']
            query_empathy = target_row['Empathy_normalized']
            query_polarity = target_row['EmotionalPolarity']

            # Replace missing scores with the same defaults used when no
            # ground truth is available at all.
            if pd.isna(query_emotion):
                query_emotion = 0.5
            if pd.isna(query_empathy):
                query_empathy = 0.5
            if pd.isna(query_polarity):
                query_polarity = unique_polarities[0]
        else:
            # Use default values for test set (which has no ground truth scores)
            query_emotion = 0.5
            query_empathy = 0.5
            query_polarity = unique_polarities[0]

        # Retrieve best response
        best_response, score, _ = retrieve_best_response(
            query_embedding,
            query_emotion,
            query_empathy,
            query_polarity,
            corpus_embeddings,
            corpus_texts,
            corpus_emotions,
            corpus_empathies,
            corpus_polarities_onehot,
            unique_polarities,
            weights
        )

        generated_responses.append({
            'turn': turn_num,
            'generated_text': best_response,
            'similarity_score': score
        })

        # Add to history for next turn
        conversation_history.append(best_response)

    return generated_responses

print("Conversation generation function defined (updated)")

Conversation generation function defined (updated)


In [None]:
class MetricsCalculator:
    """Sentence-level ROUGE, BLEU and BERTScore between a reference and a hypothesis.

    Every metric returns zeros when either text is not a non-blank string
    (e.g. NaN read from a CSV). Any other failure is raised instead of being
    converted into a zero score, so a broken metric cannot masquerade as a
    legitimate 0.0 result.
    """

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        # effective_order=True is sacreBLEU's recommended setting for
        # sentence-level scoring: n-gram orders with no matches are not
        # counted, so short sentences do not collapse to zero.
        self.bleu = BLEU(effective_order=True)
        # Created on first use; loading the BERTScore model is expensive.
        self._bert_scorer = None

    @staticmethod
    def _usable(reference, hypothesis):
        """Return True when both texts are non-blank strings."""
        return all(isinstance(text, str) and text.strip() for text in (reference, hypothesis))

    def calculate_rouge(self, reference, hypothesis):
        """Calculate ROUGE-1, ROUGE-2, ROUGE-L

        Returns:
            Dict with keys 'rouge1', 'rouge2', 'rougeL' holding F-measures.
        """
        if not self._usable(reference, hypothesis):
            return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

        scores = self.rouge_scorer.score(reference, hypothesis)
        return {
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure
        }

    def calculate_bleu(self, reference, hypothesis):
        """Calculate BLEU score

        Uses the sacreBLEU ``BLEU`` metric that the notebook imports.
        sacreBLEU reports scores on a 0-100 scale; the value is divided by
        100 so it is on the same 0-1 scale as the other metrics.

        Returns:
            Sentence-level BLEU as a float in [0, 1].
        """
        if not self._usable(reference, hypothesis):
            return 0.0

        result = self.bleu.sentence_score(hypothesis, [reference])
        return float(result.score) / 100.0

    def calculate_bertscore(self, reference, hypothesis):
        """Calculate BERTScore

        The scorer is built once and cached on the instance, so the
        underlying model is loaded a single time rather than on every call.

        Returns:
            Dict with keys 'precision', 'recall', 'f1'.
        """
        if not self._usable(reference, hypothesis):
            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

        if self._bert_scorer is None:
            # Imported here because the notebook's import cell only exposes
            # the functional `score` entry point of bert_score.
            from bert_score import BERTScorer
            self._bert_scorer = BERTScorer(lang="en")

        P, R, F1 = self._bert_scorer.score([hypothesis], [reference], verbose=False)
        return {'precision': P.item(), 'recall': R.item(), 'f1': F1.item()}

# Single shared calculator instance; the dev-set evaluation cell calls its
# calculate_rouge / calculate_bleu / calculate_bertscore methods.
metrics_calc = MetricsCalculator()
print("Metrics calculator defined")

Metrics calculator defined


In [None]:
print("Generating conversations on dev set...")

# One group of rows per dev conversation.
dev_conversations_grouped = dev_processed.groupby('conversation_id')

all_generations = []
weights = (0.4, 0.2, 0.2, 0.2)  # TUNE THESE WEIGHTS

# Arguments that are identical for every conversation.
dev_generation_kwargs = dict(
    corpus_embeddings=train_embeddings,
    corpus_texts=train_texts,
    corpus_emotions=train_emotions,
    corpus_empathies=train_empathies,
    corpus_polarities_onehot=train_polarities_onehot,
    model=model,
    unique_polarities=unique_polarities,
    num_turns_to_generate=5,
    start_turn=6,
    weights=weights,
)

count = 0
for conv_id, conv_data in tqdm(dev_conversations_grouped, desc="Processing conversations"):
    # Order the turns and renumber rows so positions start at 0.
    conv_data_sorted = conv_data.sort_values('turn_id').reset_index(drop=True)

    # Retrieve responses for turns 6-10 (5 turns).
    generated = generate_conversation(conv_data_sorted, **dev_generation_kwargs)

    # Record every generated turn together with its conversation id.
    all_generations.extend(
        {
            'conversation_id': conv_id,
            'turn': item['turn'],
            'generated_text': item['generated_text'],
            'similarity_score': item['similarity_score'],
        }
        for item in generated
    )

    count += 1

print(f"\nTotal generations: {len(all_generations)}")
print("Sample generations:")
gen_df_sample = pd.DataFrame(all_generations[:10])
print(gen_df_sample[['conversation_id', 'turn', 'generated_text']])

Generating conversations on dev set...


Processing conversations: 100%|██████████| 33/33 [00:09<00:00,  3.49it/s]


Total generations: 165
Sample generations:
   conversation_id  turn                                     generated_text
0               68     6  I would do anything to protect my family, but ...
1               68     7  Going back to what you said earlier. I feel ba...
2               68     8  I would do anything to protect my family, but ...
3               68     9  I would do anything to protect my family, but ...
4               68    10  I would do anything to protect my family, but ...
5               72     6  No, the article I read only mentioned one 70 y...
6               72     7  No, the article I read only mentioned one 70 y...
7               72     8  No, the article I read only mentioned one 70 y...
8               72     9  http://worldpopulationreview.com/countries/mas...
9               72    10  It is very much. I am afraid we're going to ke...





In [None]:
print("Evaluating on dev set...")

metrics_results = []

for record in tqdm(all_generations, total=len(all_generations)):
    conv_id = record['conversation_id']
    turn = record['turn']
    generated = record['generated_text']

    # Gold utterance: the dev row with the same conversation and turn id.
    is_same_turn = (dev_processed['conversation_id'] == conv_id) & (dev_processed['turn_id'] == turn)
    ref_row = dev_processed[is_same_turn]

    # Generated turns without a gold counterpart are not scored.
    if len(ref_row) == 0:
        continue

    reference = ref_row.iloc[0]['text']

    # Score the generated text against the reference.
    rouge = metrics_calc.calculate_rouge(reference, generated)
    bleu = metrics_calc.calculate_bleu(reference, generated)
    bertscore = metrics_calc.calculate_bertscore(reference, generated)

    metrics_results.append({
        'conversation_id': conv_id,
        'turn': turn,
        **{key: rouge[key] for key in ('rouge1', 'rouge2', 'rougeL')},
        'bleu': bleu,
        'bertscore_f1': bertscore['f1'],
    })

metrics_df = pd.DataFrame(metrics_results)
print("\nDev Set Metrics Summary:")
print(metrics_df[['rouge1', 'rouge2', 'rougeL', 'bleu', 'bertscore_f1']].describe())

Evaluating on dev set...


  0%|          | 0/165 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 1/165 [00:02<06:47,  2.49s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 2/165 [00:04<06:44,  2.48s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 3/165 [00:07<07:15,  


Dev Set Metrics Summary:
           rouge1      rouge2      rougeL   bleu  bertscore_f1
count  161.000000  161.000000  161.000000  161.0    161.000000
mean     0.128082    0.015709    0.099583    0.0      0.848391
std      0.099707    0.047528    0.081549    0.0      0.018666
min      0.000000    0.000000    0.000000    0.0      0.787301
25%      0.052632    0.000000    0.046512    0.0      0.836609
50%      0.117647    0.000000    0.088889    0.0      0.848577
75%      0.190476    0.000000    0.142857    0.0      0.859727
max      0.526316    0.470588    0.526316    0.0      0.916032





In [None]:
print("Generating test predictions...")

test_conversations_grouped = test_processed.groupby('conversation_id')
test_generations = []

# Arguments that are identical for every conversation.
test_generation_kwargs = dict(
    corpus_embeddings=train_embeddings,
    corpus_texts=train_texts,
    corpus_emotions=train_emotions,
    corpus_empathies=train_empathies,
    corpus_polarities_onehot=train_polarities_onehot,
    model=model,
    unique_polarities=unique_polarities,
    num_turns_to_generate=5,
    start_turn=6,
    weights=weights,
)

for conv_id, conv_data in tqdm(test_conversations_grouped, desc="Generating test predictions"):
    # Order the turns and renumber rows so positions start at 0.
    conv_data_sorted = conv_data.sort_values('turn_id').reset_index(drop=True)

    generated = generate_conversation(conv_data_sorted, **test_generation_kwargs)

    # The 'id' field of each output row holds the conversation id.
    test_generations.extend(
        {
            'id': conv_id,
            'turn_number': item['turn'],
            'generated_response': item['generated_text'],
        }
        for item in generated
    )

test_df_output = pd.DataFrame(test_generations)
print(f"\nTotal test generations: {len(test_df_output)}")
print("First few test predictions:")
print(test_df_output.head(10))

Generating test predictions...


Generating test predictions: 100%|██████████| 67/67 [00:18<00:00,  3.63it/s]


Total test generations: 335
First few test predictions:
    id  turn_number                                 generated_response
0  393            6  I think they should do a telethon get a bunch ...
1  393            7  I think they should do a telethon get a bunch ...
2  393            8  I think they should do a telethon get a bunch ...
3  393            9  I think they should do a telethon get a bunch ...
4  393           10  I think they should do a telethon get a bunch ...
5  394            6  I'm happy they waited for the children to be o...
6  394            7  I'm happy they waited for the children to be o...
7  394            8  I'm happy they waited for the children to be o...
8  394            9  I'm happy they waited for the children to be o...
9  394           10  I'm happy they waited for the children to be o...





In [None]:
# Make sure the output directory exists before writing into it.
output_dir = '/content/421_Project_UIC/Part_2_Project/outputs'
os.makedirs(output_dir, exist_ok=True)

# Dev generations are written as a table for later analysis.
dev_gen_df = pd.DataFrame(all_generations)

# (frame, destination, confirmation message), written in this order:
# dev metrics, test predictions (REQUIRED FOR SUBMISSION), dev generations.
for frame_to_save, csv_path, saved_message in (
    (metrics_df, f'{output_dir}/dev_metrics.csv',
     f"✓ Dev metrics saved to {output_dir}/dev_metrics.csv"),
    (test_df_output, f'{output_dir}/generations_corpus.csv',
     f"✓ Test predictions saved to {output_dir}/generations_corpus.csv"),
    (dev_gen_df, f'{output_dir}/dev_generations.csv',
     f"✓ Dev generations saved to {output_dir}/dev_generations.csv"),
):
    frame_to_save.to_csv(csv_path, index=False)
    print(saved_message)

banner = "="*80
print("\n" + banner)
print("✓✓✓ Q1 CORPUS-BASED CHATBOT COMPLETED ✓✓✓")
print(banner)
print(f"\nKey metrics:")

# Mean of every dev metric, one per line.
metric_lines = [
    f"  - ROUGE-1: {metrics_df['rouge1'].mean():.4f}",
    f"  - ROUGE-2: {metrics_df['rouge2'].mean():.4f}",
    f"  - ROUGE-L: {metrics_df['rougeL'].mean():.4f}",
    f"  - BLEU: {metrics_df['bleu'].mean():.4f}",
    f"  - BERTScore F1: {metrics_df['bertscore_f1'].mean():.4f}",
]
print(*metric_lines, sep="\n")
print(f"\nWeights used: {weights}")

✓ Dev metrics saved to /content/421_Project_UIC/Part_2_Project/outputs/dev_metrics.csv
✓ Test predictions saved to /content/421_Project_UIC/Part_2_Project/outputs/generations_corpus.csv
✓ Dev generations saved to /content/421_Project_UIC/Part_2_Project/outputs/dev_generations.csv

✓✓✓ Q1 CORPUS-BASED CHATBOT COMPLETED ✓✓✓

Key metrics:
  - ROUGE-1: 0.1281
  - ROUGE-2: 0.0157
  - ROUGE-L: 0.0996
  - BLEU: 0.0000
  - BERTScore F1: 0.8484

Weights used: (0.4, 0.2, 0.2, 0.2)


In [None]:
print("SAMPLE GENERATION ANALYSIS\n")

# Inspect every generated turn of the first conversation in the list.
sample_conv_id = all_generations[0]['conversation_id']
sample_gens = list(filter(lambda g: g['conversation_id'] == sample_conv_id, all_generations))

print(f"Conversation ID: {sample_conv_id}")
print(f"Generated {len(sample_gens)} turns:")
for gen in sample_gens:
    # Text is cut to its first 100 characters for display.
    print(
        f"\n  Turn {gen['turn']}:",
        f"    Text: {gen['generated_text'][:100]}...",
        f"    Similarity: {gen['similarity_score']:.4f}",
        sep="\n",
    )

SAMPLE GENERATION ANALYSIS

Conversation ID: 68
Generated 5 turns:

  Turn 6:
    Text: I would do anything to protect my family, but I do not  go looking for trouble. I'm glad you can see...
    Similarity: 0.8191

  Turn 7:
    Text: Going back to what you said earlier. I feel bad for anybody family that has to go though this....
    Similarity: 0.7896

  Turn 8:
    Text: I would do anything to protect my family, but I do not  go looking for trouble. I'm glad you can see...
    Similarity: 0.8531

  Turn 9:
    Text: I would do anything to protect my family, but I do not  go looking for trouble. I'm glad you can see...
    Similarity: 0.8698

  Turn 10:
    Text: I would do anything to protect my family, but I do not  go looking for trouble. I'm glad you can see...
    Similarity: 0.8140


**Qualitative Evaluation using Q1 corpus model outputs**

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import os

output_dir = '/content/421_Project_UIC/Part_2_Project/outputs'
print("Libraries imported")

Libraries imported


In [None]:
# Re-read the train and dev splits; unparseable lines are skipped.
train_df, dev_df = (
    pd.read_csv(csv_path, on_bad_lines='skip', engine='python')
    for csv_path in (
        '/content/421_Project_UIC/P1_DATA/trac2_CONVT_train.csv',
        '/content/421_Project_UIC/P1_DATA/trac2_CONVT_dev.csv',
    )
)

def normalize_scores(df):
    """Min-max scale the 'Emotion' and 'Empathy' columns into [0, 1].

    The input frame is copied, never modified. For each of the two columns
    that is present, an '<name>_normalized' column is added, scaled with the
    minimum and maximum of that same frame.

    A column whose values are all identical has a zero range; it is mapped to
    0.0 instead of dividing by zero, which would fill the column with NaN.

    Args:
        df: DataFrame that may contain 'Emotion' and/or 'Empathy' columns.

    Returns:
        A copy of ``df`` with the normalized columns appended.
    """
    df_processed = df.copy()
    for column in ('Emotion', 'Empathy'):
        if column not in df_processed.columns:
            continue

        values = df_processed[column]
        low = values.min()
        span = values.max() - low

        if span == 0:
            # Constant column: (x - min) is already 0 for every non-missing
            # value, so skip the 0/0 division that would yield NaN.
            scaled = (values - low).astype(float)
        else:
            scaled = (values - low) / span

        df_processed[f'{column}_normalized'] = scaled
    return df_processed

# Only the dev split is normalized here; train_df is loaded above but is not
# processed in this cell.
dev_processed = normalize_scores(dev_df)
print(f"Dev set loaded: {len(dev_processed)} samples")

Dev set loaded: 965 samples


In [None]:
# Checkpoint name handed to AutoTokenizer and to MultiTaskModel further down.
model_name = "bert-base-uncased"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MultiTaskModel(nn.Module):
    """Pretrained encoder with three linear heads on the first-token state.

    Heads: a single-output emotion head, a ``polarities``-way polarity head
    and a single-output empathy head. Dropout (rate taken from the encoder
    config) is applied to the first-token hidden state before the heads.
    """

    def __init__(self, model_name, polarities):
        """Load the encoder named ``model_name`` and build the three heads."""
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        encoder_config = self.bert.config
        width = encoder_config.hidden_size
        self.drop = nn.Dropout(encoder_config.hidden_dropout_prob)
        # Heads are created in the order emotion, polarity, empathy.
        self.emotion_classifier = nn.Linear(width, 1)
        self.polarity_classifier = nn.Linear(width, polarities)
        self.empathy = nn.Linear(width, 1)

    def forward(self, input_id, attention_mask):
        """Return ``(emotion, polarity, empathy)`` outputs for a batch.

        ``emotion`` and ``empathy`` have their trailing size-1 dimension
        squeezed away; ``polarity`` keeps one column per polarity class.
        """
        encoded = self.bert(input_ids=input_id, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence.
        first_token = self.drop(encoded.last_hidden_state[:, 0])
        return (
            self.emotion_classifier(first_token).squeeze(-1),
            self.polarity_classifier(first_token),
            self.empathy(first_token).squeeze(-1),
        )

print("Loading BERT model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The polarity head gets one output per distinct polarity value in the dev split.
num_polar_labels = len(sorted(dev_processed['EmotionalPolarity'].unique()))
# NOTE(review): MultiTaskModel builds its three linear heads from scratch and
# this cell does not load a fine-tuned state_dict — confirm trained weights
# are restored before the predictions produced below are interpreted.
bert_model = MultiTaskModel(model_name, num_polar_labels).to(device)
print("BERT model loaded")

Loading BERT model...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT model loaded


In [None]:
def denormalize_predictions(emotion_pred, empathy_pred):
    """Clamp emotion and empathy predictions to the closed interval [1, 5].

    Returns:
        Tuple ``(emotion, empathy)`` of the clipped values.
    """
    lower, upper = 1, 5
    clipped_emotion = np.clip(emotion_pred, lower, upper)
    clipped_empathy = np.clip(empathy_pred, lower, upper)
    return clipped_emotion, clipped_empathy

def get_bert_predictions(texts):
    """Run the multi-task model on ``texts`` and return its three predictions.

    Texts are tokenized with truncation and padding to 128 tokens, moved to
    ``device`` and passed through ``bert_model`` in eval mode without
    gradient tracking.

    Returns:
        Tuple ``(emotion, polarity, empathy)`` of NumPy arrays: emotion and
        empathy outputs after ``denormalize_predictions``, and the argmax
        class index of the polarity outputs.
    """
    encoded = tokenizer(
        list(texts),
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    bert_model.eval()
    with torch.no_grad():
        emotion_out, polarity_out, empathy_out = bert_model(input_ids, attention_mask)

    emotion_raw = emotion_out.cpu().numpy()
    polarity_preds = polarity_out.argmax(dim=1).cpu().numpy()
    empathy_raw = empathy_out.cpu().numpy()

    # Clamp each (emotion, empathy) pair once and split the pairs afterwards.
    clamped_pairs = [
        denormalize_predictions(emotion_value, empathy_value)
        for emotion_value, empathy_value in zip(emotion_raw, empathy_raw)
    ]
    emotion_preds_denorm = np.array([pair[0] for pair in clamped_pairs])
    empathy_preds_denorm = np.array([pair[1] for pair in clamped_pairs])

    return emotion_preds_denorm, polarity_preds, empathy_preds_denorm

print("Functions defined")

Functions defined


In [None]:
all_generations_df = pd.read_csv(f'{output_dir}/dev_generations.csv')
all_generations = all_generations_df.to_dict('records')
print("Generating Q3 analysis with CORRECTED BERT predictions...\n")

selected_conv_ids = [68, 72, 74, 80, 85]
q3_results = []

# The polarity head outputs a class index, while the gold column holds the
# polarity value itself, so the two cannot be compared directly.
# NOTE(review): assumes class i corresponds to the i-th smallest distinct dev
# polarity value (the same set that sizes the head) — confirm against the
# label encoding used when the model was trained.
polarity_labels = sorted(dev_processed['EmotionalPolarity'].unique())

for conv_id in selected_conv_ids:
    print(f"\n{'='*80}")
    print(f"CONVERSATION {conv_id}")
    print(f"{'='*80}")

    # turn -> generated text for this conversation (first occurrence wins)
    corpus_by_turn = {}
    for g in all_generations:
        if g['conversation_id'] == conv_id:
            corpus_by_turn.setdefault(g['turn'], g['generated_text'])

    gold_data = dev_processed[dev_processed['conversation_id'] == conv_id].sort_values('turn_id')

    for turn_num in range(6, 11):
        corpus_text = corpus_by_turn.get(turn_num)

        # Skip turns without generated text; a value read back from the CSV
        # as NaN is not text either and cannot be sliced or tokenized.
        if not isinstance(corpus_text, str):
            continue

        gold_row = gold_data[gold_data['turn_id'] == turn_num]
        if len(gold_row) == 0:
            continue

        gold_row = gold_row.iloc[0]

        emo_pred, pol_pred, emp_pred = get_bert_predictions([corpus_text])

        # Translate the predicted class index into a polarity value.
        pred_class = int(pol_pred[0])
        if pred_class < len(polarity_labels):
            pred_polarity_value = polarity_labels[pred_class]
        else:
            pred_polarity_value = float('nan')

        result = {
            'conversation_id': conv_id,
            'turn': turn_num,
            'generated_text': corpus_text[:100],
            'gold_emotion': gold_row['Emotion'],
            'pred_emotion': emo_pred[0],
            'gold_polarity': gold_row['EmotionalPolarity'],
            'pred_polarity': pol_pred[0],  # raw class index, kept for compatibility
            'pred_polarity_value': pred_polarity_value,
            'gold_empathy': gold_row['Empathy'],
            'pred_empathy': emp_pred[0]
        }

        q3_results.append(result)

        print(f"\nTurn {turn_num}:")
        print(f"  Emotion:  Gold={gold_row['Emotion']:.1f}, Pred={emo_pred[0]:.2f}")
        # Gold polarity can be fractional (e.g. 0.5, 1.5), so it is printed
        # as-is rather than truncated to an integer.
        print(f"  Polarity: Gold={gold_row['EmotionalPolarity']:g}, Pred={pred_polarity_value:g} (class {pred_class})")
        print(f"  Empathy:  Gold={gold_row['Empathy']:.1f}, Pred={emp_pred[0]:.2f}")

q3_df = pd.DataFrame(q3_results)
q3_df.to_csv(f'{output_dir}/q3_corpus_analysis_CORRECTED.csv', index=False)

print(f"\n✓ Q3 corrected analysis saved")

Generating Q3 analysis with CORRECTED BERT predictions...


CONVERSATION 68

Turn 6:
  Emotion:  Gold=2.0, Pred=1.00
  Polarity: Gold=1, Pred=2
  Empathy:  Gold=3.0, Pred=1.00

Turn 7:
  Emotion:  Gold=2.0, Pred=1.00
  Polarity: Gold=1, Pred=2
  Empathy:  Gold=3.0, Pred=1.00

Turn 8:
  Emotion:  Gold=3.0, Pred=1.00
  Polarity: Gold=2, Pred=2
  Empathy:  Gold=4.0, Pred=1.00

Turn 9:
  Emotion:  Gold=3.0, Pred=1.00
  Polarity: Gold=1, Pred=2
  Empathy:  Gold=3.0, Pred=1.00

Turn 10:
  Emotion:  Gold=2.0, Pred=1.00
  Polarity: Gold=1, Pred=2
  Empathy:  Gold=2.0, Pred=1.00

CONVERSATION 72

Turn 6:
  Emotion:  Gold=1.0, Pred=1.00
  Polarity: Gold=2, Pred=2
  Empathy:  Gold=2.0, Pred=1.00

Turn 7:
  Emotion:  Gold=1.0, Pred=1.00
  Polarity: Gold=1, Pred=2
  Empathy:  Gold=1.0, Pred=1.00

Turn 8:
  Emotion:  Gold=1.0, Pred=1.00
  Polarity: Gold=1, Pred=2
  Empathy:  Gold=1.0, Pred=1.00

Turn 9:
  Emotion:  Gold=2.0, Pred=1.00
  Polarity: Gold=1, Pred=2
  Empathy:  Gold=1.0, Pred=1.00

Turn 