In [None]:
# Install the third-party packages imported below. The `!pip` line is an
# IPython/Colab shell escape, so this cell only runs inside a notebook kernel.
print("Installing packages...")
!pip install -q transformers torch pandas scikit-learn matplotlib tqdm
print("Installed!\n")

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt


# --- Run configuration ---------------------------------------------------
# Root folder of the dataset on the mounted Google Drive.
DATASET_PATH = '/content/drive/MyDrive/MERGE_Lyrics_Complete'

# Training hyper-parameters.
BATCH_SIZE = 16
NUM_EPOCHS = 30
PATIENCE = 7                  # epochs without validation improvement before stopping
DROPOUT_RATE = 0.3
LEARNING_RATE_ROBERTA = 1e-5  # learning rate for the pretrained encoder
LEARNING_RATE_CUSTOM = 1e-3   # learning rate for the layers added on top

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Device: {device}")
    print(f"GPU: {torch.cuda.get_device_name(0)}\n")
else:
    device = torch.device('cpu')
    print(f"Device: {device}")

# Mount Google Drive (Colab only) and make sure the dataset folder is reachable
# before any loading work starts.
from google.colab import drive
drive.mount('/content/drive')
print(f"\nDrive mounted")
print(f"Looking for dataset at: {DATASET_PATH}")

if os.path.exists(DATASET_PATH):
    print(f"Dataset found!\n")
else:
    # Fail fast with a hint; nothing below works without the dataset.
    print("Dataset not found! Update DATASET_PATH variable.")
    raise FileNotFoundError(f"Cannot find {DATASET_PATH}")

print("="*60)
print("LOADING DATA")
print("="*60)

# Per-song annotation table; the 'Arousal' and 'Valence' columns are read below.
av_path = os.path.join(DATASET_PATH, 'merge_lyrics_complete_av_values.csv')
av_df = pd.read_csv(av_path)
print(f"Loaded {len(av_df)} songs")

# Report the label ranges before rescaling so the source scale can be checked.
print(f"Original ranges: Arousal [{av_df['Arousal'].min():.3f}, {av_df['Arousal'].max():.3f}], Valence [{av_df['Valence'].min():.3f}, {av_df['Valence'].max():.3f}]")

# Linear rescale x -> 2x - 1, which maps [0, 1] onto [-1, 1] (the range of the
# model's tanh output). NOTE(review): assumes the CSV labels lie in [0, 1];
# confirm against the "Original ranges" line printed above.
av_df['Arousal'] = 2 * av_df['Arousal'] - 1
av_df['Valence'] = 2 * av_df['Valence'] - 1
print(f"Normalized ranges: Arousal [{av_df['Arousal'].min():.3f}, {av_df['Arousal'].max():.3f}], Valence [{av_df['Valence'].min():.3f}, {av_df['Valence'].max():.3f}]")

def read_lyrics(song_id, base_path):
    """Return the stripped lyrics text for *song_id*, or ``None`` if absent.

    Looks for ``<base_path>/<Qn>/<song_id>.txt`` in the folders ``Q1``..``Q4``
    (in that order) and reads the first file that exists as UTF-8.
    """
    candidate_paths = (
        os.path.join(base_path, quadrant, f'{song_id}.txt')
        for quadrant in ('Q1', 'Q2', 'Q3', 'Q4')
    )
    lyrics_path = next((p for p in candidate_paths if os.path.exists(p)), None)
    if lyrics_path is None:
        return None
    with open(lyrics_path, 'r', encoding='utf-8') as handle:
        return handle.read().strip()

# Join every annotated song with its lyrics file; songs whose lyrics are
# missing, empty, or 10 characters or shorter are dropped.
print("\nReading lyrics files...")
data = []
for _, row in tqdm(av_df.iterrows(), total=len(av_df)):
    lyrics = read_lyrics(row['Song'], DATASET_PATH)
    if not lyrics or len(lyrics) <= 10:
        continue
    data.append(dict(
        song_id=row['Song'],
        lyrics=lyrics,
        valence=row['Valence'],
        arousal=row['Arousal'],
    ))

df = pd.DataFrame(data)
print(f"Loaded {len(df)} songs with lyrics")
print(f"Verify normalization - Arousal: [{df['arousal'].min():.3f}, {df['arousal'].max():.3f}], Valence: [{df['valence'].min():.3f}, {df['valence'].max():.3f}]\n")

# Song-id lists for the train/validate/test partition shipped with the dataset
# (folder 'tvt_70_15_15'); each CSV is read for its 'Song' column.
splits_dir = os.path.join(DATASET_PATH, 'tvt_dataframes', 'tvt_70_15_15')
train_csv = os.path.join(splits_dir, 'tvt_70_15_15_train_lyrics_complete.csv')
val_csv = os.path.join(splits_dir, 'tvt_70_15_15_validate_lyrics_complete.csv')
test_csv = os.path.join(splits_dir, 'tvt_70_15_15_test_lyrics_complete.csv')

train_ids = set(pd.read_csv(train_csv)['Song'].values)
val_ids = set(pd.read_csv(val_csv)['Song'].values)
test_ids = set(pd.read_csv(test_csv)['Song'].values)

# Keep only songs that actually have lyrics (rows of `df`), one frame per split.
song_ids = df['song_id']
train_df = df[song_ids.isin(train_ids)].reset_index(drop=True)
val_df = df[song_ids.isin(val_ids)].reset_index(drop=True)
test_df = df[song_ids.isin(test_ids)].reset_index(drop=True)

print(f"Splits: {len(train_df)} train, {len(val_df)} val, {len(test_df)} test\n")

class LyricsEmotionRegressor(nn.Module):
    """RoBERTa encoder followed by a small MLP that regresses two values.

    The head maps the 768-d encoder feature through 512 -> 256 -> 128 units
    (ReLU activations, dropout after the first two layers) and finally to a
    2-d output squashed with tanh. The 128-d activation is exposed as an
    embedding on request.
    """

    def __init__(self, dropout_rate=0.3):
        """Load 'roberta-base' and create the regression head.

        Args:
            dropout_rate: Dropout probability used inside the head.
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        # Regression head; layer sizes: 768 -> 512 -> 256 -> 128 -> 2.
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 256)
        self.embedding_projection = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 2)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, input_ids, attention_mask, return_embeddings=False):
        """Run the encoder and head.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            return_embeddings: When true, also return the 128-d activation.

        Returns:
            The tanh-bounded 2-d output, or ``(output, embeddings)`` when
            ``return_embeddings`` is true.
        """
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sequence feature.
        first_token = encoder_out.last_hidden_state[:, 0, :]

        hidden = self.dropout(self.relu(self.fc1(first_token)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))

        embeddings = self.relu(self.embedding_projection(hidden))
        output = self.tanh(self.fc3(embeddings))

        if return_embeddings:
            return output, embeddings
        return output

# Module-level tokenizer for the 'roberta-base' checkpoint (the same checkpoint
# name the model loads); MERGEDataset.__getitem__ and predict() both use it.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class MERGEDataset(Dataset):
    """Dataset over a frame with 'lyrics', 'valence' and 'arousal' columns.

    Each item is tokenized on access with the module-level ``tokenizer``.
    """

    def __init__(self, dataframe):
        """Keep the three needed columns as arrays (``.values``)."""
        for column in ('lyrics', 'valence', 'arousal'):
            setattr(self, column, dataframe[column].values)

    def __len__(self):
        """Number of songs in the dataset."""
        return len(self.lyrics)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``target`` for one song.

        The text is padded/truncated to 512 tokens; ``target`` is a float
        tensor ordered ``[valence, arousal]``.
        """
        text = str(self.lyrics[idx])
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so the DataLoader's
        # default collation stacks items along a new batch axis.
        sample = {
            key: encoding[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        labels = [self.valence[idx], self.arousal[idx]]
        sample['target'] = torch.tensor(labels, dtype=torch.float)
        return sample

# One dataset per split, in train / val / test order.
train_dataset, val_dataset, test_dataset = (
    MERGEDataset(frame) for frame in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep the frame order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"DataLoaders ready: {len(train_loader)} train batches\n")

print("=" * 60)
print("CREATING MODEL")
print("=" * 60)

model = LyricsEmotionRegressor(dropout_rate=DROPOUT_RATE)
model = model.to(device)
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")

# Separate learning rates: a small one for the pretrained encoder and a larger
# one for each of the four head layers (one parameter group per layer).
head_layers = (model.fc1, model.fc2, model.embedding_projection, model.fc3)
param_groups = [{'params': model.roberta.parameters(), 'lr': LEARNING_RATE_ROBERTA}]
param_groups += [
    {'params': layer.parameters(), 'lr': LEARNING_RATE_CUSTOM}
    for layer in head_layers
]
optimizer = optim.Adam(param_groups)

# Halve the learning rates after 3 epochs without validation-loss improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)
criterion = nn.MSELoss()

print("Model and optimizer created\n")

print("="*60)
print("TRAINING")
print("="*60)

# Early-stopping state and per-epoch history.
best_val_loss = float('inf')
# Stays None until some epoch improves on best_val_loss. Without this
# initialisation a run whose validation loss never improves (e.g. NaN) would
# hit a NameError after the loop.
best_epoch = None
patience_counter = 0
train_losses, val_losses = [], []
val_mae_v_list, val_mae_a_list = [], []

for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{NUM_EPOCHS}', leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        predictions = model(input_ids, attention_mask)
        loss = criterion(predictions, targets)

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses.
    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # ---- validation pass ----
    model.eval()
    val_loss = 0
    val_preds, val_targets = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            predictions = model(input_ids, attention_mask)
            loss = criterion(predictions, targets)

            val_loss += loss.item()
            val_preds.append(predictions.cpu())
            val_targets.append(targets.cpu())

    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    # Per-dimension MAE over the whole validation set
    # (column 0 = valence, column 1 = arousal, matching the dataset target).
    val_preds = torch.cat(val_preds)
    val_targets = torch.cat(val_targets)
    mae_v = torch.mean(torch.abs(val_preds[:, 0] - val_targets[:, 0])).item()
    mae_a = torch.mean(torch.abs(val_preds[:, 1] - val_targets[:, 1])).item()
    val_mae_v_list.append(mae_v)
    val_mae_a_list.append(mae_a)

    scheduler.step(val_loss)

    print(f'Epoch {epoch+1}: Train={train_loss:.4f}, Val={val_loss:.4f}, MAE V={mae_v:.4f} A={mae_a:.4f}', end='')

    if val_loss < best_val_loss:
        # New best: checkpoint the weights and reset the patience counter.
        best_val_loss = val_loss
        best_epoch = epoch + 1
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print(' BEST')
    else:
        patience_counter += 1
        print()
        if patience_counter >= PATIENCE:
            print(f'\nEarly stopping at epoch {epoch+1}')
            break

if best_epoch is None:
    # No checkpoint was written in this run; do not load a possibly stale
    # 'best_model.pth' left over from an earlier session.
    print('\nValidation loss never improved; keeping the current weights\n')
else:
    model.load_state_dict(torch.load('best_model.pth', map_location=device))
    print(f'\nLoaded best model from epoch {best_epoch}\n')

# Two side-by-side panels: loss curves (left) and validation MAE (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, mae_ax = axes

loss_ax.plot(train_losses, 'o-', label='Train')
loss_ax.plot(val_losses, 's-', label='Val')
loss_ax.set_title('Training Progress')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)

mae_ax.plot(val_mae_v_list, 'o-', label='Valence MAE')
mae_ax.plot(val_mae_a_list, 's-', label='Arousal MAE')
mae_ax.set_title('Validation MAE')
mae_ax.set_xlabel('Epoch')
mae_ax.set_ylabel('MAE')
mae_ax.legend()
mae_ax.grid(True)

plt.tight_layout()
plt.show()

print("=" * 60)
print("TEST SET EVALUATION")
print("=" * 60)

# Collect predictions and targets for the whole test split on the CPU.
model.eval()
test_preds, test_targets = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        predictions = model(input_ids, attention_mask)
        test_preds.append(predictions.cpu())
        test_targets.append(targets.cpu())

test_preds = torch.cat(test_preds)
test_targets = torch.cat(test_targets)

# Metrics over the full test set: MSE on both outputs, then per-dimension
# MAE and Pearson correlation (column 0 = valence, column 1 = arousal).
test_loss = criterion(test_preds, test_targets).item()
abs_err = torch.abs(test_preds - test_targets)
mae_v = torch.mean(abs_err[:, 0]).item()
mae_a = torch.mean(abs_err[:, 1]).item()
corr_v = np.corrcoef(test_preds[:, 0], test_targets[:, 0])[0, 1]
corr_a = np.corrcoef(test_preds[:, 1], test_targets[:, 1])[0, 1]

print(f"\nTest MSE: {test_loss:.4f}")
print(f"Test MAE - Valence: {mae_v:.4f}, Arousal: {mae_a:.4f}")
print(f"Correlation - Valence: {corr_v:.4f}, Arousal: {corr_a:.4f}")

# Predicted-vs-true scatter per output dimension, with the y = x diagonal.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = (
    (0, 'Valence', mae_v, corr_v),
    (1, 'Arousal', mae_a, corr_a),
)
for col, label, mae, corr in panels:
    ax = axes[col]
    ax.scatter(test_targets[:, col], test_preds[:, col], alpha=0.5, s=20)
    ax.plot([-1, 1], [-1, 1], 'r--', lw=2)
    ax.set_xlabel(f'True {label}')
    ax.set_ylabel(f'Predicted {label}')
    ax.set_title(f'{label} (MAE={mae:.3f}, r={corr:.3f})')
    ax.grid(True)
plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("EXTRACTING 128-DIM EMBEDDINGS")
print("="*60)

print("Extracting embeddings...")

# Second pass over the (unshuffled) test loader, keeping the 128-d activation.
model.eval()
embedding_batches = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Embeddings'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        _, embeddings = model(input_ids, attention_mask, return_embeddings=True)
        embedding_batches.append(embeddings.cpu().numpy())

test_embeddings = np.concatenate(embedding_batches, axis=0)
print(f"\nExtracted embeddings: {test_embeddings.shape}")

# Column order: song id, emb_0..emb_127, then true and predicted labels.
columns = {'song_id': test_df['song_id'].values}
columns.update((f'emb_{i}', test_embeddings[:, i]) for i in range(128))
columns['true_valence'] = test_targets[:, 0].numpy()
columns['true_arousal'] = test_targets[:, 1].numpy()
columns['pred_valence'] = test_preds[:, 0].numpy()
columns['pred_arousal'] = test_preds[:, 1].numpy()
embeddings_df = pd.DataFrame(columns)

embeddings_df.to_csv('embeddings_128dim_with_metadata.csv', index=False)
print(f"Saved to 'embeddings_128dim_with_metadata.csv'")

np.save('test_embeddings_128dim.npy', test_embeddings)
print(f"Saved numpy array to 'test_embeddings_128dim.npy'")

print("\n" + "="*60)
print("EXAMPLE PREDICTIONS")
print("="*60)

def predict(text):
    """Predict valence and arousal for one piece of text.

    Uses the module-level ``model``, ``tokenizer`` and ``device``; the model is
    switched to eval mode first.

    Returns:
        ``(valence, arousal, embedding)`` where the first two are Python floats
        and ``embedding`` is the NumPy array of the model's embedding output
        for this text.
    """
    model.eval()
    enc = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    ids = enc['input_ids'].to(device)
    mask = enc['attention_mask'].to(device)
    with torch.no_grad():
        pred, emb = model(ids, mask, return_embeddings=True)
    # Single-item batch: row 0 holds [valence, arousal].
    valence = pred[0, 0].item()
    arousal = pred[0, 1].item()
    return valence, arousal, emb[0].cpu().numpy()

# A few hand-written sentences used as a quick smoke test of predict().
examples = [
    "I'm so happy and excited, best day ever!",
    "Feeling sad and lonely, crying alone",
    "So angry I could explode with rage!",
    "Peaceful and calm, enjoying the silence",
    "Terrified and anxious, heart pounding",
]

for text in examples:
    valence_hat, arousal_hat, vector = predict(text)
    preview = text[:50]
    print(f"\n'{preview}...'")
    print(f"  Valence: {valence_hat:+.3f}, Arousal: {arousal_hat:+.3f}")
    print(f"  Embedding shape: {vector.shape} (first 5 values: {vector[:5]})")

print("\n" + "="*60)
print("SAVING FINAL MODEL")
print("="*60)

# Bundle the weights with the training history and headline test metrics.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'train_losses': train_losses,
    'val_losses': val_losses,
    'test_mae_valence': mae_v,
    'test_mae_arousal': mae_a,
    'best_epoch': best_epoch,
}
torch.save(checkpoint, 'final_model.pth')

print("Saved to 'final_model.pth'")

print("\nDone")