In [72]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import os

############################
# Configurations
############################
CSV_FILE_1 = "Transcribed videos containing well-being - Arkusz1.csv"  # Your original CSV
CSV_FILE_2 = "Videos not containing well-being - Arkusz1.csv"  # The new CSV with not well-being related videos
TEXT_COLUMN = "Transcribed text"
LABEL_COLUMN = "Confidence score 1-10"
THRESHOLD_POSITIVE = 6  # Scores ≥ 5 = well-being (1), otherwise 0
THRESHOLD_NEGATIVE = 5
MAX_LEN = 256
BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 2e-5
VAL_SPLIT = 0.1
TEST_SPLIT = 0.1
MODEL_NAME = "bert-base-uncased"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [73]:
df1 = pd.read_csv(CSV_FILE_1)
df2 = pd.read_csv(CSV_FILE_2)

# For df1 (well-being confidence):
# If score ≥ THRESHOLD => well-being (1), else not well-being (0)
df1['label'] = df1[LABEL_COLUMN].apply(lambda x: 1 if x >= THRESHOLD_POSITIVE else 0)

# For df2 (not well-being confidence):
# If score ≥ THRESHOLD => not well-being (0), else well-being (1)
# This might create some "contradictory" cases if the second CSV is supposed to be purely not well-being.
# If the second CSV really only has not well-being videos, you might just set label=0 for all rows in df2.
df2['label'] = 0 #df2[LABEL_COLUMN].apply(lambda x: 0 if x >= THRESHOLD_NEGATIVE else 1)

# Combine both DataFrames
df = pd.concat([df1, df2], ignore_index=True)

texts = df[TEXT_COLUMN].astype(str).tolist()
labels = df['label'].tolist()

In [74]:
############################
# Step 2: Split into Train/Val/Test
############################
dataset_size = len(df)
test_size = int(TEST_SPLIT * dataset_size)
val_size = int(VAL_SPLIT * dataset_size)
train_size = dataset_size - val_size - test_size

indices = np.random.permutation(dataset_size)
train_indices = indices[:train_size]
val_indices = indices[train_size:train_size+val_size]
test_indices = indices[train_size+val_size:]

train_texts = [texts[i] for i in train_indices]
train_labels = [labels[i] for i in train_indices]

val_texts = [texts[i] for i in val_indices]
val_labels = [labels[i] for i in val_indices]

test_texts = [texts[i] for i in test_indices]
test_labels = [labels[i] for i in test_indices]


In [75]:
############################
# Step 3: Tokenization
############################
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [76]:
############################
# Step 4: Create PyTorch Datasets
############################
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Tensors are returned with batch dimension, so we index to get rid of it
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = TextDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = TextDataset(val_texts, val_labels, tokenizer, MAX_LEN)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, MAX_LEN)

In [77]:
############################
# Step 5: Model Initialization
############################
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
############################
# Step 6: Load Pretrained Model
############################
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
from tqdm import tqdm

def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    loop = tqdm(data_loader, desc="Training", leave=False)
    for batch in loop:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=(total_loss/(loop.n+1)))
    return total_loss / len(data_loader)

def eval_model(model, data_loader):
    model.eval()
    preds = []
    trues = []
    total_loss = 0
    loop = tqdm(data_loader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['labels'].to(DEVICE)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            preds.extend(predictions.cpu().numpy())
            trues.extend(labels.cpu().numpy())
            
            loop.set_postfix(loss=(total_loss/(loop.n+1)))

    avg_loss = total_loss / len(data_loader)
    acc = accuracy_score(trues, preds)
    f1 = f1_score(trues, preds, average='weighted')
    precision = precision_score(trues, preds, average='weighted')
    recall = recall_score(trues, preds, average='weighted')
    return avg_loss, acc, f1, precision, recall

best_val_loss = float('inf')
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer)
    val_loss, val_acc, val_f1, val_prec, val_rec = eval_model(model, val_loader)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")

    # Early stopping or model checkpointing
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")

# Load the best model and evaluate on test
model.load_state_dict(torch.load("best_model.pt"))

Epoch 1/3


                                                                     

Train Loss: 0.6307
Val Loss: 0.6125 | Val Acc: 0.6111 | Val F1: 0.6074
Epoch 2/3


                                                                     

Train Loss: 0.3518
Val Loss: 0.6991 | Val Acc: 0.6667 | Val F1: 0.6580
Epoch 3/3


  model.load_state_dict(torch.load("best_model.pt"))


Train Loss: 0.2452
Val Loss: 0.6442 | Val Acc: 0.7222 | Val F1: 0.7083


<All keys matched successfully>

In [80]:
############################
# Step 8: Evaluate on Test Set
############################
test_loss, test_acc, test_f1, test_prec, test_rec = eval_model(model, test_loader)
print("Test Results:")
print(f"Loss: {test_loss:.4f}, Accuracy: {test_acc:.4f}, F1: {test_f1:.4f}, Precision: {test_prec:.4f}, Recall: {test_rec:.4f}")


                                                                     

Test Results:
Loss: 0.5323, Accuracy: 0.7778, F1: 0.7835, Precision: 0.8083, Recall: 0.7778




In [81]:
from transformers import BertTokenizer, BertModel
from bertviz import head_view
import torch

MODEL_NAME = "bert-base-uncased"
text = "Here is some well-being related content about self-reflection and personal growth."

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME, output_attentions=True)

# Tokenize input
inputs = tokenizer.encode_plus(text, return_tensors='pt', add_special_tokens=True)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Run the model
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    # outputs.attentions is a tuple with attention weights from all layers
    attentions = outputs.attentions

# Convert token IDs back to tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# Visualize head-wise attention for each layer
# The head_view function expects certain formats for the tokens and attentions
head_view(attentions, tokens)


<IPython.core.display.Javascript object>