# 2. Text Classification 

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
# import torch
# from torch.utils.data import DataLoader, TensorDataset
# from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.font_manager import FontProperties
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
labeled_data = pd.read_csv('../filtered-labeled-data/labeled_data.csv')
unlabeled_data = pd.read_csv('../filtered-labeled-data/unlabeled_data.csv')

## 2.1 TFIDF + Random Forest/ SVM/ Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

# Load the data
data = pd.read_csv('../filtered-labeled-data/labeled_data.csv')

# Only use labeled data and apply mapping for consistency
labeled_data = data.dropna(subset=['class']).copy() # Use .copy() to avoid SettingWithCopyWarning

# Map labels for consistency with BERT splits before splitting
label_mapping = {-1: 0, 1: 1, 9: 2}
# Ensure 'class' column is of a type that can be mapped (e.g. int)
labeled_data['class'] = labeled_data['class'].astype(int).map(label_mapping)

X = labeled_data['text']
y = labeled_data['class'] # y now contains mapped labels (0, 1, 2)

# Split into training and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y # Stratification is now on mapped labels
)

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)

# Define the pipeline for each classifier
pipe_rf = Pipeline([
    ('tfidf', tfidf),
    ('clf', RandomForestClassifier(random_state=42))
])

pipe_svm = Pipeline([
    ('tfidf', tfidf),
    ('clf', SVC(random_state=42, probability=True)) # Added probability=True for SVC if needed by some metrics, though not strictly for the current scorers
])

pipe_lr = Pipeline([
    ('tfidf', tfidf),
    ('clf', LogisticRegression(random_state=42, solver='liblinear')) # Changed solver for potentially better convergence with smaller datasets
])

# Define hyperparameter grids
param_grid_rf = {
    'clf__n_estimators': [100, 200], # Reduced for faster example
    'clf__max_depth': [10, 20, None], # Reduced options
    'clf__min_samples_leaf': [1, 2, 4]
}

param_grid_svm = {
    'clf__C': [0.1, 1, 10], # Reduced options
    'clf__kernel': ['linear', 'rbf'],
    'clf__gamma': ['scale', 'auto']
}

param_grid_lr = {
    'clf__C': [0.1, 1, 10], # Reduced options
    'clf__max_iter': [100, 200], # Adjusted max_iter
    'clf__solver': ['liblinear', 'saga'] # Adjusted solvers
}

# Define scorers
scorers = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=0),
    'recall': make_scorer(recall_score, average='weighted', zero_division=0),
    'f1_score': make_scorer(f1_score, average='weighted', zero_division=0)
}

# Set up GridSearchCV for each classifier
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # n_splits=5 is common
grid_rf = GridSearchCV(pipe_rf, param_grid_rf, cv=kf, scoring=scorers, refit='f1_score', n_jobs=-1)
grid_svm = GridSearchCV(pipe_svm, param_grid_svm, cv=kf, scoring=scorers, refit='f1_score', n_jobs=-1)
grid_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=kf, scoring=scorers, refit='f1_score', n_jobs=-1)

# Fit the models
print("Fitting Random Forest...")
grid_rf.fit(X_train, y_train)
print("Fitting SVM...")
grid_svm.fit(X_train, y_train)
print("Fitting Logistic Regression...")
grid_lr.fit(X_train, y_train)

# Function to print formatted results for both training and test sets
def print_results(name, grid, X_test_data, y_test_data): # Removed X_train, y_train as CV results are on grid object
    print(f"\n{name} Results:")
    print("Cross-validation Results (on training data):")
    print(f"  Best Score (F1): {grid.best_score_:.4f}")
    print(f"  Best Parameters: {grid.best_params_}")
    
    # Accessing CV results correctly
    best_index = grid.best_index_
    print("  Cross-validation Performance Metrics (for best estimator):")
    print(f"    Mean CV Accuracy: {grid.cv_results_['mean_test_accuracy'][best_index]:.4f}")
    print(f"    Mean CV Precision: {grid.cv_results_['mean_test_precision'][best_index]:.4f}")
    print(f"    Mean CV Recall: {grid.cv_results_['mean_test_recall'][best_index]:.4f}")
    print(f"    Mean CV F1 Score: {grid.cv_results_['mean_test_f1_score'][best_index]:.4f}")
    
    # Calculate and print test set performance
    y_pred_test = grid.predict(X_test_data)
    print("\nTest Set Performance:")
    print(f"    Accuracy: {accuracy_score(y_test_data, y_pred_test):.4f}")
    print(f"    Precision: {precision_score(y_test_data, y_pred_test, average='weighted', zero_division=0):.4f}")
    print(f"    Recall: {recall_score(y_test_data, y_pred_test, average='weighted', zero_division=0):.4f}")
    print(f"    F1 Score: {f1_score(y_test_data, y_pred_test, average='weighted', zero_division=0):.4f}")

# Print results for each model
print_results("Random Forest", grid_rf, X_test, y_test)
print_results("SVM", grid_svm, X_test, y_test)
print_results("Logistic Regression", grid_lr, X_test, y_test)

## 2.2 Bert

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import copy  # For saving the best model weights

# Load and prepare data
data_path = '../filtered-labeled-data/labeled_data.csv'
data = pd.read_csv(data_path)
filtered_data = data.dropna(subset=['class'])
label_mapping = {-1: 0, 1: 1, 9: 2}
filtered_data['class'] = filtered_data['class'].map(label_mapping)

# Split into training and test sets (80-20 split)
train_data, test_data = train_test_split(
    filtered_data, test_size=0.2, random_state=42, stratify=filtered_data['class']
)

def encode_data(tokenizer, data):
    tokens = tokenizer.batch_encode_plus(
        data['text'].tolist(),
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    return tokens['input_ids'], tokens['attention_mask'], torch.tensor(data['class'].values)

def evaluate(model, dataloader, device):
    model.eval()
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(b_input_ids, attention_mask=b_input_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(b_labels.cpu().numpy())
    
    return (
        accuracy_score(all_labels, all_predictions),
        precision_score(all_labels, all_predictions, average='weighted'),
        recall_score(all_labels, all_predictions, average='weighted'),
        f1_score(all_labels, all_predictions, average='weighted')
    )

def print_results(cv_results, test_results):
    print("\nBERT Model Results:")
    print("Cross-validation Results:")
    print(f"  Best Score (F1): {np.mean([x[3] for x in cv_results]):.4f}")
    print("  Cross-validation Performance Metrics:")
    print(f"    Accuracy: {np.mean([x[0] for x in cv_results]):.4f}")
    print(f"    Precision: {np.mean([x[1] for x in cv_results]):.4f}")
    print(f"    Recall: {np.mean([x[2] for x in cv_results]):.4f}")
    print(f"    F1 Score: {np.mean([x[3] for x in cv_results]):.4f}")
    
    print("\nTest Set Performance:")
    print(f"    Accuracy: {test_results[0]:.4f}")
    print(f"    Precision: {test_results[1]:.4f}")
    print(f"    Recall: {test_results[2]:.4f}")
    print(f"    F1 Score: {test_results[3]:.4f}")

# Initialize device and tokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prepare test data
test_inputs, test_masks, test_labels = encode_data(tokenizer, test_data)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# K-fold cross-validation
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []

max_epochs = 10     # Maximum number of training epochs, can be adjusted
patience = 2        # Early Stopping patience value, stop training when validation loss doesn't improve

# for train_idx, val_idx in kf.split(train_data):
for train_idx, val_idx in kf.split(train_data, train_data['class']):
    # 1) Reinitialize the model at the start of each fold
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3).to(device)
    
    # Prepare data for this fold
    fold_train_data = train_data.iloc[train_idx]
    fold_val_data = train_data.iloc[val_idx]
    
    train_inputs, train_masks, train_labels = encode_data(tokenizer, fold_train_data)
    val_inputs, val_masks, val_labels = encode_data(tokenizer, fold_val_data)
    
    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader = DataLoader(train_dataset, batch_size=16)
    val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    val_dataloader = DataLoader(val_dataset, batch_size=16)
    
    # 2) Define optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_dataloader) * max_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    
    # Early Stopping variables
    best_val_loss = float('inf')
    wait = 0
    best_model_weights = None
    
    # Training with Early Stopping
    for epoch in range(max_epochs):
        model.train()
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            # 3) Perform scheduler step after each parameter update
            scheduler.step()
        
        # Calculate current epoch validation loss
        model.eval()
        val_loss = 0.0
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                val_loss += outputs.loss.item()
        val_loss /= len(val_dataloader)
        
        # Determine whether to update the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            wait = 0
            best_model_weights = copy.deepcopy(model.state_dict())
        else:
            wait += 1
        
        # Stop training if no improvement for patience epochs
        if wait >= patience:
            break
    
    # Load the best model weights from validation performance
    if best_model_weights is not None:
        model.load_state_dict(best_model_weights)
    
    # Evaluate on validation set
    cv_results.append(evaluate(model, val_dataloader, device))

# Final evaluation on test set
test_results = evaluate(model, test_dataloader, device)

# Print results in the same format as the previous code
print_results(cv_results, test_results)

## 2.3 ModernBERT

In [None]:
# %pip install --upgrade transformers
# %pip install git+https://github.com/huggingface/transformers.git
# %pip install flash-attn

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import copy  # For saving the best model weights

# Load and prepare data
data_path = '../filtered-labeled-data/labeled_data.csv'
data = pd.read_csv(data_path)
filtered_data = data.dropna(subset=['class'])

# Map original labels to 0, 1, 2
label_mapping = {-1: 0, 1: 1, 9: 2}
filtered_data['class'] = filtered_data['class'].map(label_mapping)

# Split into training and test sets (80-20 split)
train_data, test_data = train_test_split(
    filtered_data, test_size=0.2, random_state=42, stratify=filtered_data['class']
)

def encode_data(tokenizer, data):
    tokens = tokenizer.batch_encode_plus(
        data['text'].tolist(),
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    return tokens['input_ids'], tokens['attention_mask'], torch.tensor(data['class'].values)

def evaluate(model, dataloader, device):
    model.eval()
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(b_input_ids, attention_mask=b_input_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(b_labels.cpu().numpy())
    
    return (
        accuracy_score(all_labels, all_predictions),
        precision_score(all_labels, all_predictions, average='weighted'),
        recall_score(all_labels, all_predictions, average='weighted'),
        f1_score(all_labels, all_predictions, average='weighted')
    )

def print_results(cv_results, test_results):
    print("\nModernBERT Model Results:")
    print("Cross-validation Results:")
    print(f"  Best Score (F1): {np.mean([x[3] for x in cv_results]):.4f}")
    print("  Cross-validation Performance Metrics:")
    print(f"    Accuracy: {np.mean([x[0] for x in cv_results]):.4f}")
    print(f"    Precision: {np.mean([x[1] for x in cv_results]):.4f}")
    print(f"    Recall: {np.mean([x[2] for x in cv_results]):.4f}")
    print(f"    F1 Score: {np.mean([x[3] for x in cv_results]):.4f}")
    
    print("\nTest Set Performance:")
    print(f"    Accuracy: {test_results[0]:.4f}")
    print(f"    Precision: {test_results[1]:.4f}")
    print(f"    Recall: {test_results[2]:.4f}")
    print(f"    F1 Score: {test_results[3]:.4f}")

# Initialize device and tokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Use ModernBERT tokenizer
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare test data
test_inputs, test_masks, test_labels = encode_data(tokenizer, test_data)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# K-fold cross-validation
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []

max_epochs = 10     # Maximum number of training epochs, can be adjusted
patience = 2        # Early Stopping patience value, stop training when validation loss doesn't improve

# for train_idx, val_idx in kf.split(train_data):
for train_idx, val_idx in kf.split(train_data, train_data['class']):

    # 1) Reinitialize the model at the start of each fold using ModernBERT
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3
    ).to(device)
    
    # Prepare data for this fold
    fold_train_data = train_data.iloc[train_idx]
    fold_val_data = train_data.iloc[val_idx]
    
    train_inputs, train_masks, train_labels = encode_data(tokenizer, fold_train_data)
    val_inputs, val_masks, val_labels = encode_data(tokenizer, fold_val_data)
    
    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader = DataLoader(train_dataset, batch_size=16)
    val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    val_dataloader = DataLoader(val_dataset, batch_size=16)
    
    # 2) Define optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_dataloader) * max_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    
    # Early Stopping variables
    best_val_loss = float('inf')
    wait = 0
    best_model_weights = None
    
    # Training with Early Stopping
    for epoch in range(max_epochs):
        model.train()
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            
            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            # 3) Perform scheduler step after each parameter update
            scheduler.step()
        
        # Calculate current epoch validation loss
        model.eval()
        val_loss = 0.0
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                val_loss += outputs.loss.item()
        val_loss /= len(val_dataloader)
        
        # Determine whether to update the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            wait = 0
            best_model_weights = copy.deepcopy(model.state_dict())
        else:
            wait += 1
        
        # Stop training if no improvement for patience epochs
        if wait >= patience:
            break
    
    # Load the best model weights from validation performance
    if best_model_weights is not None:
        model.load_state_dict(best_model_weights)
    
    # Evaluate on validation set
    cv_results.append(evaluate(model, val_dataloader, device))

# Final evaluation on test set
test_results = evaluate(model, test_dataloader, device)

# Print results
print_results(cv_results, test_results)

## 2.4 RoBERTa

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold # Changed KFold to StratifiedKFold for consistency
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import copy  # For saving the best model weights

# Load and prepare data
data_path = '../filtered-labeled-data/labeled_data.csv'
data = pd.read_csv(data_path)
filtered_data = data.dropna(subset=['class'])

# Map original labels to 0, 1, 2
label_mapping = {-1: 0, 1: 1, 9: 2}
filtered_data['class'] = filtered_data['class'].map(label_mapping)

# Split into training and test sets (80-20 split)
train_data, test_data = train_test_split(
    filtered_data, test_size=0.2, random_state=42, stratify=filtered_data['class']
)

def encode_data(tokenizer, data):
    tokens = tokenizer.batch_encode_plus(
        data['text'].tolist(),
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    return tokens['input_ids'], tokens['attention_mask'], torch.tensor(data['class'].values)

def evaluate(model, dataloader, device):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(b_input_ids, attention_mask=b_input_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(b_labels.cpu().numpy())

    return (
        accuracy_score(all_labels, all_predictions),
        precision_score(all_labels, all_predictions, average='weighted', zero_division=0), # Added zero_division
        recall_score(all_labels, all_predictions, average='weighted', zero_division=0),  # Added zero_division
        f1_score(all_labels, all_predictions, average='weighted', zero_division=0)       # Added zero_division
    )

def print_results(cv_results, test_results):
    print("\nRoBERTa Model Results:") # Changed model name
    print("Cross-validation Results:")
    # Calculate mean of F1 scores from CV, ensuring cv_results is not empty
    mean_f1_cv = np.mean([x[3] for x in cv_results]) if cv_results else float('nan')
    print(f"  Mean F1 Score (CV): {mean_f1_cv:.4f}")
    # print(f"  Best Score (F1): {np.mean([x[3] for x in cv_results]):.4f}") # Original line, if preferred
    print("  Cross-validation Performance Metrics (Mean):")
    mean_acc_cv = np.mean([x[0] for x in cv_results]) if cv_results else float('nan')
    mean_prec_cv = np.mean([x[1] for x in cv_results]) if cv_results else float('nan')
    mean_rec_cv = np.mean([x[2] for x in cv_results]) if cv_results else float('nan')
    print(f"    Accuracy: {mean_acc_cv:.4f}")
    print(f"    Precision: {mean_prec_cv:.4f}")
    print(f"    Recall: {mean_rec_cv:.4f}")
    print(f"    F1 Score: {mean_f1_cv:.4f}") # Re-iterating mean F1 for clarity

    print("\nTest Set Performance:")
    print(f"    Accuracy: {test_results[0]:.4f}")
    print(f"    Precision: {test_results[1]:.4f}")
    print(f"    Recall: {test_results[2]:.4f}")
    print(f"    F1 Score: {test_results[3]:.4f}")

# Initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Use RoBERTa tokenizer and model
model_name = "roberta-base"  # Changed to RoBERTa
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare test data
test_inputs, test_masks, test_labels = encode_data(tokenizer, test_data)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16) # Consistent batch size

# K-fold cross-validation (using StratifiedKFold for consistency)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = [] # Stores (accuracy, precision, recall, f1) for each fold's validation

max_epochs = 10     # Maximum number of training epochs, can be adjusted
patience = 2        # Early Stopping patience value, stop training when validation loss doesn't improve
learning_rate = 2e-5 # Standard learning rate

# To store the best model across all folds based on validation F1 score
best_fold_model_state = None
best_fold_f1_score = -1.0
fold_count = 0

for train_idx, val_idx in kf.split(train_data, train_data['class']): # Ensure stratification for CV splits
    fold_count += 1
    print(f"\n--- Fold {fold_count}/5 ---")
    # 1) Reinitialize the model at the start of each fold using RoBERTa
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3  # Assuming 3 classes based on label_mapping
    ).to(device)

    # Prepare data for this fold
    fold_train_data = train_data.iloc[train_idx]
    fold_val_data = train_data.iloc[val_idx]

    train_inputs, train_masks, train_labels = encode_data(tokenizer, fold_train_data)
    val_inputs, val_masks, val_labels = encode_data(tokenizer, fold_val_data)

    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True) # Added shuffle=True for training
    val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    val_dataloader = DataLoader(val_dataset, batch_size=16) # Consistent batch size

    # 2) Define optimizer and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * max_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0, # Can be adjusted, e.g., 0.1 * total_steps
        num_training_steps=total_steps
    )

    # Early Stopping variables for the current fold
    best_val_loss_fold = float('inf')
    wait_fold = 0
    best_model_weights_fold = None

    # Training with Early Stopping for the current fold
    for epoch in range(max_epochs):
        model.train()
        total_train_loss = 0
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item() # Accumulate training loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader) # Average training loss

        # Calculate current epoch validation loss and metrics
        model.eval()
        total_val_loss = 0.0
        val_preds_fold = []
        val_labels_fold = []
        with torch.no_grad():
            for batch in val_dataloader:
                batch_device = tuple(t.to(device) for t in batch)
                b_input_ids, b_input_mask, b_labels = batch_device
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                total_val_loss += outputs.loss.item()
                logits = outputs.logits
                val_preds_fold.extend(torch.argmax(logits, dim=-1).cpu().numpy())
                val_labels_fold.extend(b_labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_dataloader) # Average validation loss
        val_f1_fold = f1_score(val_labels_fold, val_preds_fold, average='weighted', zero_division=0)

        print(f"Epoch {epoch+1}/{max_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val F1: {val_f1_fold:.4f}")

        # Determine whether to update the best model for this fold
        if avg_val_loss < best_val_loss_fold: # Using val_loss for early stopping criterion
            best_val_loss_fold = avg_val_loss
            wait_fold = 0
            best_model_weights_fold = copy.deepcopy(model.state_dict())
            print(f"Validation loss improved for fold {fold_count}. Saving model for epoch {epoch+1}")
        else:
            wait_fold += 1
            print(f"Validation loss did not improve for {wait_fold} epoch(s) for fold {fold_count}.")

        # Stop training if no improvement for patience epochs
        if wait_fold >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs for fold {fold_count}.")
            break

    # Load the best model weights for this fold based on validation performance
    if best_model_weights_fold is not None:
        model.load_state_dict(best_model_weights_fold)
        print(f"Loaded best model weights for fold {fold_count} based on validation performance.")

    # Evaluate on validation set for this fold (using the best model for this fold)
    fold_val_metrics = evaluate(model, val_dataloader, device) # (acc, prec, rec, f1)
    cv_results.append(fold_val_metrics)
    print(f"Fold {fold_count} Validation Metrics: Acc: {fold_val_metrics[0]:.4f}, Prec: {fold_val_metrics[1]:.4f}, Rec: {fold_val_metrics[2]:.4f}, F1: {fold_val_metrics[3]:.4f}")

    # Check if this fold's model is the best overall based on validation F1
    if fold_val_metrics[3] > best_fold_f1_score:
        best_fold_f1_score = fold_val_metrics[3]
        best_fold_model_state = copy.deepcopy(model.state_dict())
        print(f"*** New best model found in fold {fold_count} with Val F1: {best_fold_f1_score:.4f} ***")


# After all folds, load the best model state (from the fold with the highest validation F1) for final test set evaluation.
if best_fold_model_state is not None:
    print(f"\nLoading the best model overall (from fold with Val F1: {best_fold_f1_score:.4f}) for final test evaluation...")
    model.load_state_dict(best_fold_model_state)
else:
    print("\nNo best model state saved (e.g., if training was very short or had issues). Using model from the last fold for testing.")
    # The 'model' variable will hold the model from the last completed fold.

print("\nEvaluating on the test set using the best model from cross-validation...")
# Final evaluation on test set
test_results_tuple = evaluate(model, test_dataloader, device) # Returns a tuple

# Print results
print_results(cv_results, test_results_tuple) # cv_results is a list of tuples, test_results_tuple is a single tuple


## 2.5 DeBERTa

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import copy  # For saving the best model weights

# Load and prepare data
data_path = '../filtered-labeled-data/labeled_data.csv'
data = pd.read_csv(data_path)
filtered_data = data.dropna(subset=['class'])

# Map original labels to 0, 1, 2
label_mapping = {-1: 0, 1: 1, 9: 2}
filtered_data['class'] = filtered_data['class'].map(label_mapping)

# Split into training and test sets (80-20 split)
train_data, test_data = train_test_split(
    filtered_data, test_size=0.2, random_state=42, stratify=filtered_data['class']
)

def encode_data(tokenizer, data):
    tokens = tokenizer.batch_encode_plus(
        data['text'].tolist(),
        max_length=256, # DeBERTa can handle longer, but 256 is often a good balance
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    return tokens['input_ids'], tokens['attention_mask'], torch.tensor(data['class'].values)

def evaluate(model, dataloader, device):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(b_input_ids, attention_mask=b_input_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(b_labels.cpu().numpy())

    return (
        accuracy_score(all_labels, all_predictions),
        precision_score(all_labels, all_predictions, average='weighted', zero_division=0),
        recall_score(all_labels, all_predictions, average='weighted', zero_division=0),
        f1_score(all_labels, all_predictions, average='weighted', zero_division=0)
    )

def print_results(model_display_name, cv_results, test_results_tuple):
    print(f"\n{model_display_name} Model Results:")
    print("Cross-validation Results:")
    mean_f1_cv = np.mean([x[3] for x in cv_results]) if cv_results else float('nan')
    print(f"  Mean F1 Score (CV): {mean_f1_cv:.4f}")
    print("  Cross-validation Performance Metrics (Mean):")
    mean_acc_cv = np.mean([x[0] for x in cv_results]) if cv_results else float('nan')
    mean_prec_cv = np.mean([x[1] for x in cv_results]) if cv_results else float('nan')
    mean_rec_cv = np.mean([x[2] for x in cv_results]) if cv_results else float('nan')
    print(f"    Accuracy: {mean_acc_cv:.4f}")
    print(f"    Precision: {mean_prec_cv:.4f}")
    print(f"    Recall: {mean_rec_cv:.4f}")
    print(f"    F1 Score: {mean_f1_cv:.4f}")

    print("\nTest Set Performance:")
    print(f"    Accuracy: {test_results_tuple[0]:.4f}")
    print(f"    Precision: {test_results_tuple[1]:.4f}")
    print(f"    Recall: {test_results_tuple[2]:.4f}")
    print(f"    F1 Score: {test_results_tuple[3]:.4f}")

# Initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# --- DeBERTa Model Configuration ---
model_name = "microsoft/deberta-v3-base"
model_display_name = "DeBERTa-v3-base"
# You could also try other variants like "microsoft/deberta-base"
# model_name = "microsoft/deberta-base"
# model_display_name = "DeBERTa-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare test data
test_inputs, test_masks, test_labels = encode_data(tokenizer, test_data)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# K-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []

max_epochs = 10
patience = 2
# DeBERTa models, especially v3, often benefit from a slightly lower learning rate.
learning_rate = 1e-5  # Try 1e-5 for DeBERTa-v3
# learning_rate = 2e-5 # Standard LR, could also work

best_fold_model_state = None
best_fold_f1_score = -1.0
fold_count = 0

for train_idx, val_idx in kf.split(train_data, train_data['class']):
    fold_count += 1
    print(f"\n--- Fold {fold_count}/5 ({model_display_name}) ---")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3
    ).to(device)

    fold_train_data = train_data.iloc[train_idx]
    fold_val_data = train_data.iloc[val_idx]

    train_inputs, train_masks, train_labels = encode_data(tokenizer, fold_train_data)
    val_inputs, val_masks, val_labels = encode_data(tokenizer, fold_val_data)

    train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    val_dataloader = DataLoader(val_dataset, batch_size=16)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * max_epochs
    # Using a small portion of total steps for warmup is common
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps), # e.g., 10% of total steps for warmup
        num_training_steps=total_steps
    )

    best_val_loss_fold = float('inf')
    wait_fold = 0
    best_model_weights_fold = None

    for epoch in range(max_epochs):
        model.train()
        total_train_loss = 0
        for batch in train_dataloader:
            batch_to_device = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch_to_device
            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_train_loss / len(train_dataloader)

        model.eval()
        total_val_loss = 0.0
        val_preds_fold = []
        val_labels_fold = []
        with torch.no_grad():
            for batch in val_dataloader:
                batch_to_device = tuple(t.to(device) for t in batch)
                b_input_ids, b_input_mask, b_labels = batch_to_device
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                total_val_loss += outputs.loss.item()
                logits = outputs.logits
                val_preds_fold.extend(torch.argmax(logits, dim=-1).cpu().numpy())
                val_labels_fold.extend(b_labels.cpu().numpy())
        avg_val_loss = total_val_loss / len(val_dataloader)
        val_f1_fold = f1_score(val_labels_fold, val_preds_fold, average='weighted', zero_division=0)

        print(f"Epoch {epoch+1}/{max_epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val F1: {val_f1_fold:.4f}")

        if avg_val_loss < best_val_loss_fold: # Early stopping based on validation loss
            best_val_loss_fold = avg_val_loss
            wait_fold = 0
            best_model_weights_fold = copy.deepcopy(model.state_dict())
            print(f"Validation loss improved for fold {fold_count}. Saving model for epoch {epoch+1}")
        else:
            wait_fold += 1
            print(f"Validation loss did not improve for {wait_fold} epoch(s) for fold {fold_count}.")

        if wait_fold >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs for fold {fold_count}.")
            break
            
    if best_model_weights_fold is not None:
        model.load_state_dict(best_model_weights_fold)
        print(f"Loaded best model weights for fold {fold_count} based on validation performance.")

    fold_val_metrics = evaluate(model, val_dataloader, device) # (acc, prec, rec, f1)
    cv_results.append(fold_val_metrics)
    print(f"Fold {fold_count} Validation Metrics: Acc: {fold_val_metrics[0]:.4f}, Prec: {fold_val_metrics[1]:.4f}, Rec: {fold_val_metrics[2]:.4f}, F1: {fold_val_metrics[3]:.4f}")

    # Check if this fold's model is the best overall based on validation F1
    if fold_val_metrics[3] > best_fold_f1_score:
        best_fold_f1_score = fold_val_metrics[3]
        best_fold_model_state = copy.deepcopy(model.state_dict())
        print(f"*** New best model found in fold {fold_count} with Val F1: {best_fold_f1_score:.4f} ***")

# After all folds, load the best model state for final test set evaluation.
if best_fold_model_state is not None:
    print(f"\nLoading the best {model_display_name} model overall (from fold with Val F1: {best_fold_f1_score:.4f}) for final test evaluation...")
    model.load_state_dict(best_fold_model_state)
else:
    print(f"\nNo best {model_display_name} model state saved (e.g., if training was too short or had issues). Using model from the last fold for testing.")

print(f"\nEvaluating on the test set using the best {model_display_name} model from cross-validation...")
test_results_tuple = evaluate(model, test_dataloader, device)

# Print final results
print_results(model_display_name, cv_results, test_results_tuple)

## 2.6 Classifiers Plot

```python
BERT Model Results:
Cross-validation Results:
  Best Score (F1): 0.8760
  Cross-validation Performance Metrics:
    Accuracy: 0.8765
    Precision: 0.8790
    Recall: 0.8765
    F1 Score: 0.8760

Test Set Performance:
    Accuracy: 0.8523
    Precision: 0.8579
    Recall: 0.8523
    F1 Score: 0.8525

RoBERTa Model Results:
Cross-validation Results:
  Mean F1 Score (CV): 0.8793
  Cross-validation Performance Metrics (Mean):
    Accuracy: 0.8789
    Precision: 0.8830
    Recall: 0.8789
    F1 Score: 0.8793

Test Set Performance:
    Accuracy: 0.8646
    Precision: 0.8666
    Recall: 0.8646
    F1 Score: 0.8644

DeBERTa-v3-base Model Results:
Cross-validation Results:
  Mean F1 Score (CV): 0.8873
  Cross-validation Performance Metrics (Mean):
    Accuracy: 0.8868
    Precision: 0.8892
    Recall: 0.8855
    F1 Score: 0.8873

Test Set Performance:
    Accuracy: 0.8705
    Precision: 0.8715
    Recall: 0.8700
    F1 Score: 0.8709

ModernBERT Model Results:
Cross-validation Results:
  Best Score (F1): 0.8391
  Cross-validation Performance Metrics:
    Accuracy: 0.8418
    Precision: 0.8456
    Recall: 0.8418
    F1 Score: 0.8391

Test Set Performance:
    Accuracy: 0.8492
    Precision: 0.8508
    Recall: 0.8492
    F1 Score: 0.8486
```

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib.ticker as ticker
# Set global font properties
font_props = FontProperties(family='Times New Roman', weight='bold')
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.weight'] = 'bold'
plt.rcParams['font.size'] = 24

# Create DataFrames for each metric
precision_data = pd.DataFrame({
    'Model': ['BERT', 'RoBERTa', 'DeBERTa', 'ModernBERT'],
    'CV': [0.8790, 0.8830, 0.8892, 0.8456],
    'Test': [0.8579, 0.8666, 0.8715, 0.8508]
})

recall_data = pd.DataFrame({
    'Model': ['BERT', 'RoBERTa', 'DeBERTa', 'ModernBERT'],
    'CV': [0.8765, 0.8789, 0.8855, 0.8418],
    'Test': [0.8523, 0.8646, 0.8700, 0.8492]
})

f1_data = pd.DataFrame({
    'Model': ['BERT', 'RoBERTa', 'DeBERTa', 'ModernBERT'],
    'CV': [0.8760, 0.8793, 0.8873, 0.8391],
    'Test': [0.8525, 0.8644, 0.8709, 0.8486]
})

accuracy_data = pd.DataFrame({
    'Model': ['BERT', 'RoBERTa', 'DeBERTa', 'ModernBERT'],
    'CV': [0.8765, 0.8789, 0.8868, 0.8418],
    'Test': [0.8523, 0.8646, 0.8705, 0.8492]
})

# Create figure with 2x2 subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
plt.subplots_adjust(hspace=0.3, wspace=0.3)

def create_heatmap(data, ax, title, vmin=0.81, vmax=0.88):  # Adjusted vmax to accommodate BERT's higher values
    # Create the heatmap
    heatmap = sns.heatmap(data.set_index('Model'), annot=True, cmap='Blues', 
                         fmt='.3f', vmin=vmin, vmax=vmax, ax=ax, cbar=True,
                         annot_kws={'size': 24, 'font': font_props})
    
    # Format colorbar
    colorbar = heatmap.collections[0].colorbar
    colorbar.set_ticks(ticker.LinearLocator(5))
    colorbar.set_ticklabels([f'{x:.3f}' for x in colorbar.get_ticks()])
    colorbar.ax.tick_params(labelsize=24)
    for label in colorbar.ax.get_yticklabels():
        label.set_font_properties(font_props)

    # Configure y-axis labels
    if ax in [axes[0,0], axes[1,0]]:
        ax.set_ylabel('', fontsize=24, font=font_props)
        ax.tick_params(axis='y', which='major', labelsize=24)
        labels = ax.get_yticklabels()
        ax.set_yticklabels(labels, rotation=0, font=font_props)
    else:
        ax.set_ylabel('')
        ax.set_yticklabels([])

    # Configure x-axis labels
    ax.tick_params(axis='x', which='major', labelsize=24)
    for label in ax.get_xticklabels():
        label.set_font_properties(font_props)

    # Set title
    ax.set_title(title, pad=20, font=font_props)

# Create heatmaps
create_heatmap(precision_data, axes[0,0], 'a. Precision')
create_heatmap(recall_data, axes[0,1], 'b. Recall')
create_heatmap(f1_data, axes[1,0], 'c. F1-Score')
create_heatmap(accuracy_data, axes[1,1], 'd. Accuracy')


plt.tight_layout()
plt.savefig('classifiers.pdf', format='pdf', bbox_inches='tight', dpi=300)
plt.show()

## 2.7 Classification

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import copy  # For saving the best model weights


# --- Basic Configuration ---
MODEL_NAME = "microsoft/deberta-v3-base"
MODEL_DISPLAY_NAME = "DeBERTa-v3-base"
LABELED_DATA_PATH = '../filtered-labeled-data/labeled_data.csv'
UNLABELED_DATA_PATH = '../filtered-labeled-data/unlabeled_data.csv'
MERGED_OUTPUT_FILENAME = '../filtered-labeled-data/classed_data.csv' # Updated filename for clarity

BATCH_SIZE_TRAIN_VAL = 16
BATCH_SIZE_PREDICT = 32
MAX_LENGTH = 256
LEARNING_RATE = 1e-5
MAX_EPOCHS = 10
PATIENCE = 2
VALIDATION_SET_SIZE = 0.1
RANDOM_STATE = 42

# --- Device Setup ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# --- Tokenizer Initialization ---
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print(f"Tokenizer ({MODEL_NAME}) initialized successfully.")
except Exception as e:
    print(f"Error: Failed to initialize Tokenizer: {e}")
    raise

# --- 1. Load and Preprocess Labeled Data (to be used for training and validation) ---
print(f"Loading labeled data from: {LABELED_DATA_PATH}")
try:
    labeled_data_full_original_cols = pd.read_csv(LABELED_DATA_PATH)
    print(f"Labeled data loaded successfully. Contains {len(labeled_data_full_original_cols)} records.")
except FileNotFoundError:
    print(f"Error: Labeled data file not found at {LABELED_DATA_PATH}")
    raise

labeled_data_for_training = labeled_data_full_original_cols.copy()
labeled_data_for_training.dropna(subset=['class'], inplace=True)
label_mapping = {-1: 0, 1: 1, 9: 2}
labeled_data_for_training['class'] = labeled_data_for_training['class'].map(label_mapping)
labeled_data_for_training.dropna(subset=['class'], inplace=True)
labeled_data_for_training['class'] = labeled_data_for_training['class'].astype(int)
print(f"Number of labeled records for training after preprocessing: {len(labeled_data_for_training)}")
if 'text' not in labeled_data_for_training.columns:
    print("Error: 'text' column not found in labeled data.")
    raise ValueError("'text' column is required in labeled data.")
labeled_data_for_training.dropna(subset=['text'], inplace=True)
print(f"Number of labeled records for training after removing NaN values in 'text' column: {len(labeled_data_for_training)}")

# --- 2. Load Unlabeled Data (to be used for prediction) ---
print(f"Loading unlabeled data from: {UNLABELED_DATA_PATH}")
try:
    unlabeled_df = pd.read_csv(UNLABELED_DATA_PATH)
    print(f"Unlabeled data loaded successfully. Contains {len(unlabeled_df)} records.")
except FileNotFoundError:
    print(f"Error: Unlabeled data file not found at {UNLABELED_DATA_PATH}")
    raise
if 'text' not in unlabeled_df.columns:
    print("Error: 'text' column not found in unlabeled data.")
    raise ValueError("'text' column not found in unlabeled data.")
unlabeled_df.dropna(subset=['text'], inplace=True)
original_unlabeled_count = len(unlabeled_df)
unlabeled_df_original_cols = unlabeled_df.copy() 
print(f"Unlabeled data after cleaning (removing NaN values in 'text' column) has {original_unlabeled_count} records.")

# --- Helper Function: Encode Data ---
def encode_data_with_labels(tokenizer_obj, data_df, max_len=MAX_LENGTH):
    print(f"Starting to encode {len(data_df)} texts for labeled data...")
    tokens = tokenizer_obj.batch_encode_plus(
        data_df['text'].tolist(), max_length=max_len, padding='max_length', truncation=True, return_tensors="pt"
    )
    print("Text encoding completed.")
    return tokens['input_ids'], tokens['attention_mask'], torch.tensor(data_df['class'].values)

def encode_data_for_prediction(tokenizer_obj, texts_list, max_len=MAX_LENGTH):
    print(f"Starting to encode {len(texts_list)} texts for prediction...")
    encoded_output = tokenizer_obj.batch_encode_plus(
        texts_list, max_length=max_len, padding='max_length', truncation=True, return_tensors="pt"
    )
    print("Text encoding completed.")
    return encoded_output['input_ids'], encoded_output['attention_mask']

# --- 3. Prepare Data Loaders (Dataloaders) ---
if len(labeled_data_for_training) > 0:
    final_train_df, final_val_df = train_test_split(
        labeled_data_for_training, test_size=VALIDATION_SET_SIZE, random_state=RANDOM_STATE, stratify=labeled_data_for_training['class']
    )
    print(f"Labeled data split into: {len(final_train_df)} training data, {len(final_val_df)} validation data.")
    train_inputs, train_masks, train_labels = encode_data_with_labels(tokenizer, final_train_df)
    val_inputs, val_masks, val_labels = encode_data_with_labels(tokenizer, final_val_df)
    final_train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
    final_train_dataloader = DataLoader(final_train_dataset, batch_size=BATCH_SIZE_TRAIN_VAL, shuffle=True)
    final_val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
    final_val_dataloader = DataLoader(final_val_dataset, batch_size=BATCH_SIZE_TRAIN_VAL)
else:
    print("Error: No valid labeled data available for training.")
    raise ValueError("No labeled data available.")

# --- 4. Train Model ---
print(f"\n--- Training final model ({MODEL_DISPLAY_NAME}) using labeled data ---")
try:
    final_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label_mapping)).to(device)
    print(f"Model ({MODEL_NAME}) initialized successfully.")
except Exception as e:
    print(f"Error: Failed to initialize model: {e}")
    raise
final_optimizer = AdamW(final_model.parameters(), lr=LEARNING_RATE)
final_total_steps = len(final_train_dataloader) * MAX_EPOCHS
final_scheduler = get_linear_schedule_with_warmup(
    final_optimizer, num_warmup_steps=int(0.1 * final_total_steps), num_training_steps=final_total_steps
)
best_val_loss_final = float('inf')
wait_final = 0
best_model_weights_final = None
for epoch in range(MAX_EPOCHS):
    final_model.train()
    total_train_loss_final = 0
    for batch in final_train_dataloader:
        batch_to_device = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch_to_device
        final_model.zero_grad()
        outputs = final_model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss_final += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(final_model.parameters(), 1.0)
        final_optimizer.step()
        final_scheduler.step()
    avg_train_loss_final = total_train_loss_final / len(final_train_dataloader)
    final_model.eval()
    total_val_loss_final = 0.0
    val_preds_final = []
    val_labels_final_list = []
    with torch.no_grad():
        for batch in final_val_dataloader:
            batch_to_device = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch_to_device
            outputs = final_model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            total_val_loss_final += outputs.loss.item()
            logits = outputs.logits
            val_preds_final.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            val_labels_final_list.extend(b_labels.cpu().numpy())
    avg_val_loss_final = total_val_loss_final / len(final_val_dataloader)
    val_f1_final = f1_score(val_labels_final_list, val_preds_final, average='weighted', zero_division=0)
    val_accuracy_final = accuracy_score(val_labels_final_list, val_preds_final)
    print(f"Epoch {epoch+1}/{MAX_EPOCHS} | Training Loss: {avg_train_loss_final:.4f} | Validation Loss: {avg_val_loss_final:.4f} | Validation F1: {val_f1_final:.4f} | Accuracy: {val_accuracy_final:.4f}")
    if avg_val_loss_final < best_val_loss_final:
        best_val_loss_final = avg_val_loss_final
        wait_final = 0
        best_model_weights_final = copy.deepcopy(final_model.state_dict())
        print(f"Validation loss improved. Saving model for epoch {epoch+1}")
    else:
        wait_final += 1
        print(f"Validation loss did not improve for {wait_final} epochs.")
    if wait_final >= PATIENCE:
        print(f"Early stopping triggered for final model training after {epoch+1} epochs.")
        break
if best_model_weights_final:
    final_model.load_state_dict(best_model_weights_final)
    print("Loaded best weights for final model based on validation performance.")
else:
    print("Warning: Validation loss did not improve, using model state from last epoch (or initial).")

# --- 5. Predict for Unlabeled Data ---
predicted_unlabeled_df = None
if original_unlabeled_count > 0:
    unlabeled_texts_for_prediction = unlabeled_df_original_cols['text'].tolist()
    pred_inputs, pred_masks = encode_data_for_prediction(tokenizer, unlabeled_texts_for_prediction)
    pred_dataset = TensorDataset(pred_inputs, pred_masks)
    pred_dataloader = DataLoader(pred_dataset, batch_size=BATCH_SIZE_PREDICT)
    print(f"\n--- Predicting classes for {original_unlabeled_count} unlabeled data using {MODEL_DISPLAY_NAME} model ---")
    final_model.eval()
    all_unlabeled_predictions = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(pred_dataloader):
            b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)
            outputs = final_model(b_input_ids, attention_mask=b_input_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_unlabeled_predictions.extend(predictions.cpu().numpy())
            if (batch_idx + 1) % 100 == 0:
                 processed_count = (batch_idx + 1) * pred_dataloader.batch_size
                 print(f"Predicted {min(processed_count, original_unlabeled_count)} / {original_unlabeled_count} unlabeled data...")
    print(f"Prediction completed for all {original_unlabeled_count} unlabeled data.")
    
    predicted_unlabeled_df = unlabeled_df_original_cols.copy()
    predicted_unlabeled_df['class'] = all_unlabeled_predictions 

    print("\nDistribution of predicted classes in unlabeled data (0, 1, 2) stored in 'class' column:")
    print("Percentage distribution:")
    print(predicted_unlabeled_df['class'].value_counts(normalize=True).sort_index())
    print("Count distribution:")
    print(predicted_unlabeled_df['class'].value_counts().sort_index())
else:
    print("No valid unlabeled data available for prediction.")

# --- 6. Prepare and Merge Data ---
labeled_data_final_merge = labeled_data_full_original_cols.copy()
if 'class' in labeled_data_final_merge.columns:
    # IMPORTANT: We use the original 'class' column from the CSV for labeled data before mapping for training
    # This 'class' column (e.g. -1, 1, 9) from the original file will be used directly later for reverse mapping.
    # For the purpose of merging, we need to ensure it's consistent if we were to do any operations on it *before* reverse mapping.
    # However, since the request is to map *back* at the end, we can keep its original form (or a copy of it)
    # and apply the label_mapping ({-1:0, 1:1, 9:2}) only for training, and then the reverse_label_mapping before saving.

    # For merging, we will first get the 0,1,2 mapping for labeled data to be consistent during concat if needed.
    # But the *final* output will use reverse mapping.
    
    # Let's create a temporary 'class_mapped_for_merge' for labeled data
    # The original 'class' column from labeled_data_full_original_cols is kept for reverse mapping later.
    temp_labeled_for_merge = labeled_data_full_original_cols.copy()
    temp_labeled_for_merge.dropna(subset=['class'], inplace=True) # Original class values
    temp_labeled_for_merge['class_numeric'] = temp_labeled_for_merge['class'].map(label_mapping) # Mapped to 0,1,2
    temp_labeled_for_merge.dropna(subset=['class_numeric'], inplace=True)
    if not temp_labeled_for_merge.empty:
         temp_labeled_for_merge['class_numeric'] = temp_labeled_for_merge['class_numeric'].astype(int)
    temp_labeled_for_merge['data_source'] = 'labeled_data'
    # `temp_labeled_for_merge` now has 'class' (original e.g. -1,1,9) and 'class_numeric' (0,1,2)

else:
    print("Warning: 'class' column missing in original labeled data. It cannot be processed for merging or reverse mapping.")
    temp_labeled_for_merge = pd.DataFrame({'data_source': ['labeled_data'] * len(labeled_data_full_original_cols)}) # create a placeholder
    # Add other necessary columns if they exist in labeled_data_full_original_cols and are expected in merge
    for col in labeled_data_full_original_cols.columns:
        if col not in temp_labeled_for_merge.columns:
            temp_labeled_for_merge[col] = labeled_data_full_original_cols[col]
    temp_labeled_for_merge['class'] = np.nan # Ensure 'class' column exists for merging, will be NaN
    temp_labeled_for_merge['class_numeric'] = np.nan


merged_df = pd.DataFrame() 

if predicted_unlabeled_df is not None:
    unlabeled_data_final_merge = predicted_unlabeled_df.copy() 
    # 'class' column in unlabeled_data_final_merge contains predictions (0,1,2)
    unlabeled_data_final_merge.rename(columns={'class': 'class_numeric'}, inplace=True) # rename to 'class_numeric' for consistency
    unlabeled_data_final_merge['data_source'] = 'unlabeled_predicted'
    # We will add an empty 'class' column here that will be filled by reverse mapping later
    unlabeled_data_final_merge['class'] = np.nan 

    print(f"\nPreparing to merge labeled data and predicted unlabeled data.")
    # Select relevant columns for labeled data to merge: original columns + 'class_numeric' + 'data_source'
    # Keep original 'class' from temp_labeled_for_merge as it is (e.g. -1, 1, 9)
    cols_to_merge_labeled = list(labeled_data_full_original_cols.columns) + ['class_numeric', 'data_source']
    # Ensure 'class_numeric' is in temp_labeled_for_merge (it should be if 'class' was present)
    if 'class_numeric' not in temp_labeled_for_merge.columns and 'class' in temp_labeled_for_merge.columns: # handle case where 'class' existed but mapping failed
        temp_labeled_for_merge['class_numeric'] = np.nan


    merged_df = pd.concat([
        temp_labeled_for_merge, # Has original 'class' and 'class_numeric'
        unlabeled_data_final_merge # Has 'class_numeric' (predictions) and NaN 'class'
    ], ignore_index=True, sort=False)
    print(f"Data merging completed. Total records after merging: {len(merged_df)}")

elif not temp_labeled_for_merge.empty and 'class_numeric' in temp_labeled_for_merge.columns and not temp_labeled_for_merge['class_numeric'].isna().all():
    print("Only labeled data available (with valid classes), no unlabeled data predicted. Merged file will contain only processed labeled data.")
    merged_df = temp_labeled_for_merge.copy() 
else:
    print("Warning: No labeled data with valid classes and no predicted unlabeled data to merge and save. Output file will not be generated.")

# --- 7. Reverse Map and Save Merged Data ---
if not merged_df.empty:
    # Define the reverse mapping
    reverse_label_mapping = {v: k for k, v in label_mapping.items()} # {0: -1, 1: 1, 2: 9}
    
    print("\nReverse mapping classes back to original labels (-1, 1, 9)...")

    # For labeled data, the 'class' column already holds original values (-1, 1, 9) or NaN.
    # No reverse mapping needed for the 'class' column of labeled data.
    # Ensure it's integer if not NaN.
    labeled_mask = merged_df['data_source'] == 'labeled_data'
    if 'class' in merged_df.columns:
        # Convert to float first to handle potential NaNs, then to Int64 (pandas nullable integer)
        merged_df.loc[labeled_mask, 'class'] = pd.to_numeric(merged_df.loc[labeled_mask, 'class'], errors='coerce').astype('Int64')


    # For unlabeled_predicted data, 'class_numeric' holds 0,1,2. Map these to -1,1,9 in the 'class' column.
    unlabeled_mask = merged_df['data_source'] == 'unlabeled_predicted'
    if 'class_numeric' in merged_df.columns:
        merged_df.loc[unlabeled_mask, 'class'] = merged_df.loc[unlabeled_mask, 'class_numeric'].map(reverse_label_mapping)
        # Convert to float first to handle potential NaNs from mapping, then to Int64
        merged_df.loc[unlabeled_mask, 'class'] = pd.to_numeric(merged_df.loc[unlabeled_mask, 'class'], errors='coerce').astype('Int64')
    
    # Drop the temporary 'class_numeric' column as it's no longer needed
    if 'class_numeric' in merged_df.columns:
        merged_df.drop(columns=['class_numeric'], inplace=True)

    print("Reverse mapping completed. The 'class' column now contains original labels or NaNs.")

    try:
        merged_df.to_csv(MERGED_OUTPUT_FILENAME, index=False)
        print(f"\nMerged data with original class labels saved to {MERGED_OUTPUT_FILENAME}")
        print("\nDistribution of 'data_source' in merged data:")
        print(merged_df['data_source'].value_counts())

        if 'class' in merged_df.columns:
            print("\nDistribution of 'class' (original labels) in labeled_data portion of merged data:")
            if not merged_df[merged_df['data_source'] == 'labeled_data'].empty:
                print(merged_df[merged_df['data_source'] == 'labeled_data']['class'].value_counts(dropna=False).sort_index())
            else:
                print("No labeled data in merged set.")

            print("\nDistribution of 'class' (original labels) in unlabeled_predicted portion of merged data:")
            if not merged_df[merged_df['data_source'] == 'unlabeled_predicted'].empty:
                print(merged_df[merged_df['data_source'] == 'unlabeled_predicted']['class'].value_counts(dropna=False).sort_index())
            else:
                print("No unlabeled_predicted data in merged set.")
                
            print("\nTotal non-null values in 'class' column of merged data (original labels):")
            print(merged_df['class'].notna().sum())
            print("\nOverall distribution of 'class' (original labels) in merged data:")
            print(merged_df['class'].value_counts(dropna=False).sort_index())
        else:
            print("\n'class' column not found in the final merged_df.")
            
    except Exception as e:
        print(f"Error: Failed to save merged CSV file: {e}")
else:
    print(f"File {MERGED_OUTPUT_FILENAME} not generated due to no data to process.")
