In [None]:
# CELL 1: SETUP AND INSTALLATION
import os

# 1. Force TensorFlow to use Legacy Keras (Critical for BERT)
os.environ['TF_USE_LEGACY_KERAS'] = '1'

# 2. Clone Repository
if not os.path.exists("BUS-stop"):
    print("Cloning Repository...")
    !git clone https://github.com/DMCB-GIST/BUS-stop.git
else:
    print("Repo already exists.")

# 3. Enter Directory
%cd /content/BUS-stop/bus-stop-keras

# 4. Install Dependencies
print("Installing Libraries...")
!pip install -q tensorflow tf_keras
!pip install -q transformers==4.44.0 scikit-learn pandas sentencepiece datasets

# 5. Fix Model Downloads (Force fresh download)
print("Downloading BERT Model files...")
!rm -rf params/bert_base
!mkdir -p params/bert_base
!wget -q -O params/bert_base/pytorch_model.bin https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin
!wget -q -O params/bert_base/config.json https://huggingface.co/bert-base-uncased/resolve/main/config.json
!wget -q -O params/bert_base/vocab.txt https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt

# 6. Generate Data Splits
if not os.path.exists("./data/SST-2/bal/0"):
    print("Generating Data Splits...")
    !python setup_experiments.py
else:
    print("Data already generated.")

print("SETUP COMPLETE! Please run Cell 2.")

In [None]:
# CELL 2: ADVANCED TRAINING ENGINE (Scheduler + Smooth Queue)
import os
import random
import numpy as np
import tensorflow as tf
import pandas as pd
import collections
from transformers import BertTokenizer, BertConfig, TFBertModel
from pt_modeler import ConstructPtModeler
from scipy.special import softmax

# 1. Data Loader
def load_local_data(task, data_path, tokenizer, max_len, val_ratio):
    """Load and tokenize the TSV files of one data split.

    Files are read from ``./data/{task}/{data_path}``: ``labeled.tsv``,
    ``unlabeled.tsv`` and ``test_with_gold.tsv`` (falling back to ``test.tsv``
    when the gold file does not exist).

    Args:
        task: Task directory name under ``./data``.
        data_path: Split sub-directory under the task directory.
        tokenizer: Callable invoked as ``tokenizer(sentences, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='np')``;
            the result is indexed with ``'input_ids'``, ``'token_type_ids'``
            and ``'attention_mask'``.
        max_len: Sequence length passed to the tokenizer as ``max_length``.
        val_ratio: Fraction of the labeled rows (taken from the end of the
            file, without shuffling) to hold out as validation data. When it
            is not positive, the test set is returned as the validation set.

    Returns:
        ``None`` when the labeled file is missing, unreadable, or has no label
        column. Otherwise a dict with keys ``x_train``, ``y_train``, ``x_val``,
        ``y_val``, ``x_test``, ``y_test`` and ``x_unlab``. Each ``x_*`` is a
        list ``[input_ids, token_type_ids, attention_mask]`` or ``None`` when
        the corresponding file could not be loaded.
    """
    base_path = f"./data/{task}/{data_path}"

    def process_file(filename, has_label=True):
        """Return ``(x, y)`` for one TSV file, or ``(None, None)`` if unavailable."""
        filepath = os.path.join(base_path, filename)
        if not os.path.exists(filepath) and "test_with_gold" in filename:
            filepath = os.path.join(base_path, "test.tsv")
        if not os.path.exists(filepath):
            return None, None

        try:
            df = pd.read_csv(filepath, sep='\t', header=0)
        except (OSError, ValueError) as err:
            # pandas' ParserError / EmptyDataError and UnicodeDecodeError are
            # all ValueError subclasses; anything else (e.g. KeyboardInterrupt)
            # is allowed to propagate instead of being silently swallowed.
            print(f"   Warning: could not read {filepath}: {err}")
            return None, None

        # Prefer a well-known text column name, otherwise use the first column.
        text_col = next(
            (c for c in ('sentence', 'text', 'review') if c in df.columns),
            df.columns[0],
        )

        # Prefer a well-known label column name, otherwise use the second column.
        label_col = None
        if has_label:
            label_col = next(
                (c for c in ('label', 'target') if c in df.columns), None
            )
            if label_col is None and len(df.columns) > 1:
                label_col = df.columns[1]

        sentences = df[text_col].astype(str).tolist()
        encodings = tokenizer(
            sentences,
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='np',
        )
        x = [encodings['input_ids'], encodings['token_type_ids'], encodings['attention_mask']]
        y = df[label_col].values if label_col is not None else None
        return x, y

    x_train, y_train = process_file("labeled.tsv", True)
    x_unlab, _ = process_file("unlabeled.tsv", False)
    x_test, y_test = process_file("test_with_gold.tsv", True)

    if x_train is None:
        return None
    if y_train is None:
        # A labeled file without a label column cannot be trained on; report it
        # the same way as a missing file rather than crashing on None later.
        print(f"   Warning: no label column found in {base_path}/labeled.tsv")
        return None

    if val_ratio > 0:
        split_at = int(len(x_train[0]) * (1 - val_ratio))
        return {
            'x_train': [arr[:split_at] for arr in x_train],
            'y_train': y_train[:split_at],
            'x_val': [arr[split_at:] for arr in x_train],
            'y_val': y_train[split_at:],
            'x_test': x_test,
            'y_test': y_test,
            'x_unlab': x_unlab,
        }

    # No hold-out requested: the test set doubles as the validation set.
    return {
        'x_train': x_train,
        'y_train': y_train,
        'x_val': x_test,
        'y_val': y_test,
        'x_test': x_test,
        'y_test': y_test,
        'x_unlab': x_unlab,
    }

# 2. Config Class
class Args:
    """Fixed hyper-parameters for one run, plus the three per-run settings."""

    def __init__(self, mode, val_ratio, seed):
        # Per-run settings supplied by the caller.
        self.mode = mode
        self.val_ratio = val_ratio
        self.seed = seed

        # Dataset location and pretrained checkpoint directory.
        self.task = 'SST-2'
        self.data_path = 'bal/0'
        self.pt_model_checkpoint = './params/bert_base/'

        # Model / input configuration.
        self.max_seq_length = 64
        self.class_num = 2
        self.drop_rate = 0.2
        self.word_freeze = False

        # Optimisation. The learning rate starts slightly higher because it is
        # decayed down during training.
        self.learning_rate = 3e-5
        self.epochs = 15
        self.batch_size = 32

        # Early-stopping patience and queue length.
        self.patience = 5
        self.n_que = 5

# 3. Training Engine
def train_engine(experiment_name, mode='combined', val_ratio=0.0, seed=42):
    """Train one classifier with a selectable early-stopping / checkpoint rule.

    Per epoch the model is trained on shuffled mini-batches, then two metrics
    are computed:

    * ``mode == 'standard'``: ``s_conf`` is the validation loss and ``s_class``
      is fixed at 0.0.
    * any other mode: predictions on the unlabeled set are soft-maxed;
      ``s_conf`` is the mean ``|max_prob - 0.9|`` and ``s_class`` is
      ``1 - ||mean_predicted_distribution - [0.5, 0.5]||``.

    Stopping: patience counts epochs in which the stop value (``s_conf``, or
    ``-s_class`` for ``mode == 'class'``) did not improve on its best.

    Checkpointing (only while patience has not run out):

    * ``'combined'``: save when ``s_class`` exceeds the mean of the last
      ``n_que`` values (the queue starts filled with zeros).
    * ``'conf'`` / ``'standard'``: save when ``s_conf`` equals the best so far.
    * ``'class'``: save when ``s_class`` exceeds its best so far.

    Args:
        experiment_name: Label used only in the start-up log line.
        mode: One of ``'combined'``, ``'conf'``, ``'class'``, ``'standard'``.
        val_ratio: Forwarded to the data loader as the validation hold-out ratio.
        seed: Seed applied to TensorFlow, NumPy and ``random``.

    Returns:
        ``(final_acc, avg_acc, history, model)`` where ``final_acc`` is the test
        accuracy after restoring the saved weights (if any), ``avg_acc`` is the
        mean per-epoch test accuracy, and ``history`` holds per-epoch lists
        under ``'accuracy'``, ``'stop_metric'`` and ``'save_metric'``.
        Returns ``(0.0, 0.0, {}, None)`` when the data could not be loaded.
    """
    print(f"\nSTARTING: {experiment_name} | Mode: {mode} | Seed: {seed}")

    args = Args(mode, val_ratio, seed)
    tf.random.set_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.pt_model_checkpoint)
    data = load_local_data(args.task, args.data_path, tokenizer, args.max_seq_length, args.val_ratio)
    if data is None:
        return 0.0, 0.0, {}, None

    # --- LR SCHEDULER ---
    # The batch loop below also trains on the final partial batch, so an epoch
    # takes ceil(n / batch_size) optimizer steps. Floor division under-counted
    # this (learning rate hit 0 before training finished) and produced
    # decay_steps == 0 when n < batch_size.
    steps_per_epoch = -(-len(data['y_train']) // args.batch_size)
    train_steps = max(1, steps_per_epoch * args.epochs)
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=args.learning_rate,
        decay_steps=train_steps,
        end_learning_rate=0.0,
        power=1.0
    )

    modeler = ConstructPtModeler(TFBertModel, BertConfig, args.pt_model_checkpoint, args.max_seq_length, args.class_num, args.learning_rate, args.drop_rate, args.word_freeze)
    model = modeler.build_model()

    # Use scheduler in optimizer
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    history = {'accuracy': [], 'stop_metric': [], 'save_metric': []}

    best_stop_metric = 999.0
    best_save_metric_global = -999.0
    best_model_weights = None
    patience_counter = 0
    # Rolling window of recent s_class values, seeded with zeros.
    s_class_queue = collections.deque([0.0] * args.n_que, maxlen=args.n_que)
    # Reference class distribution that s_class is measured against.
    c_u = np.array([0.5, 0.5])

    for epoch in range(args.epochs):
        # One pass over the labeled data in a fresh random order.
        x_tr, y_tr = data['x_train'], data['y_train']
        indices = np.arange(len(y_tr))
        np.random.shuffle(indices)
        for i in range(0, len(y_tr), args.batch_size):
            batch_idx = indices[i:i + args.batch_size]
            batch_x = [arr[batch_idx] for arr in x_tr]
            model.train_on_batch(batch_x, y_tr[batch_idx])

        # Metrics
        if mode == 'standard':
            val_res = model.evaluate(data['x_val'], data['y_val'], verbose=0, batch_size=args.batch_size)
            s_conf = val_res[0]  # validation loss
            s_class = 0.0
        else:
            pred_u = model.predict(data['x_unlab'], batch_size=args.batch_size, verbose=0)
            pred_probs = softmax(pred_u, axis=1)
            conf_u = np.max(pred_probs, axis=1)
            s_conf = np.mean(np.abs(conf_u - 0.9))
            dist_u = np.mean(pred_probs, axis=0)
            s_class = 1.0 - np.linalg.norm(dist_u - c_u)

        # Logic: lower stop value is better; 'class' mode stops on -s_class.
        current_stop_val = s_conf
        if mode == 'class':
            current_stop_val = -s_class

        if current_stop_val < best_stop_metric:
            best_stop_metric = current_stop_val
            patience_counter = 0
        else:
            patience_counter += 1

        save_now = False
        if mode == 'combined':
            # SMOOTHING: Compare to Queue Average instead of Max to avoid spikes
            queue_avg = sum(s_class_queue) / len(s_class_queue) if len(s_class_queue) > 0 else 0.0
            if s_class > queue_avg:
                save_now = True
                print(f"     New Best! S_Class={s_class:.4f} > Avg(Queue)={queue_avg:.4f}")
            s_class_queue.append(s_class)
        elif mode == 'conf':
            # True when this epoch just set (or tied) the best stop metric.
            if s_conf == best_stop_metric:
                save_now = True
        elif mode == 'class':
            if s_class > best_save_metric_global:
                best_save_metric_global = s_class
                save_now = True
        elif mode == 'standard':
            if s_conf == best_stop_metric:
                save_now = True

        # Do not snapshot weights on the epoch that exhausts patience.
        if patience_counter < args.patience:
            if save_now:
                best_model_weights = model.get_weights()

        test_acc = model.evaluate(data['x_test'], data['y_test'], verbose=0, batch_size=args.batch_size)[1]
        history['accuracy'].append(test_acc)
        history['stop_metric'].append(s_conf)
        history['save_metric'].append(s_class)

        print(f"   Epoch {epoch+1}: Acc={test_acc:.4f} | Stop_Met={s_conf:.4f} | Save_Met={s_class:.4f}")

        if patience_counter >= args.patience:
            print("   Early Stopping Triggered")
            break

    if best_model_weights is not None:
        print("   Restoring weights from the best checkpoint...")
        model.set_weights(best_model_weights)

    final_acc = model.evaluate(data['x_test'], data['y_test'], verbose=0, batch_size=args.batch_size)[1]
    avg_acc = np.mean(history['accuracy']) if history['accuracy'] else 0.0

    return final_acc, avg_acc, history, model

In [None]:
# CELL 3: RUN EXPERIMENTS WITH SEED 42
DEFAULT_SEED = 42

# Train the four variants one after another with the same seed. Each call
# yields (final accuracy, mean per-epoch accuracy, history, trained model).
print("EXPERIMENT 1/4: Original BUS-stop (Combined)")
acc_orig, avg_orig, hist_orig, model_combined = train_engine(
    "Combined", mode='combined', seed=DEFAULT_SEED
)

print("\nEXPERIMENT 2/4: Confidence Similarity Only")
acc_conf, avg_conf, hist_conf, model_conf = train_engine(
    "Conf Only", mode='conf', seed=DEFAULT_SEED
)

print("\nEXPERIMENT 3/4: Class Distribution Only")
acc_class, avg_class, hist_class, model_class = train_engine(
    "Class Only", mode='class', seed=DEFAULT_SEED
)

print("\nEXPERIMENT 4/4: Standard Validation (Baseline)")
acc_std, avg_std, hist_std, model_std = train_engine(
    "Standard", mode='standard', val_ratio=0.1, seed=DEFAULT_SEED
)

# Summary table: one row per experiment, in the order they were run.
print("\n" + "=" * 60)
print(f"{'Model Name':<20} | {'Final Test Acc':<15} | {'Avg Acc'}")
print("-" * 60)
print("\n".join(
    f"{label:<20} | {final:.4f}          | {mean:.4f}"
    for label, final, mean in (
        ('Combined (BUS)', acc_orig, avg_orig),
        ('Conf Only', acc_conf, avg_conf),
        ('Class Only', acc_class, avg_class),
        ('Standard (Val)', acc_std, avg_std),
    )
))
print("=" * 60)

In [None]:
# CELL 4: CONFUSION MATRICES PLOTTING
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_cm(model, title, ax=None):
    """Draw the test-set confusion matrix of ``model`` as an annotated heatmap.

    The test data is re-loaded and re-tokenized on every call using a default
    ``Args`` configuration.

    Args:
        model: Object whose ``predict(x, batch_size=..., verbose=0)`` returns
            per-class scores; the arg-max over axis 1 is the predicted class.
        title: Panel title; the accuracy is appended on a second line.
        ax: Matplotlib axes to draw on. A new 5x4 figure is created when None.

    Raises:
        ValueError: If the test data could not be loaded.
    """
    # Tick labels, in class-id order.
    # NOTE(review): assumes label 0 = negative and 1 = positive - confirm.
    class_names = ['Negative', 'Positive']

    # 1. Load Data
    args = Args(mode='combined', val_ratio=0.0, seed=42)
    tokenizer = BertTokenizer.from_pretrained(args.pt_model_checkpoint)
    data = load_local_data(args.task, args.data_path, tokenizer, args.max_seq_length, args.val_ratio)
    if data is None or data['x_test'] is None or data['y_test'] is None:
        raise ValueError(
            f"Test data for {args.task}/{args.data_path} could not be loaded; "
            "cannot plot a confusion matrix."
        )

    # 2. Predict
    y_pred_logits = model.predict(data['x_test'], batch_size=args.batch_size, verbose=0)
    y_pred = np.argmax(y_pred_logits, axis=1)
    y_true = data['y_test']

    # 3. Plot
    # Pin the label set so the matrix is always 2x2 and lines up with the tick
    # labels, even when only one class occurs in y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))
    if ax is None:
        fig, ax = plt.subplots(figsize=(5, 4))

    # Heatmap
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax, cbar=False)

    # Axis Labels & Title
    ax.set_title(f'{title}\nAccuracy: {np.mean(y_true == y_pred):.1%}', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Label', fontsize=10)
    ax.set_xlabel('Predicted Label', fontsize=10)

# Setup Grid: a single row with one panel per trained model.
fig, axes = plt.subplots(1, 4, figsize=(24, 5))

# Plot all 4 models, left to right.
for panel, (trained_model, panel_title) in zip(axes, [
    (model_combined, "Combined (BUS)"),
    (model_conf, "Conf Only"),
    (model_class, "Class Only"),
    (model_std, "Standard (Val)"),
]):
    plot_cm(trained_model, panel_title, ax=panel)

plt.tight_layout()
plt.show()

In [None]:
# CELL 5: INDIVIDUAL TRAINING PLOTS (PLOT 1: ACCURACY, PLOT 2: STOP METRIC, PLOT 3: SAVE METRIC)
import matplotlib.pyplot as plt

def plot_individual_model(history, model_name, is_standard=False):
    """Plot accuracy, stop metric and save metric of one run side by side.

    Args:
        history: Dict with per-epoch lists under ``'accuracy'``,
            ``'stop_metric'`` and ``'save_metric'``. Nothing is drawn when it
            is empty, ``None``, or has no recorded accuracy values.
        model_name: Name used in the panel and figure titles.
        is_standard: True for the validation-loss baseline. In that mode the
            stop-metric series is the validation loss and is also what the
            third panel shows.
    """
    # Nothing to plot for a run that failed or recorded no epochs.
    if not history or not history.get('accuracy'):
        return
    epochs = range(1, len(history['accuracy']) + 1)

    # Create a figure with 3 side-by-side plots
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # PLOT 1: Accuracy
    axes[0].plot(epochs, history['accuracy'], 'b-o', linewidth=2, label='Test Accuracy')
    axes[0].set_title(f"{model_name}: Accuracy Curve", fontweight='bold')
    axes[0].set_xlabel("Epochs")
    axes[0].set_ylabel("Accuracy")
    axes[0].set_ylim(0.4, 1.0)
    axes[0].grid(True, alpha=0.3)
    axes[0].legend()

    # PLOT 2: Stop Metric (The Brake)
    metric_label = "Val Loss" if is_standard else "S_conf (Stability)"
    color = "red"

    axes[1].plot(epochs, history['stop_metric'], color=color, linestyle='--', marker='s', label=metric_label)
    axes[1].set_title("Stop Metric (Lower is Better)", fontweight='bold')
    axes[1].set_xlabel("Epochs")
    axes[1].set_ylabel("Metric Value")
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    # PLOT 3: Save Metric (The Judge)
    # The standard baseline selects checkpoints on validation loss, which is
    # recorded under 'stop_metric'; its 'save_metric' series is a constant 0.0
    # placeholder, so plotting that under a "Val Loss" label was misleading.
    if is_standard:
        save_label, save_color = "Val Loss", "purple"
        save_series = history['stop_metric']
    else:
        save_label, save_color = "S_class (Distribution)", "green"
        save_series = history['save_metric']

    axes[2].plot(epochs, save_series, color=save_color, linestyle='-.', marker='^', label=save_label)
    axes[2].set_title("Save Metric (Selection Criteria)", fontweight='bold')
    axes[2].set_xlabel("Epochs")
    axes[2].set_ylabel("Metric Value")
    axes[2].legend()
    axes[2].grid(True, alpha=0.3)

    plt.suptitle(f"Training Dynamics: {model_name}", fontsize=16, y=1.05)
    plt.tight_layout()
    plt.show()

print("GENERATING LABELED PLOTS...")

# Plot each history in turn, skipping any variable that is not defined in the
# current namespace.
for history_var, display_name, standard_flag in (
    ('hist_orig', "Combined (BUS)", False),
    ('hist_conf', "Confidence Only", False),
    ('hist_class', "Class Only", False),
    ('hist_std', "Standard Baseline", True),
):
    if history_var in locals():
        plot_individual_model(locals()[history_var], display_name, is_standard=standard_flag)

In [None]:
# CELL 6: ROBUSTNESS CHECK WITH SEED = 35
SEED_CHECK = 35

print(f"\nSTARTING ROBUSTNESS CHECK WITH SEED {SEED_CHECK}...")

# Re-train the three unlabeled-data variants with a different seed; only the
# final test accuracy of each run is kept.
print("1. Training Combined...")
acc_orig_2, _, _, _ = train_engine(
    "Combined", mode='combined', seed=SEED_CHECK
)
print("2. Training Conf Only...")
acc_conf_2, _, _, _ = train_engine(
    "Conf Only", mode='conf', seed=SEED_CHECK
)
print("3. Training Class Only...")
acc_class_2, _, _, _ = train_engine(
    "Class Only", mode='class', seed=SEED_CHECK
)

# Results table for this seed.
print("\n" + "=" * 40)
print(f"RESULTS FOR SEED {SEED_CHECK}")
print("=" * 40)
print("\n".join(
    f"{label:<15} | {score:.4f}"
    for label, score in (
        ('Combined', acc_orig_2),
        ('Conf Only', acc_conf_2),
        ('Class Only', acc_class_2),
    )
))