In [1]:
import pandas as pd
from datasets import DatasetDict, Dataset, concatenate_datasets, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("sarcasm_classifications_labeled.csv")

label_column = "labels"  

print(df)

train_df, testval_df = train_test_split(
    df, 
    test_size=0.3, 
    random_state=42,
    stratify=df[label_column]
)


val_df, test_df = train_test_split(
    testval_df,
    test_size=0.5,
    random_state=42,
    stratify=testval_df[label_column]
)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

     Unnamed: 0  labels                                           headline
0           620       0                         side effects sound awesome
1          3523       1                       nation ready for its din din
2          8540       1  pope francis wearing sweater vestments he got ...
3          8530       1  nbc unveils on screen graphic informing audien...
4          5747       0    child baffled by stationary, non-violent images
..          ...     ...                                                ...
595        3377       0  stan lee, creator of beloved marvel character ...
596        5823       0  pfizer mercifully puts down another batch of t...
597       10116       0  cern researchers apologize for destruction of ...
598        4870       0  'just take it slow, and you'll be fine,' drunk...
599         854       0  vince gilligan's brain spoils final season of ...

[600 rows x 3 columns]


In [3]:
model_path = 'google-bert/bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_path)

def preprocess_function(examples):
    tokenized_inputs = tokenizer(examples["headline"], truncation=True)
    return tokenized_inputs

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

accuracy = evaluate.load("accuracy")
f1_score = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    
    f1_metrics = f1_score.compute(predictions=predictions, references=labels, average="macro")
    
    # Combine metrics into a single dictionary
    metrics = {"accuracy": accuracy_score["accuracy"], "f1": f1_metrics["f1"]}
    
    return metrics

In [4]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
)



In [5]:
def semi_supervised_training_loop(
    tokenizer, 
    dataset_dict,
    unlabeled_dataset,
    original_df,
    num_iterations=5,
    confidence_threshold=0.8,
):
    for iteration in range(num_iterations):
        print(f"\n=== Iteration {iteration+1}/{num_iterations} ===")
        print(f"Training set size: {len(dataset_dict['train'])}")
        print(f"Unlabeled pool size: {len(unlabeled_dataset)}")

        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)
        
        # Tokenize datasets
        tokenized_data = dataset_dict.map(preprocess_function, batched=True)
        
        # Train model
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_data['train'],
            eval_dataset=tokenized_data['validation'],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )
        
        trainer.train()

        # Get metrics on test set
        predictions = trainer.predict(tokenized_data["test"])

        logits = predictions.predictions
        labels = predictions.label_ids

        metrics = compute_metrics((logits, labels))
        print(metrics)
        
        # If no unlabeled data left, break
        if len(unlabeled_dataset) == 0:
            break
        
        # Tokenize unlabeled data
        tokenized_unlabeled = unlabeled_dataset.map(preprocess_function, batched=True)
        
        # Get predictions
        outputs = trainer.predict(tokenized_unlabeled)
        logits = outputs.predictions
        probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
        confidence = np.max(probs, axis=1)
        predictions = np.argmax(logits, axis=1)
        
        # Select high confidence samples
        high_conf_indices = np.where(confidence >= confidence_threshold)[0]
        
        if len(high_conf_indices) == 0:
            print("No high confidence predictions found. Try lowering the threshold.")
            break
        
        print(f"Found {len(high_conf_indices)} high confidence predictions")
        
        # Add high confidence samples to training set
        new_labeled_samples = unlabeled_dataset.select(high_conf_indices)
        
        # Remove selected samples from unlabeled pool
        remaining_indices = [i for i in range(len(unlabeled_dataset)) if i not in high_conf_indices]
        unlabeled_dataset = unlabeled_dataset.select(remaining_indices)

        new_samples_df = new_labeled_samples.to_pandas()
        new_samples_df['labels'] = predictions[high_conf_indices]

        original_df = pd.concat([original_df, new_samples_df], ignore_index=True)

        print(original_df)

        train_df, testval_df = train_test_split(
            original_df, 
            test_size=0.3, 
            random_state=42,
        )


        val_df, test_df = train_test_split(
            testval_df,
            test_size=0.5,
            random_state=42,
        )

        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)

        dataset_dict = DatasetDict({
            'train': train_dataset,
            'validation': val_dataset,
            'test': test_dataset
        })
        
        print(f"Added {len(new_labeled_samples)} samples to training set")
        print(f"New training set size: {len(dataset_dict['train'])}")
        print(f"Remaining unlabeled pool: {len(unlabeled_dataset)}")
    
    return dataset_dict, unlabeled_dataset, model

unlabeled_dataset = load_dataset("csv", data_files="sarcasm_classifications.csv")["train"]

# Run the loop
final_dataset_dict, remaining_unlabeled, final_model = semi_supervised_training_loop(
    tokenizer=tokenizer,
    dataset_dict=dataset_dict,
    unlabeled_dataset=unlabeled_dataset,
    original_df=df
)


=== Iteration 1/5 ===
Training set size: 420
Unlabeled pool size: 13034


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 420/420 [00:00<00:00, 41480.83 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 33450.36 examples/s]
Map: 100%|██████████| 90/90 [00:00<00:00, 30004.56 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.046639,0.466667,0.373794
2,No log,0.953425,0.544444,0.499744
3,No log,0.885301,0.633333,0.625022
4,No log,0.797535,0.655556,0.655556
5,No log,0.801717,0.633333,0.636598
6,No log,0.832565,0.677778,0.676282
7,No log,0.81147,0.711111,0.712248
8,No log,0.878357,0.677778,0.684969
9,No log,0.880152,0.7,0.703133


{'accuracy': 0.6555555555555556, 'f1': 0.6525573192239859}


Found 7521 high confidence predictions
      Unnamed: 0  labels                                           headline
0            620       0                         side effects sound awesome
1           3523       1                       nation ready for its din din
2           8540       1  pope francis wearing sweater vestments he got ...
3           8530       1  nbc unveils on screen graphic informing audien...
4           5747       0    child baffled by stationary, non-violent images
...          ...     ...                                                ...
8116       13619       2  clinton reminds new yorkers she moved there ho...
8117       13620       2  senate: 'renewed fisa legislation imperative i...
8118       13624       2  'entertainment weekly' critic lets director re...
8119       13627       2  congressman picked last for committee on youth...
8120       13628       1            grandmother doesn't care for new priest

[8121 rows x 3 columns]
Added 7521 samples to tr

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 5684/5684 [00:00<00:00, 73463.20 examples/s]
Map: 100%|██████████| 1218/1218 [00:00<00:00, 73302.37 examples/s]
Map: 100%|██████████| 1219/1219 [00:00<00:00, 69522.00 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.225276,0.924466,0.917757
2,0.503200,0.242558,0.934319,0.928916
3,0.140900,0.248381,0.946634,0.941155
4,0.140900,0.289636,0.946634,0.942121
5,0.056100,0.301681,0.950739,0.946163
6,0.029400,0.312889,0.953202,0.949351
7,0.029400,0.35706,0.947455,0.942823
8,0.012600,0.337202,0.95156,0.947201


{'accuracy': 0.9474979491386383, 'f1': 0.9462882260066398}


Map: 100%|██████████| 5513/5513 [00:00<00:00, 34747.85 examples/s]


Found 5224 high confidence predictions
       Unnamed: 0  labels                                           headline
0             620       0                         side effects sound awesome
1            3523       1                       nation ready for its din din
2            8540       1  pope francis wearing sweater vestments he got ...
3            8530       1  nbc unveils on screen graphic informing audien...
4            5747       0    child baffled by stationary, non-violent images
...           ...     ...                                                ...
13340       13626       1                area eyesore also a data technician
13341       13629       0  polish rapper under fire for use of the word '...
13342       13630       0       jews to celebrate rosh hashasha or something
13343       13632       0  mars probe destroyed by orbiting spielberg-gat...
13344       13633       1                 dad clarifies this not a food stop

[13345 rows x 3 columns]
Added 5224 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 9341/9341 [00:00<00:00, 76685.63 examples/s]
Map: 100%|██████████| 2002/2002 [00:00<00:00, 8042.43 examples/s]
Map: 100%|██████████| 2002/2002 [00:00<00:00, 68417.34 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7183,0.304761,0.888112,0.885349
2,0.3051,0.335838,0.890609,0.889663
3,0.1705,0.435092,0.888112,0.88838
4,0.1208,0.534725,0.889111,0.888234


{'accuracy': 0.8881118881118881, 'f1': 0.8881503178690875}


Map: 100%|██████████| 289/289 [00:00<00:00, 11718.65 examples/s]


Found 198 high confidence predictions
       Unnamed: 0  labels                                           headline
0             620       0                         side effects sound awesome
1            3523       1                       nation ready for its din din
2            8540       1  pope francis wearing sweater vestments he got ...
3            8530       1  nbc unveils on screen graphic informing audien...
4            5747       0    child baffled by stationary, non-violent images
...           ...     ...                                                ...
13538       13514       1       anteater to lay off the fire ants for awhile
13539       13539       1  single woman has facebook profile picture with...
13540       13549       2  houston residents begin surveying damage of 20...
13541       13559       0  sweatshirt string emerges triumphant after har...
13542       13579       2  tyson holds contest to let fans submit new ide...

[13543 rows x 3 columns]
Added 198 sa

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 9480/9480 [00:00<00:00, 76514.09 examples/s]
Map: 100%|██████████| 2031/2031 [00:00<00:00, 69504.10 examples/s]
Map: 100%|██████████| 2032/2032 [00:00<00:00, 72027.73 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6852,0.37937,0.854751,0.856939
2,0.3028,0.361459,0.883801,0.883069
3,0.1856,0.468901,0.879862,0.879319
4,0.1134,0.477799,0.899557,0.899165
5,0.0864,0.507483,0.901034,0.900973
6,0.0282,0.650021,0.894633,0.894475
7,0.0232,0.60508,0.89611,0.89545


{'accuracy': 0.9015748031496063, 'f1': 0.9016881896522877}


Map: 100%|██████████| 91/91 [00:00<00:00, 4339.91 examples/s]


Found 79 high confidence predictions
       Unnamed: 0  labels                                           headline
0             620       0                         side effects sound awesome
1            3523       1                       nation ready for its din din
2            8540       1  pope francis wearing sweater vestments he got ...
3            8530       1  nbc unveils on screen graphic informing audien...
4            5747       0    child baffled by stationary, non-violent images
...           ...     ...                                                ...
13617       12691       2  epa urges flint residents to stop dumping tap ...
13618       12927       0           jay-z gives shout-out to his shareholdaz
13619       12992       2  everyone who started watching 'mad money' in 2...
13620       13402       1  school principal pauses for applause that neve...
13621       13522       1  rapper not entirely sure who else is on this t...

[13622 rows x 3 columns]
Added 79 samp

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 9535/9535 [00:00<00:00, 65950.02 examples/s]
Map: 100%|██████████| 2043/2043 [00:00<00:00, 64196.13 examples/s]
Map: 100%|██████████| 2044/2044 [00:00<00:00, 65783.92 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7488,0.333233,0.881547,0.880382
2,0.3146,0.312645,0.886442,0.885246
3,0.2072,0.399133,0.902594,0.901277
4,0.1292,0.448541,0.910915,0.909386
5,0.084,0.519611,0.902594,0.9015
6,0.0306,0.568793,0.907978,0.90532


{'accuracy': 0.8977495107632094, 'f1': 0.8982259244143106}


Map: 100%|██████████| 12/12 [00:00<00:00, 799.97 examples/s]


Found 11 high confidence predictions
       Unnamed: 0  labels                                           headline
0             620       0                         side effects sound awesome
1            3523       1                       nation ready for its din din
2            8540       1  pope francis wearing sweater vestments he got ...
3            8530       1  nbc unveils on screen graphic informing audien...
4            5747       0    child baffled by stationary, non-violent images
...           ...     ...                                                ...
13628        5183       2  houghton mifflin harcourt releases new leather...
13629        5273       0  'that's it? what the heck was that?' says dad ...
13630        6804       2  new hampshire passes law forcing old people to...
13631        8279       0  special pull-out section: rural illinois' sexi...
13632        8612       2  'okay, gene, let's just get through this,' mar...

[13633 rows x 3 columns]
Added 11 samp

In [10]:
final_dataset_dict['train'].to_csv('semi_supervised_annotation/results_train.csv')

Creating CSV from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 624.98ba/s]


761635

In [11]:
final_dataset_dict['validation'].to_csv('semi_supervised_annotation/results_validation.csv')

Creating CSV from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 750.46ba/s]


164051

In [12]:
final_dataset_dict['test'].to_csv('semi_supervised_annotation/results_test.csv')

Creating CSV from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 750.10ba/s]


164026