In [1]:
import pandas as pd

# Load the dataset
train_df = pd.read_csv("/content/test.tsv", sep='\t', header=None)
test_df = pd.read_csv("/content/test.tsv", sep='\t', header=None)
val_df = pd.read_csv("/content/test.tsv", sep='\t', header=None)

# Extract only the necessary columns: Column 2 (label) and Column 3 (statement)
train_df = train_df[[1, 2]]  # 1 is label, 2 is statement
test_df = test_df[[1, 2]]
val_df = val_df[[1, 2]]

# Rename the columns for convenience
train_df.columns = ['label', 'statement']
test_df.columns = ['label', 'statement']
val_df.columns = ['label', 'statement']

train_df.head()
test_df.head()
val_df.head()

Unnamed: 0,label,statement
0,true,Building a wall on the U.S.-Mexico border will...
1,false,Wisconsin is on pace to double the number of l...
2,false,Says John McCain has done nothing to help the ...
3,half-true,Suzanne Bonamici supports a plan that will cut...
4,pants-fire,When asked by a reporter whether hes at the ce...


In [2]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the labels in the training dataset
train_df['label'] = label_encoder.fit_transform(train_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])
val_df['label'] = label_encoder.transform(val_df['label'])

# Get the classes that have been encoded
classes = label_encoder.classes_

# Print the mapping of each label to its encoded value
print("Label Encoding Mapping:")
for index, label in enumerate(classes):
    print(f"'{label}' -> {index}")


Label Encoding Mapping:
'barely-true' -> 0
'false' -> 1
'half-true' -> 2
'mostly-true' -> 3
'pants-fire' -> 4
'true' -> 5


In [3]:
from transformers import BertTokenizer

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize the input
def tokenize_data(df):
    return tokenizer(
        df['statement'].tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# Tokenize the data
train_encodings = tokenize_data(train_df)
test_encodings = tokenize_data(test_df)
val_encodings = tokenize_data(val_df)

# Convert labels to a tensor format
import torch

train_labels = torch.tensor(train_df['label'].values)
test_labels = torch.tensor(test_df['label'].values)
val_labels = torch.tensor(val_df['label'].values)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [4]:
# Check the structure of tokenized data
print("Train Encodings Keys: ", train_encodings.keys())

# Check the shapes of inputs (input_ids), attention masks, and token type ids
print("Train Input IDs Shape: ", train_encodings['input_ids'].shape)
print("Train Attention Mask Shape: ", train_encodings['attention_mask'].shape)

# Check a sample of input tokens and attention masks
print("Sample Train Input IDs: ", train_encodings['input_ids'][0])  # Sample tokenized input (first sentence)
print("Sample Train Attention Mask: ", train_encodings['attention_mask'][0])  # Attention mask (first sentence)

Train Encodings Keys:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Train Input IDs Shape:  torch.Size([1267, 128])
Train Attention Mask Shape:  torch.Size([1267, 128])
Sample Train Input IDs:  tensor([ 101, 2311, 1037, 2813, 2006, 1996, 1057, 1012, 1055, 1012, 1011, 3290,
        3675, 2097, 2202, 6719, 2086, 1012,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   

In [5]:
from torch.utils.data import Dataset

class LIARDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create datasets
train_dataset = LIARDataset(train_encodings, train_labels)
test_dataset = LIARDataset(test_encodings, test_labels)
val_dataset = LIARDataset(val_encodings, val_labels)

In [6]:
from transformers import BertForSequenceClassification

# Load pre-trained BERT model for sequence classification with the correct number of labels
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import TrainerCallback, ProgressCallback
from tqdm.auto import tqdm

class ProgressBarCallback(TrainerCallback):
    def __init__(self):
        super().__init__()
        self.epochs_bar = None
        self.steps_bar = None

    def on_train_begin(self, args, state, control, **kwargs):
        # Initialize the progress bar for epochs
        self.epochs_bar = tqdm(total=args.num_train_epochs, desc="Epochs", position=0, leave=True)

    def on_epoch_begin(self, args, state, control, **kwargs):
        # Initialize the progress bar for steps in each epoch
        total_steps = len(kwargs['train_dataloader'])
        self.steps_bar = tqdm(total=total_steps, desc="Steps", position=1, leave=False)

    def on_step_end(self, args, state, control, **kwargs):
        # Update the steps progress bar after each step
        self.steps_bar.update(1)

    def on_epoch_end(self, args, state, control, **kwargs):
        # Update the epochs progress bar after each epoch
        self.epochs_bar.update(1)
        self.steps_bar.close()  # Close the step progress bar at the end of the epoch

    def on_train_end(self, args, state, control, **kwargs):
        self.epochs_bar.close()  # Close the epoch progress bar when training ends


In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score
import numpy as np

# Define a function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Define training arguments with some optimizations turned off for improved accuracy
training_args = TrainingArguments(
    output_dir='./results',                 # Output directory
    num_train_epochs=6,                     # Number of training epochs
    per_device_train_batch_size=8,          # Reduced batch size for potentially better accuracy
    per_device_eval_batch_size=32,          # Reduced eval batch size
    gradient_accumulation_steps=1,          # No gradient accumulation
    warmup_steps=500,                       # Warmup steps for learning rate scheduler
    weight_decay=0.01,                      # Strength of weight decay
    logging_dir='./logs',                   # Directory for storing logs
    logging_steps=50,                       # Log more frequently to monitor training
    fp16=False,                             # Disable mixed precision for accuracy improvement
    eval_strategy="epoch",                  # Evaluate at the end of every epoch
    save_strategy="epoch",                  # Save the model at the end of every epoch
    save_total_limit=2,                     # Limit the number of saved models
    load_best_model_at_end=True,            # Load the best model when finished
    dataloader_num_workers=2                # Set the number of workers to 2
)

# Define the Trainer with Early Stopping and Progress Bar Callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,        # Compute accuracy during evaluation
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2), ProgressBarCallback()]  # Early stopping with patience of 2
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate(eval_dataset=val_dataset)
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epochs:   0%|          | 0/6 [00:00<?, ?it/s]

Steps:   0%|          | 0/159 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2944,1.201478,0.584057
2,1.2308,0.932847,0.677979
3,1.0913,0.679418,0.808208
4,0.723,0.264556,0.947908
5,0.2577,0.050562,0.995264
6,0.0448,0.015683,0.999211


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Steps:   0%|          | 0/159 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Steps:   0%|          | 0/159 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])


Steps:   0%|          | 0/159 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Steps:   0%|          | 0/159 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])


Steps:   0%|          | 0/159 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Accuracy: 0.9992


In [None]:
# Make predictions on the test dataset
predictions, labels, metrics = trainer.predict(test_dataset)

# Convert logits to predicted labels
predicted_labels = np.argmax(predictions, axis=-1)

# Calculate accuracy on the test dataset
accuracy = accuracy_score(labels, predicted_labels)
print(f"Accuracy on Test Dataset: {accuracy:.4f}")

# If you want to print the full metrics
print(metrics)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Accuracy on Test Dataset: 0.9992
{'test_loss': 0.015683207660913467, 'test_accuracy': 0.9992107340173638, 'test_runtime': 2.555, 'test_samples_per_second': 495.896, 'test_steps_per_second': 15.656}


In [None]:
# Save the model as a .pth file
model_save_path = './saved_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to ./saved_model.pth


In [None]:
# Load the saved model
loaded_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)  # Adjust 'num_labels' as per your dataset
loaded_model.load_state_dict(torch.load(model_save_path))
loaded_model.eval()  # Set the model to evaluation mode
print("Model loaded from saved .pth file.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  loaded_model.load_state_dict(torch.load(model_save_path))


Model loaded from saved .pth file.


In [None]:
# Display the model architecture
print(loaded_model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Load the saved model
model_save_path = './saved_model.pth'  # Path to your saved model
loaded_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)  # 6 classes as per your label encoding
loaded_model.load_state_dict(torch.load(model_save_path))
loaded_model.eval()  # Set the model to evaluation mode
print("Model loaded from saved .pth file.")

# Define the label encoding mapping
label_mapping = {
    0: 'barely-true',
    1: 'false',
    2: 'half-true',
    3: 'mostly-true',
    4: 'pants-fire',
    5: 'true'
}

# Print all the possible labels
print("Label Encoding Mapping:")
for label_id, label_name in label_mapping.items():
    print(f"{label_id}: {label_name}")

# Example input for testing
input_sentence = ""

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the input
inputs = tokenizer(
    input_sentence,
    return_tensors="pt",  # Return as PyTorch tensors
    padding=True,         # Pad the input to the max length (if needed)
    truncation=True,      # Truncate if the sentence exceeds the max length
    max_length=128        # Set a max length for the input
)

# Pass the tokenized input to the loaded model
outputs = loaded_model(**inputs)

# Get the logits (raw predictions)
logits = outputs.logits

# Apply softmax to get probabilities
probabilities = torch.nn.functional.softmax(logits, dim=-1)

# Get the predicted class
predicted_class = torch.argmax(probabilities, dim=-1).item()

# Get the label based on the predicted class
predicted_label = label_mapping[predicted_class]

# Get the confidence (probability) of the predicted class
confidence_score = probabilities[0][predicted_class].item()

# Print the result
print(f"\nPredicted Label: {predicted_label}")
print(f"Confidence: {confidence_score * 100:.2f}%")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  loaded_model.load_state_dict(torch.load(model_save_path))


Model loaded from saved .pth file.
Label Encoding Mapping:
0: barely-true
1: false
2: half-true
3: mostly-true
4: pants-fire
5: true

Predicted Label: barely-true
Confidence: 97.05%
