In [1]:
# Install necessary libraries
!pip install torch transformers scikit-learn pandas tqdm


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive


In [3]:
# Load dataset from Google Drive
train = pd.read_csv('/content/Train_Dataset.csv')
test = pd.read_csv('/content/Test_Dataset.csv')


In [6]:
print("Train dataset columns:", train.columns)
print("Test dataset columns:", test.columns)


Train dataset columns: Index(['tweet', 'sarcastic'], dtype='object')
Test dataset columns: Index(['tweet', 'sarcastic'], dtype='object')


In [7]:
# Extract input texts and labels
X_train, y_train = train['tweet'].tolist(), train['sarcastic'].tolist()
X_test, y_test = test['tweet'].tolist(), test['sarcastic'].tolist()


In [8]:
# Define model and tokenizer
MODEL = 'xlnet-base-cased'
tokenizer = XLNetTokenizer.from_pretrained(MODEL)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [9]:
# Tokenize input data
train_encodings = tokenizer(X_train, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(X_test, truncation=True, padding=True, return_tensors='pt')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [18]:
# Create dataset class
class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare dataset objects
train_dataset = SarcasmDataset(train_encodings, y_train)
test_dataset = SarcasmDataset(test_encodings, y_test)


In [19]:
# Define evaluation metrics
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    return {"accuracy": accuracy_score(labels, pred), "f1_score": f1_score(labels, pred)}


In [20]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./res',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs4',
    report_to="none"
)


In [21]:
# Load model
model = XLNetForSequenceClassification.from_pretrained(MODEL)


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [24]:
# Train the model
trainer.train()


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
500,0.5536
1000,0.2059


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=1085, training_loss=0.35318503951147406, metrics={'train_runtime': 278.3581, 'train_samples_per_second': 124.552, 'train_steps_per_second': 3.898, 'total_flos': 2662106059190160.0, 'train_loss': 0.35318503951147406, 'epoch': 5.0})

In [25]:
# Evaluate the model
preds = trainer.predict(test_dataset)
preds = np.argmax(preds.predictions[:, 0:2], axis=-1)
print("F1 Score:", f1_score(y_test, preds))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


F1 Score: 0.3015267175572519
