In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m


In [4]:
# Install necessary libraries

import pandas as pd
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load the datasets
test_df = pd.read_csv('/content/drive/MyDrive/Sharetask_datasets/sarcasm_tam_test_without_labels.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/Sharetask_datasets/sarcasm_tam_dev.csv')
train_df = pd.read_csv('/content/drive/MyDrive/Sharetask_datasets/sarcasm_tam_train.csv')

# Preprocess the data
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s!?.,]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

train_df['cleaned_text'] = train_df['Text'].apply(preprocess_text)
dev_df['cleaned_text'] = dev_df['Text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['Text'].apply(preprocess_text)

# Map labels to binary values
train_df['labels'] = train_df['labels'].apply(lambda x: 1 if x == 'Sarcastic' else 0)
dev_df['labels'] = dev_df['labels'].apply(lambda x: 1 if x == 'Sarcastic' else 0)

# Create datasets
train_dataset = Dataset.from_pandas(train_df[['cleaned_text', 'labels']])
dev_dataset = Dataset.from_pandas(dev_df[['cleaned_text', 'labels']])
test_dataset = Dataset.from_pandas(test_df[['cleaned_text']])

# Combine datasets
dataset_dict = DatasetDict({
    'train': train_dataset,
    'dev': dev_dataset,
    'test': test_dataset
})

# Load the tokenizer and model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['cleaned_text'], padding="max_length", truncation=True)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Disable reporting to Weights & Biases (or other)
)

# Define the evaluation metrics
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['dev'],
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

print(f"Development Accuracy: {eval_result['eval_accuracy']:.2f}")
print(f"Development F1 Score: {eval_result['eval_f1']:.2f}")

# Predict on the test set
predictions = trainer.predict(tokenized_datasets['test'])
test_df['predicted_labels'] = predictions.predictions.argmax(-1)
test_df['predicted_labels'] = test_df['predicted_labels'].apply(lambda x: 'Sarcastic' if x == 1 else 'Non-sarcastic')

# Save test set predictions to CSV
test_df[['Text', 'predicted_labels']].to_csv('test_predictions.csv', index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/29570 [00:00<?, ? examples/s]

Map:   0%|          | 0/6336 [00:00<?, ? examples/s]

Map:   0%|          | 0/6338 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4327,0.456417,0.796086,0.794139,0.792617,0.796086
2,0.4567,0.453494,0.799874,0.786763,0.788187,0.799874


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4327,0.456417,0.796086,0.794139,0.792617,0.796086
2,0.4567,0.453494,0.799874,0.786763,0.788187,0.799874
3,0.3695,0.513651,0.804924,0.799683,0.797483,0.804924


Development Accuracy: 0.80
Development F1 Score: 0.80
