In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the dataset
url = "https://zenodo.org/record/3609356/files/crowdsourced.csv?download=1"
df = pd.read_csv(url)

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Preprocessing: Tokenize using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_and_encode(texts):
    return tokenizer.batch_encode_plus(
        texts,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

X_train = tokenize_and_encode(train_df["Text"].tolist())
X_test = tokenize_and_encode(test_df["Text"].tolist())

y_train = torch.tensor(train_df["Verdict"].tolist()).add(1)  # Add 1 to shift labels from [-1, 0, 1] to [0, 1, 2]
y_test = torch.tensor(test_df["Verdict"].tolist()).add(1)

# Create a PyTorch dataset
class CrowdsourcedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CrowdsourcedDataset(X_train, y_train)
test_dataset = CrowdsourcedDataset(X_test, y_test)

# Instantiate the BERT model
model = BertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Define the Trainer and TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the BERT classifier
trainer.train()

# Test and evaluate the classifier
y_pred = trainer.predict(test_dataset).predictions.argmax(axis=-1)

# Shift labels back to original range [-1, 0, 1]
y_pred = y_pred - 1
y_test = y_test - 1

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("Classification Report:")
print(classification_report(y_test, y_pred))


  from .autonotebook import tqdm as notebook_tqdm
2023-04-18 10:11:55.095290: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertForSequenceClassification: ['vocab_transfo

Epoch,Training Loss,Validation Loss


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./results/checkpoint-1000
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 4501
  Batch size = 16
***** Running Evaluation *****
  Num examples = 4501
  Batch size = 16
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./re

Accuracy: 75.61%
Classification Report:
              precision    recall  f1-score   support

          -1       0.80      0.92      0.86      2926
           0       0.00      0.00      0.00       502
           1       0.61      0.66      0.64      1073

    accuracy                           0.76      4501
   macro avg       0.47      0.53      0.50      4501
weighted avg       0.67      0.76      0.71      4501



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
