In [1]:
pip install accelerate -U



In [2]:
!pip install evaluate



In [3]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import csv
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

In [4]:
# Change the working directory so the relative TSV file names used below resolve.
# NOTE(review): the target is relative, so re-running this cell without a kernel
# restart would look for sample_data/sample_data — run it once per session.
%cd sample_data

/content/sample_data


In [5]:
def convert_to_hugging_face_dataset(tsv_path):
    """Load a two-column ``label<TAB>text`` file into a Hugging Face ``Dataset``.

    Args:
        tsv_path: Path to a UTF-8, tab-separated file. Each non-blank row must
            hold exactly two fields: an integer label followed by the message.

    Returns:
        A ``Dataset`` built from a dict with an integer ``"label"`` column and a
        ``"text"`` column, with rows in file order.

    Raises:
        ValueError: If a non-blank row does not have exactly two fields, or if
            a label cannot be converted with ``int()``.
    """
    messages = []
    labels = []

    with open(tsv_path, 'r', newline='', encoding='utf8') as tsv_file:
        reader = csv.reader(tsv_file, delimiter='\t')

        for row in reader:
            # csv.reader yields an empty list for a blank line; skipping it
            # avoids an opaque "not enough values to unpack" failure.
            if not row:
                continue
            if len(row) != 2:
                # Fail loudly, but say where: silently dropping a malformed
                # row would skew the dataset without anyone noticing.
                raise ValueError(
                    f"{tsv_path}: line {reader.line_num}: expected 2 tab-separated "
                    f"fields (label, text), got {len(row)}"
                )
            label, message = row
            messages.append(message)
            labels.append(int(label))  # Convert label to integer

    dataset = Dataset.from_dict({"label": labels, "text": messages})
    return dataset


In [6]:
# Read both splits from the TSV files in the current working directory.
train_data = convert_to_hugging_face_dataset(tsv_path='train_data.tsv')
test_data = convert_to_hugging_face_dataset(tsv_path='test_data.tsv')

In [7]:
# Display the first training record to confirm the {'label', 'text'} layout.
train_data[0]

{'label': 0,
 'text': "TAKE THIS COURSE!! No matter what major you are, it's an extremely insightful and fun course! The structure of the course and the way it's taught is extremely effective. As someone who had never considered computer science in college, I'm not considering a major in computer science and taking more computer science courses at Stanford in the future."}

In [8]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entries of ``examples`` with truncation enabled.

    The notebook-level ``tokenizer`` is looked up when the function is called,
    so it must be defined before this function is first used.

    Args:
        examples: A mapping with a ``"text"`` key (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [9]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
# Tokenize each split batch-wise; the result keeps the source columns and adds the tokenizer outputs.
tokenized_train = train_data.map(function=preprocess_function, batched=True)
tokenized_test = test_data.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

In [11]:
# Batch collator built from the tokenizer. Tokenization above truncates but does not pad,
# so padding is presumably applied per batch here (as the class name suggests).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Load the "accuracy" metric from the evaluate library; compute_metrics uses it.
accuracy = evaluate.load("accuracy")

In [13]:
def compute_metrics(eval_pred):
    """Turn raw model scores into class predictions and score them for accuracy.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` must be
            array-like with the per-class scores along axis 1.

    Returns:
        The result of the notebook-level ``accuracy`` metric's ``compute`` call.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [14]:
# Pretrained DistilBERT with a sequence-classification head sized for three classes.
# NOTE(review): num_labels=3 presumably means the TSV labels are 0, 1 and 2 — confirm against the data.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then reload the best checkpoint.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy`;
# the package installs in this notebook are unpinned, so confirm against the installed version.
training_args = TrainingArguments(
    output_dir="my_awesome_model",  # directory for checkpoints and trainer output
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # must match save_strategy for load_best_model_at_end
    save_strategy="epoch",
    load_best_model_at_end=True,  # no metric_for_best_model given, so the library default applies
)

In [16]:
# Bundle the model, training arguments, data splits, tokenizer, collator and metric function.
# NOTE(review): eval_dataset is the test split while training_args sets load_best_model_at_end=True,
# so checkpoint selection sees the test data — consider holding out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.854455,0.630872
2,No log,0.752257,0.738255


TrainOutput(global_step=146, training_loss=0.8352494435767605, metrics={'train_runtime': 53.6234, 'train_samples_per_second': 43.19, 'train_steps_per_second': 2.723, 'total_flos': 124254048540492.0, 'train_loss': 0.8352494435767605, 'epoch': 2.0})

In [18]:
# Evaluate on the Trainer's eval_dataset (the tokenized test split) and print the metrics dict.
evaluation_result = trainer.evaluate()
print("Evaluation Result:", evaluation_result)

Evaluation Result: {'eval_loss': 0.7522569894790649, 'eval_accuracy': 0.738255033557047, 'eval_runtime': 0.8475, 'eval_samples_per_second': 175.819, 'eval_steps_per_second': 11.8, 'epoch': 2.0}
