## Import libraries + environment setup

In [24]:
import sys, os
sys.path.append("../src")
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments, set_seed

from model import ReviewClassifier

## Import data

In [25]:
# The reviews file is tab-separated; read_table uses "\t" as its default delimiter.
df = pd.read_table("../data/reviews.tsv")

## Train / validation / test split

In [26]:
# Settings shared by both splitting passes.
LABEL_COL = "constructive"
SEED = 42

# Pass 1: hold out 25% of the rows (stratified on the label column);
# the remaining 75% becomes the training set.
train_df, temp_df = train_test_split(
    df, test_size=0.25, stratify=df[LABEL_COL], random_state=SEED
)

# Pass 2: cut the held-out rows in half to get the validation and test sets.
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df[LABEL_COL], random_state=SEED
)

print(f"Training set size: {len(train_df)}\nValidation set size: {len(val_df)}\nTest set size: {len(test_df)}")


Training set size: 612
Validation set size: 102
Test set size: 102


## Loading the model

In [27]:
# Seed Python, NumPy and torch *before* the model is built. The checkpoint
# load reports the classification head as newly initialised, i.e. random
# weights; the Trainer only seeds when it is constructed, which happens
# after this cell, so without this call each fresh run would start from a
# different head and report different metrics.
set_seed(42)

# Wrapper that bundles the tokenizer and the sequence-classification model
# for the "distilbert-base-uncased" checkpoint with two output labels.
rc = ReviewClassifier("distilbert-base-uncased", num_labels=2)
tokenizer = rc.tokenizer
model = rc.model

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Converting the datasets to Hugging Face format

In [28]:
# Convert each split with the classifier's own helper, in
# train -> validation -> test order.
train_ds_tok, val_ds_tok, test_ds_tok = [
    rc.df_to_dataset(split_df) for split_df in (train_df, val_df, test_df)
]

Map: 100%|██████████| 612/612 [00:00<00:00, 3163.92 examples/s]
Map: 100%|██████████| 102/102 [00:00<00:00, 3593.82 examples/s]
Map: 100%|██████████| 102/102 [00:00<00:00, 3360.71 examples/s]


In [29]:
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits along the last axis.

    Returns:
        A dict with two entries: ``"accuracy"`` and ``"f1"``. F1 is computed
        with scikit-learn's default settings for ``f1_score``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy = accuracy_score(gold, predicted)
    f1 = f1_score(gold, predicted)
    return {"accuracy": accuracy, "f1": f1}

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="../models/review_classifier",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch; log the training loss every 50 steps.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    # Reload the best checkpoint after training, ranked by the "f1" entry
    # that compute_metrics returns.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Wire the model, the tokenized train/validation splits, the tokenizer and the
# metric function into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_ds_tok,
    eval_dataset=val_ds_tok,
)

## Training

In [30]:
# Run fine-tuning and keep the returned summary; leaving it as the last
# expression displays it as the cell output.
train_output = trainer.train()
train_output



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.542963,0.754902,0.719101
2,0.597400,0.436628,0.794118,0.796117
3,0.437400,0.429604,0.803922,0.8




TrainOutput(global_step=117, training_loss=0.4945661756727431, metrics={'train_runtime': 62.7569, 'train_samples_per_second': 29.256, 'train_steps_per_second': 1.864, 'total_flos': 121605071966208.0, 'train_loss': 0.4945661756727431, 'epoch': 3.0})

In [34]:
# Score the held-out test split with the trained model.
test_metrics = trainer.evaluate(test_ds_tok)

# Pull out the three reported values before formatting them.
test_loss, test_accuracy, test_f1 = (
    test_metrics[key] for key in ("eval_loss", "eval_accuracy", "eval_f1")
)

print("\n=== FINAL TEST SET RESULTS ===")
print(f"Loss:      {test_loss:.4f}")
print(f"Accuracy:  {test_accuracy:.4f}")
print(f"F1 Score:  {test_f1:.4f}")
print("==============================\n")





=== FINAL TEST SET RESULTS ===
Loss:      0.3405
Accuracy:  0.8824
F1 Score:  0.8800



In [32]:
# Raw model outputs for the test split.
raw_preds = trainer.predict(test_ds_tok)

# Reference labels, and predicted classes taken as the argmax over the last axis.
y_true = raw_preds.label_ids
y_pred = np.argmax(raw_preds.predictions, axis=-1)

print("\nFinal Test Confusion Matrix:")
# Left as the last expression so the matrix is displayed as the cell output.
confusion_matrix(y_true, y_pred)




Final Test Confusion Matrix:


array([[46,  5],
       [ 7, 44]])

In [33]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
model_dir = "../models/review_classifier"

trainer.save_model(model_dir)
# Last expression: displays the paths of the tokenizer files that were written.
tokenizer.save_pretrained(model_dir)

('../models/review_classifier/tokenizer_config.json',
 '../models/review_classifier/special_tokens_map.json',
 '../models/review_classifier/vocab.txt',
 '../models/review_classifier/added_tokens.json',
 '../models/review_classifier/tokenizer.json')