# Project 2.3: Sentiment classification with DistilBERT

Standard `python` imports:

In [1]:
import logging; logging.basicConfig(level = logging.INFO)
import os; os.environ["PYTORCHINDUCTOR_LOGLEVEL"] = "ERROR"
from pathlib import Path
import random
from typing import cast
import warnings; warnings.simplefilter(action = "ignore", category = UserWarning)

Imported `python` libraries:

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics
import torch; device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import datasets
import transformers

Global functions for book-keeping and other utilities.

In [3]:
def fix_seed(seed: int = 42):
	"""Seed the random number generators used in this notebook and return the seed.

	Seeds Python's `random`, NumPy and PyTorch (CPU, current CUDA device and
	all CUDA devices), then switches cuDNN to its deterministic mode with
	benchmarking turned off.

	The seed is returned so the call can be used inline wherever a seed value
	is expected.
	"""
	seeders = (
		random.seed,
		np.random.seed,
		torch.manual_seed,
		torch.cuda.manual_seed,
		torch.cuda.manual_seed_all,  # if using multi-GPU
	)
	for seeder in seeders:
		seeder(seed)

	cudnn = torch.backends.cudnn
	cudnn.deterministic, cudnn.benchmark = True, False

	return seed


Helper path constants:

In [4]:
# Base directory of the notebook: the current working directory.
root = Path.cwd()
root.mkdir(parents = True, exist_ok = True)

# Directory under which the classifier stores and looks up saved models.
models_path = root / "models"
models_path.mkdir(parents = True, exist_ok = True)

## Data

A HuggingFace dataset dictionary (`datasets.DatasetDict`) customized to the task, for easy configuration of the pipeline. It has a single factory class method that loads all splits of our dataset, with the option to rename columns. The `test` split is (intentionally) missing the labels column.

In addition to being loaded, the dataset splits are also preprocessed using a DistilBERT tokenizer and formatted accordingly, turning them into tokenized, pipeline-ready datasets.

In [None]:
class TwitterDataset(datasets.DatasetDict):
	"""Dataset dictionary with the tokenized `train`, `val` and `test` splits.

	Instances are meant to be created through the `preprocessed` factory.
	"""

	@classmethod
	def preprocessed(cls,
		model_name: str = "distilbert-base-uncased",
		root: Path = root,
		trim: int | None = None,
	**column_types: datasets.Value):
		"""Load the CSV splits, rename/cast their columns and tokenize the text.

		Parameters:
			model_name: Name of the pretrained tokenizer to load.
			root: Directory containing `train_dataset.csv`, `val_dataset.csv`
				and `test_dataset.csv`.
			trim: If given, keep at most the first `trim` rows of every split.
			**column_types: New column names mapped to their value types. The
				names are assigned positionally to the CSV columns. Defaults to
				`index` (int32), `text` (string) and `labels` (int32).

		Returns:
			An instance of this class whose splits are formatted as torch
			tensors for `input_ids`, `attention_mask` and `labels`; the
			`labels` column is removed from the `test` split.
		"""
		if not column_types:
			column_types = dict(
				index  = datasets.Value(dtype = "int32" ),
				text   = datasets.Value(dtype = "string"),
				labels = datasets.Value(dtype = "int32" ),
			)

		features = datasets.Features(column_types)

		logging.info("Loading dataset...")

		dataset = cast(datasets.DatasetDict,
			datasets.load_dataset("csv",
				name = "Twitter",
				data_files = dict(
					train = str(root / "train_dataset.csv"),
					val   = str(root /   "val_dataset.csv"),
					test  = str(root /  "test_dataset.csv"),
				),
			)
		)

		logging.info("Dataset loaded.")
		logging.info("Renaming columns...")

		for split in dataset:
			# Pair the existing column names with the requested ones by position.
			columns = dict(zip(dataset[split].column_names, column_types))
			dataset[split] = dataset[split].rename_columns(columns).cast(features)

			if trim is not None:
				# Clamp so that a `trim` larger than the split keeps the whole
				# split instead of raising on out-of-range indices.
				keep = min(trim, len(dataset[split]))
				dataset[split] = dataset[split].select(range(keep))

		logging.info("Columns renamed.")
		logging.info("Processing dataset...")

		tokenizer = transformers.DistilBertTokenizer.from_pretrained(model_name)

		def tokenize(batch):
			return tokenizer(batch["text"],
				padding = "max_length",
				truncation = True,
			)

		# Tokenize whole batches at once, as the classifier's own inference
		# path does, instead of calling the tokenizer once per example.
		dataset = dataset.map(tokenize,
			batched = True,
		)
		dataset.set_format(
			type = "torch",
			columns = [
				"input_ids",
				"attention_mask",
				"labels",
			],
		)
		dataset["test"] = dataset["test"].remove_columns("labels")

		logging.info("Dataset processed.")

		return cls(dataset)


## Classification pipeline

The entire classification pipeline is wrapped into a classifier class implementing the customary method set:

- `__init__`: Define core parameters of the classification pipeline.
- `compile`: Initialize all components of the pipeline preparing it for training/evaluation.
- `fit`: In effect the so-called training loop.
- `evaluate`: The evaluation loop. Only possible if a `val` split is available. Evaluation can never be performed on the `test` split, as it intentionally hides its ground truth (which is necessary for evaluation).
- `predict`: Raw methods used for inference from readable text to readable labels.

The classification pipeline doubles as a context manager for working with locally stored pretrained models (along with their tokenizers). If a previously saved model with the given name is found, it is loaded instead; this avoids retraining every time something changes in how evaluation is done. Finally, there is a `submit` method for generating the expected `submission.csv` from the (unlabelled) `test` split, and a custom `plot` method for visualizing the training and operation of the pipeline.

In [None]:
class TwitterClassifier:

	def __init__(self,
		model_name: str | Path = "distilbert-base-uncased",
		num_labels: int = 2,
	) -> None:
		self.trained = (path := models_path / model_name).exists()

		self.model_name = model_name if not self.trained else path
		self.num_labels = num_labels

		self.tokenizer = transformers.DistilBertTokenizer.from_pretrained(model_name)
		self.model = transformers.DistilBertForSequenceClassification.from_pretrained(model_name,
			num_labels = num_labels,
		)


	def __enter__(self):
		logging.info(f"Loading model {self.model_name}...")

		return self

	def __exit__(self, *_):
		self.model.save_pretrained(models_path / self.model_name)
		self.tokenizer.save_pretrained(models_path / self.model_name)

		return True


	def compile(self, dataset: TwitterDataset,
		training_args: transformers.training_args.TrainingArguments = transformers.training_args.TrainingArguments(
			output_dir = "./results",
			logging_dir = "./logs",

			eval_strategy = "epoch",
			save_strategy = "epoch",

			per_device_train_batch_size = 32,
			per_device_eval_batch_size = 512,
		#	gradient_accumulation_steps = 4,

			fp16 = True,

		#	dataloader_num_workers = 20,
		#	dataloader_persistent_workers = True,
			dataloader_pin_memory = True,

			data_seed = fix_seed(),
			seed = fix_seed(),

			num_train_epochs = 1,
			learning_rate = 1e-4,
			weight_decay = 1e-2,

			load_best_model_at_end = True,
		#	metric_for_best_model = "accuracy",  # `eval_loss` by default
		)
	):
		logging.info("Compiling model and initializing its trainer...")

		self.trainer = transformers.trainer.Trainer(
			model = self.model,
			args = training_args,
			train_dataset = dataset["train"],
			eval_dataset = dataset["val"],
			processing_class = self.tokenizer,
			compute_metrics = self.compute_metrics,
		)

	def fit(self) -> dict[str, float]:
		if self.trained:
			logging.info("Model already trained. Skipping training.")

			return dict()

		logging.info("Training model...")

		self.model.train()
		output = self.trainer.train()
		self.trained = True

		return output.metrics

	def evaluate(self) -> dict[str, float]:
		if not self.trained:
			logging.error("Model not trained. Cannot evaluate.")

			return dict()

		logging.info("Evaluating model...")

		self.model.eval()

		return self.trainer.evaluate()


	def predict(self, texts: list[str] | str) -> list[int]:
		return torch.argmax(self.logits(texts),
			dim = 1,
		).tolist()

	def predict_proba(self, texts: list[str] | str) -> list[float]:
		return torch.softmax(self.logits(texts),
			dim = 1,
		)[:, 1].tolist()

	def logits(self, texts: list[str] | str) -> torch.Tensor:
		if isinstance(texts, str):
			texts = [texts]

		def tokenize(batch):
			return self.tokenizer(batch["text"],
				padding = "max_length",
				truncation = True,
			)

		dummy_dataset = datasets.Dataset.from_dict({"text": texts})
		dummy_dataset = dummy_dataset.map(tokenize,
			batched = True,
		)

		dummy_dataset.set_format(
			type = "torch",
			columns = [
				"input_ids",
				"attention_mask",
			],
		)

		return torch.tensor(self.trainer.predict(dummy_dataset).predictions)  # type: ignore[arg-type]


	@classmethod
	def compute_metrics(cls, eval_pred) -> dict[str, float]:
		logging.info("Computing metrics...")

		y_pred, y_true = eval_pred
		y_pred = np.argmax(y_pred, axis = 1)

		return {
			"accuracy": sklearn.metrics.accuracy_score(y_true, y_pred),
			"precision": sklearn.metrics.precision_score(y_true, y_pred, average = "binary"),
			"recall": sklearn.metrics.recall_score(y_true, y_pred, average = "binary"),
			"f1": sklearn.metrics.f1_score(y_true, y_pred, average = "binary"),
		}  # type: ignore[return]

	def plot(self, dataset: TwitterDataset,
		output_dir: Path = Path("plots"),
	):
		output_dir.mkdir(
			parents = True,
			exist_ok = True,
		)
		logging.info("Plotting results...")

		# Learning curves:
		if self.trainer.state.log_history:
			logs = pd.DataFrame(self.trainer.state.log_history)

			# Filter out unnecessary entries
			train_logs = logs[logs["loss"].notna()]
			eval_logs = logs[logs["eval_loss"].notna()]

			# Plot train vs eval loss:
			plt.figure()
			plt.plot(train_logs["step"], train_logs["loss"], label="Train Loss")
			plt.plot(eval_logs["step"], eval_logs["eval_loss"], label="Eval Loss")
			plt.xlabel("Step")
			plt.ylabel("Loss")
			plt.legend()
			plt.title("Training vs Evaluation Loss")
			plt.savefig(output_dir / "loss_curve.png")
			plt.close()

			# Plot evaluation metrics:
			metrics = ["eval_accuracy", "eval_precision", "eval_recall", "eval_f1"]
			for metric in metrics:
				if metric in eval_logs:
					plt.figure()
					plt.plot(eval_logs["step"], eval_logs[metric], label=metric)
					plt.xlabel("Step")
					plt.ylabel(metric.split("_")[-1].capitalize())
					plt.title(metric.replace("_", " ").title())
					plt.savefig(output_dir / f"{metric}_curve.png")
					plt.close()

		# AUC and Precision-Recall Curve on validation set:
		logging.info("Generating ROC and PR curves from classifier predictions...")

		texts = dataset["val"]["text"]
		y_true = dataset["val"]["labels"]
		y_prob = self.predict_proba(texts)
	#	y_pred = self.predict(texts)

		# ROC Curve:
		fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, y_prob)
		roc_auc = sklearn.metrics.auc(fpr, tpr)
		plt.figure()
		plt.plot(fpr, tpr, label=f"ROC AUC = {roc_auc:.2f}")
		plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
		plt.xlabel("False Positive Rate")
		plt.ylabel("True Positive Rate")
		plt.title("ROC Curve")
		plt.legend()
		plt.savefig(output_dir / "roc_curve.png")
		plt.close()

		# Precision-Recall Curve:
		precision, recall, _ = sklearn.metrics.precision_recall_curve(y_true, y_prob)
		pr_auc = sklearn.metrics.auc(recall, precision)
		plt.figure()
		plt.plot(recall, precision, label=f"PR AUC = {pr_auc:.2f}")
		plt.xlabel("Recall")
		plt.ylabel("Precision")
		plt.title("Precision-Recall Curve")
		plt.legend()
		plt.savefig(output_dir / "pr_curve.png")
		plt.close()

	def submit(self, dataset: TwitterDataset):
		logging.info("Submitting predictions...")

		pd.DataFrame(
			data = {
				"index": dataset["test"]["index"],
				"labels": self.predict(dataset["test"]["text"]),
			}
		).to_csv("submission.csv",
			index = False,
		)


2025-06-05 16:52:19.228890: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749142339.251209     131 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749142339.257993     131 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Run

Apply the tools setting up the pipeline to run an experiment.

In [None]:
# Seed the random number generators before loading data or building the model.
fix_seed()

# Load, rename and tokenize the competition splits from the Kaggle input directory.
kaggle_input = Path("/kaggle/input/ai-2-dl-for-nlp-2025-homework-3")
dataset = TwitterDataset.preprocessed(root = kaggle_input)

# Run the pipeline end to end: train, evaluate, plot, then write the submission.
with TwitterClassifier("distilbert-base-uncased") as classifier:
	classifier.compile(dataset)
	classifier.fit()
	classifier.evaluate()
	classifier.plot(dataset)
	classifier.submit(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Casting the dataset:   0%|          | 0/148388 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/42396 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21199 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


Map:   0%|          | 0/148388 [00:00<?, ? examples/s]

Map:   0%|          | 0/42396 [00:00<?, ? examples/s]

Map:   0%|          | 0/21199 [00:00<?, ? examples/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.word_embeddings.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.0.

<IPython.core.display.Javascript object>