In [None]:
import pandas as pd
import re
import transformers
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Create a noisy dataset
data_dict = {
    "text": [
        "  The staff was very kind and attentive to my needs!!!  ",
        "The waiting time was too long, and the staff was rude. Visit us at http://hospitalreviews.com",
        "The doctor answered all my questions...but the facility was outdated.   ",
        "The nurse was compassionate & made me feel comfortable!! :) ",
        "I had to wait over an hour before being seen.  Unacceptable service! #frustrated",
        "The check-in process was smooth, but the doctor seemed rushed. Visit https://feedback.com",
        "Everyone I interacted with was professional and helpful.  "
    ],
    "label": ["positive", "negative", "neutral", "positive", "negative", "neutral", "positive"]
}

# Convert dataset to a DataFrame
data = pd.DataFrame(data_dict)

# Clean the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

data["cleaned_text"] = data["text"].apply(clean_text)

# Convert labels to integers
label_map = {"positive": 0, "neutral": 1, "negative": 2}
data["label"] = data["label"].map(label_map)

# Tokenize the cleaned text
data['tokenized'] = data['cleaned_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Pad or truncate to fixed length (e.g., 128 tokens)
data['padded_tokenized'] = data['tokenized'].apply(
    lambda x: x + [tokenizer.pad_token_id] * (128 - len(x)) if len(x) < 128 else x[:128]
)

# Preview cleaned and labeled data
print(data[['cleaned_text', 'label', 'padded_tokenized']].head())

In [18]:
from sklearn.model_selection import train_test_split

# only stratify the first split, then do an un-stratified val/test split
train_data, temp_data = train_test_split(
    data, test_size=0.3, random_state=42, stratify=data["label"]
)

val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42
)



In [19]:
from collections import Counter

print("Train size:", len(train_data), "Label counts:", Counter(train_data["label"]))
print(" Val size:", len(val_data),   "Label counts:", Counter(val_data["label"]))
print(" Test size:", len(test_data), "Label counts:", Counter(test_data["label"]))

Train size: 4 Label counts: Counter({0: 2, 2: 1, 1: 1})
 Val size: 1 Label counts: Counter({2: 1})
 Test size: 2 Label counts: Counter({0: 1, 1: 1})


In [20]:
from datasets import Dataset


# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["cleaned_text"], padding="max_length", truncation=True, max_length=128)

# Tokenize the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["text", "cleaned_text"])
val_dataset = val_dataset.remove_columns(["text", "cleaned_text"])
test_dataset = test_dataset.remove_columns(["text", "cleaned_text"])

# Convert labels to int if they are not already
train_dataset = train_dataset.map(lambda x: {"label": int(x["label"])})
val_dataset = val_dataset.map(lambda x: {"label": int(x["label"])})
test_dataset = test_dataset.map(lambda x: {"label": int(x["label"])})

# Print a sample to confirm input_ids exist
print(train_dataset[0])

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

{'label': 0, 'tokenized': [101, 1996, 6821, 2001, 29353, 2081, 2033, 2514, 6625, 102], 'padded_tokenized': [101, 1996, 6821, 2001, 29353, 2081, 2033, 2514, 6625, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], '__index_level_0__': 3, 'input_ids': [101, 1996, 6821, 2001, 29353, 2081, 2033, 2514, 6625, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0

In [21]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load pre-trained BERT model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

training_args = TrainingArguments(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    output_dir='./results',
    eval_strategy="epoch",
    logging_strategy="epoch",
    logging_dir='./logs',
    save_strategy="epoch",
    load_best_model_at_end=True,
    use_cpu=True,
)

# **Explain 'evaluation_strategy':**
# This determines when the model is evaluated. 'Epoch' evaluates the model after each training epoch.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.with_format("torch", columns=["input_ids", "attention_mask", "label"]),
    eval_dataset=val_dataset.with_format("torch", columns=["input_ids", "attention_mask", "label"])
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.3012,0.869452
2,1.2122,0.898188
3,1.0312,0.914075


Attempted to log scalar metric loss:
1.3012
Attempted to log scalar metric grad_norm:
6.896853446960449
Attempted to log scalar metric learning_rate:
2e-05
Attempted to log scalar metric epoch:
1.0
Attempted to log scalar metric eval_loss:
0.8694517016410828
Attempted to log scalar metric eval_runtime:
0.0784
Attempted to log scalar metric eval_samples_per_second:
12.756
Attempted to log scalar metric eval_steps_per_second:
12.756
Attempted to log scalar metric epoch:
1.0
Attempted to log scalar metric loss:
1.2122
Attempted to log scalar metric grad_norm:
5.6191487312316895
Attempted to log scalar metric learning_rate:
1.3333333333333333e-05
Attempted to log scalar metric epoch:
2.0
Attempted to log scalar metric eval_loss:
0.8981879949569702
Attempted to log scalar metric eval_runtime:
0.082
Attempted to log scalar metric eval_samples_per_second:
12.189
Attempted to log scalar metric eval_steps_per_second:
12.189
Attempted to log scalar metric epoch:
2.0
Attempted to log scalar metri

TrainOutput(global_step=3, training_loss=1.1815606355667114, metrics={'train_runtime': 9.8356, 'train_samples_per_second': 1.22, 'train_steps_per_second': 0.305, 'total_flos': 789340253184.0, 'train_loss': 1.1815606355667114, 'epoch': 3.0})

In [23]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# 1) run predict
pred_out = trainer.predict(test_dataset)

# 2) extract predictions and true labels
preds  = np.argmax(pred_out.predictions, axis=1)
labels = pred_out.label_ids

# 3) compute metrics
accuracy = accuracy_score(labels, preds)
f1       = f1_score(labels, preds, average="weighted")

print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")


Accuracy: 0.0000, F1 Score: 0.0000


In [24]:
import numpy as np
from collections import Counter

# 1. Run prediction
pred_out = trainer.predict(test_dataset)

# 2. Examine raw logits
logits = pred_out.predictions
print("Logits shape:", logits.shape)
print("First sample logits:", logits[0])

# 3. Compute preds via argmax
preds = np.argmax(logits, axis=-1)
print("Unique predicted classes & counts:", Counter(preds))

# 4. Grab true labels from the predict output
labels = pred_out.label_ids
print("Unique true labels & counts:", Counter(labels))

# 5. Sanity-check metrics
from sklearn.metrics import accuracy_score, f1_score
print("Accuracy:", accuracy_score(labels, preds))
print("F1 (weighted):", f1_score(labels, preds, average="weighted"))


Logits shape: (2, 3)
First sample logits: [-0.35052657 -0.29044142  0.11750984]
Unique predicted classes & counts: Counter({np.int64(2): 2})
Unique true labels & counts: Counter({np.int64(0): 1, np.int64(1): 1})
Accuracy: 0.0
F1 (weighted): 0.0
