In [31]:
import pandas as pd
import re
import transformers
from transformers import AutoTokenizer
import numpy as np

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Create a noisy dataset
data_dict = {
    "text": [
        "  The staff was very kind and attentive to my needs!!!  ",
        "The waiting time was too long, and the staff was rude. Visit us at http://hospitalreviews.com",
        "The doctor answered all my questions...but the facility was outdated.   ",
        "The nurse was compassionate & made me feel comfortable!! :) ",
        "I had to wait over an hour before being seen.  Unacceptable service! #frustrated",
        "The check-in process was smooth, but the doctor seemed rushed. Visit https://feedback.com",
        "Everyone I interacted with was professional and helpful.  "
    ],
    "label": ["positive", "negative", "neutral", "positive", "negative", "neutral", "positive"]
}

# Convert dataset to a DataFrame
data = pd.DataFrame(data_dict)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

# Clean the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

data["cleaned_text"] = data["text"].apply(clean_text)

# Convert labels to integers
label_map = {"positive": 0, "neutral": 1, "negative": 2}
data["label"] = data["label"].map(label_map)

# Tokenize the cleaned text
data['tokenized'] = data['cleaned_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Pad or truncate to fixed length (e.g., 128 tokens)
data['padded_tokenized'] = data['tokenized'].apply(
    lambda x: x + [tokenizer.pad_token_id] * (128 - len(x)) if len(x) < 128 else x[:128]
)

# Preview cleaned and labeled data
print(data[['cleaned_text', 'label', 'padded_tokenized']].head())

                                        cleaned_text  label  \
0  the staff was very kind and attentive to my needs      0   
1  the waiting time was too long and the staff wa...      2   
2  the doctor answered all my questionsbut the fa...      1   
3  the nurse was compassionate made me feel comfo...      0   
4  i had to wait over an hour before being seen u...      2   

                                    padded_tokenized  
0  [101, 1996, 3095, 2001, 2200, 2785, 1998, 2012...  
1  [101, 1996, 3403, 2051, 2001, 2205, 2146, 1998...  
2  [101, 1996, 3460, 4660, 2035, 2026, 3980, 8569...  
3  [101, 1996, 6821, 2001, 29353, 2081, 2033, 251...  
4  [101, 1045, 2018, 2000, 3524, 2058, 2019, 3178...  


In [32]:
from sklearn.model_selection import train_test_split

# 1) split—but only stratify on the first split
train_df, temp_df = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data["label"]
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)


In [33]:
from datasets import Dataset


# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["cleaned_text"], padding="max_length", truncation=True, max_length=128)

# Tokenize the dataset
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["text", "cleaned_text"])
val_dataset = val_dataset.remove_columns(["text", "cleaned_text"])
test_dataset = test_dataset.remove_columns(["text", "cleaned_text"])

# Convert labels to int if they are not already
train_dataset = train_dataset.map(lambda x: {"label": int(x["label"])})
val_dataset = val_dataset.map(lambda x: {"label": int(x["label"])})
test_dataset = test_dataset.map(lambda x: {"label": int(x["label"])})

# Print a sample to confirm input_ids exist
print(train_dataset[0])

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

{'label': 0, 'tokenized': [101, 1996, 6821, 2001, 29353, 2081, 2033, 2514, 6625, 102], 'padded_tokenized': [101, 1996, 6821, 2001, 29353, 2081, 2033, 2514, 6625, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], '__index_level_0__': 3, 'input_ids': [101, 1996, 6821, 2001, 29353, 2081, 2033, 2514, 6625, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0

In [34]:
# Load pre-trained BERT model
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Step 1: Freeze all layers except the last one (classification head)
for param in model.base_model.parameters():
    param.requires_grad = False

# If you'd like to fine-tune additional layers (e.g., the last 2 layers), you can unfreeze those layers as well
for param in model.base_model.encoder.layer[-2:].parameters():
    param.requires_grad = True

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from transformers import Trainer, TrainingArguments

# Step 1: Set training arguments for fine-tuning the model
#training_args = TrainingArguments(
    #output_dir='./results',             # Directory where results will be stored
    #num_train_epochs=3,                 # Number of epochs (full passes through the dataset)
    #per_device_train_batch_size=16,     # Batch size per GPU/CPU during training
    #eval_strategy="epoch",        # Evaluate the model at the end of each epoch
    #dataloader_pin_memory=False
#)
# -- PEFT -- #
# Example of adjusting learning rate for PEFT optimization
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=5e-5,  # Experiment with different learning rates
    num_train_epochs=5,
    per_device_train_batch_size=16,
    eval_strategy="epoch",
    dataloader_pin_memory=False
)

# Step 2: Fine-tune only the final classification head (since earlier layers were frozen)
trainer = Trainer(
    model=model, # Pre-trained BERT model with frozen layers
    args=training_args, # Training arguments
    train_dataset = train_dataset, # Training data for fine-tuning
    eval_dataset= val_dataset, # Validation data to evaluate performance
    compute_metrics=compute_metrics,
)

# Step 3: Train the model using PEFT (this performs PEFT because layers were frozen in Step 1)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.783913,1.0
2,No log,0.820497,1.0
3,No log,0.858277,1.0
4,No log,0.898941,1.0
5,No log,0.927345,0.0


Attempted to log scalar metric eval_loss:
0.7839133143424988
Attempted to log scalar metric eval_accuracy:
1.0
Attempted to log scalar metric eval_runtime:
0.0804
Attempted to log scalar metric eval_samples_per_second:
12.433
Attempted to log scalar metric eval_steps_per_second:
12.433
Attempted to log scalar metric epoch:
1.0
Attempted to log scalar metric eval_loss:
0.8204969167709351
Attempted to log scalar metric eval_accuracy:
1.0
Attempted to log scalar metric eval_runtime:
0.0834
Attempted to log scalar metric eval_samples_per_second:
11.992
Attempted to log scalar metric eval_steps_per_second:
11.992
Attempted to log scalar metric epoch:
2.0
Attempted to log scalar metric eval_loss:
0.8582774996757507
Attempted to log scalar metric eval_accuracy:
1.0
Attempted to log scalar metric eval_runtime:
0.0779
Attempted to log scalar metric eval_samples_per_second:
12.841
Attempted to log scalar metric eval_steps_per_second:
12.841
Attempted to log scalar metric epoch:
3.0
Attempted to 

TrainOutput(global_step=5, training_loss=0.7769490242004394, metrics={'train_runtime': 3.6238, 'train_samples_per_second': 5.519, 'train_steps_per_second': 1.38, 'total_flos': 1315567088640.0, 'train_loss': 0.7769490242004394, 'epoch': 5.0})

In [40]:
# Evaluate the model
results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Loss:     {results['eval_loss']:.4f}")
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")

Attempted to log scalar metric eval_loss:
1.0348620414733887
Attempted to log scalar metric eval_accuracy:
0.5
Attempted to log scalar metric eval_runtime:
0.1388
Attempted to log scalar metric eval_samples_per_second:
14.411
Attempted to log scalar metric eval_steps_per_second:
7.206
Attempted to log scalar metric epoch:
5.0
Test Loss:     1.0349
Test Accuracy: 0.5000
