##  1. Data Collection 

In [1]:
import pandas as pd
import re
from transformers import AutoTokenizer

#load token
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


# Create a noisy dataset
data_dict = {
    "text": [
        "  The staff was very kind and attentive to my needs!!!  ",
        "The waiting time was too long, and the staff was rude. Visit us at http://hospitalreviews.com",
        "The doctor answered all my questions...but the facility was outdated.   ",
        "The nurse was compassionate & made me feel comfortable!! :) ",
        "I had to wait over an hour before being seen.  Unacceptable service! #frustrated",
        "The check-in process was smooth, but the doctor seemed rushed. Visit https://feedback.com",
        "Everyone I interacted with was professional and helpful. 😊  "
    ],
    "label": ["positive", "negative", "neutral", "positive", "negative", "neutral", "positive"]
}

# Convert dataset to a DataFrame
data = pd.DataFrame(data_dict)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

data['cleaned_text'] = data['text'].apply(clean_text)

In [3]:
label_map = {"positive": 0, "neutral": 1, "negative": 2}
data["label"] = data["label"].map(label_map)

In [4]:
data.head()

Unnamed: 0,text,label,cleaned_text
0,The staff was very kind and attentive to my ...,0,the staff was very kind and attentive to my needs
1,"The waiting time was too long, and the staff w...",2,the waiting time was too long and the staff wa...
2,The doctor answered all my questions...but the...,1,the doctor answered all my questionsbut the fa...
3,The nurse was compassionate & made me feel com...,0,the nurse was compassionate made me feel comfo...
4,I had to wait over an hour before being seen. ...,2,i had to wait over an hour before being seen u...


## 1.1 Tokenize the data

In [5]:
# Apply tokenization with padding
def tokenize_function(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=128)

# Apply tokenization
data["tokenized"] = data["cleaned_text"].apply(tokenize_function)

In [6]:
# Extract tokenized features
data["input_ids"] = data["tokenized"].apply(lambda x: x["input_ids"])
data["attention_mask"] = data["tokenized"].apply(lambda x: x["attention_mask"])

# Drop old tokenized column
data = data.drop(columns=["tokenized"])

data.head()

Unnamed: 0,text,label,cleaned_text,input_ids,attention_mask
0,The staff was very kind and attentive to my ...,0,the staff was very kind and attentive to my needs,"[101, 1996, 3095, 2001, 2200, 2785, 1998, 2012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
1,"The waiting time was too long, and the staff w...",2,the waiting time was too long and the staff wa...,"[101, 1996, 3403, 2051, 2001, 2205, 2146, 1998...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,The doctor answered all my questions...but the...,1,the doctor answered all my questionsbut the fa...,"[101, 1996, 3460, 4660, 2035, 2026, 3980, 8569...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
3,The nurse was compassionate & made me feel com...,0,the nurse was compassionate made me feel comfo...,"[101, 1996, 6821, 2001, 29353, 2081, 2033, 251...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
4,I had to wait over an hour before being seen. ...,2,i had to wait over an hour before being seen u...,"[101, 1045, 2018, 2000, 3524, 2058, 2019, 3178...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# 2. Split the dataset

In [7]:
from sklearn.model_selection import train_test_split

# Split data: 70% training, 15% validation, 15% test
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training Size: {len(train_data)}, Validation Size: {len(val_data)}, Test Size: {len(test_data)}")

Training Size: 4, Validation Size: 1, Test Size: 2


# 3. setup

In [8]:
from datasets import Dataset

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["text", "cleaned_text"])
val_dataset = val_dataset.remove_columns(["text", "cleaned_text"])
test_dataset = test_dataset.remove_columns(["text", "cleaned_text"])

# Print a sample to confirm input_ids exist
print(train_dataset[0])

{'label': 1, 'input_ids': [101, 1996, 3460, 4660, 2035, 2026, 3980, 8569, 2102, 1996, 4322, 2001, 25963, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], '__index_level_0__': 2}


## 4. configure hyperparameters

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments

# Load pre-trained BERT model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

training_args = TrainingArguments(
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    output_dir='./results',
    eval_strategy="epoch",
    logging_strategy="epoch",  
    logging_dir='./logs',  
    save_strategy="epoch",  
    load_best_model_at_end=True 
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5. Fine-tune the model 

In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.with_format("torch", columns=["input_ids", "attention_mask", "label"]),
    eval_dataset=val_dataset.with_format("torch", columns=["input_ids", "attention_mask", "label"])
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0857,1.393322
2,0.9837,1.408187
3,1.0079,1.40727
4,1.0016,1.404393
5,1.0469,1.403058


TrainOutput(global_step=5, training_loss=1.0251389741897583, metrics={'train_runtime': 122.6737, 'train_samples_per_second': 0.163, 'train_steps_per_second': 0.041, 'total_flos': 1315567088640.0, 'train_loss': 1.0251389741897583, 'epoch': 5.0})

## Evaluate the model

In [15]:
from sklearn.metrics import accuracy_score, f1_score

# Generate predictions
test_dataset = test_dataset.with_format("torch", columns=["input_ids", "attention_mask", "label"])
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
labels = test_dataset["label"]

# Calculate metrics
accuracy = accuracy_score(labels, preds)
f1 = f1_score(labels, preds, average="weighted")

print(f"Accuracy: {accuracy}, F1 Score: {f1}")

# **Explain metric importance**:
# High F1 scores indicate balanced performance across all classes, crucial in tasks like sentiment analysis.

Accuracy: 0.5, F1 Score: 0.3333333333333333


the model is performing bad cuz of the small dataset