In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
# Sample dataset: hand-written (sentence, sentiment) pairs. Each sentence sits
# next to its label so the pairing can be checked at a glance.
labelled_samples = [
    ("Wow!, this is an IITG internship", 'positive'),
    ("This is the worst experience ever.", 'negative'),
    ("It's okay, not great.", 'neutral'),
    ("Absolutely fantastic service.", 'positive'),
    ("I'm not satisfied with the quality.", 'negative'),
    ("The item is average.", 'neutral'),
    ("I love this product!", 'positive'),
    ("I am not happy with it", 'negative'),
    ("It's okay, nothing special.", 'neutral'),
    ("This is an amazing product! Highly recommend.", 'positive'),
    ("I am so disappointed with the service.", 'negative'),
    ("The weather is quite neutral today, neither good nor bad.", 'neutral'),
    ("Loved every bit of it, truly fantastic!", 'positive'),
    ("This movie was terrible, a complete waste of time.", 'negative'),
    ("It's okay, nothing special.", 'neutral'),
    ("Excellent customer support!", 'positive'),
    ("Very poor quality, I regret buying it.", 'negative'),
    ("The news report was unbiased and factual.", 'neutral'),
    ("Absolutely brilliant, couldn't be happier!", 'positive'),
    ("I have mixed feelings about this, somewhat confusing.", 'neutral'),
    ("What a horrible experience.", 'negative'),
    ("The food was decent.", 'neutral'),
    ("Such a wonderful day!", 'positive'),
    ("I'm feeling indifferent.", 'neutral'),
    ("This service is perfect!", 'positive'),
    ("Extremely frustrating situation.", 'negative'),
    ("The article presented a balanced view.", 'neutral'),
]

# Column-oriented view of the same records, in the same order.
data = {
    'text': [sentence for sentence, _ in labelled_samples],
    'label': [sentiment for _, sentiment in labelled_samples],
}

df = pd.DataFrame(data)

In [None]:
# Turn the string sentiment labels into integer class ids. The fitted encoder is
# kept around because predictions are decoded back to strings with it later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['label'])
df['label_id'] = encoded_labels

In [None]:
# Pretrained tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every sentence in a single call, with truncation and padding
# enabled, and ask for PyTorch tensors ('pt') back.
sentences = list(df['text'])
encodings = tokenizer(
    sentences,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class SentimentDataset(Dataset):
    """Dataset that pairs tokenizer encodings with one label per sample.

    Args:
        encodings: Mapping of field name to an indexable batch; every value
            is indexed with the sample position in ``__getitem__``.
        labels: Sequence of labels, one per sample, in the same order as the
            rows of ``encodings``. Its length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of its encoding fields plus ``'labels'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        label = torch.tensor(self.labels[idx])
        # Class ids can arrive as int8/int32 (e.g. category codes or a
        # downcast column); classification losses expect int64 targets, so
        # promote them here. Floating-point targets are left untouched.
        if not torch.is_floating_point(label):
            label = label.long()
        item['labels'] = label
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

# Bundle the tokenized sentences with their integer label ids; both come from
# `df`, so they share the same row order.
label_id_list = list(df['label_id'])
dataset = SentimentDataset(encodings=encodings, labels=label_id_list)

In [None]:
# Seed torch before the model is built: the classifier head is not part of the
# checkpoint and is initialised randomly, so without a seed each fresh kernel
# starts fine-tuning from different weights.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Take the class list from the fitted encoder rather than hard-coding the
# count, so the head size cannot drift from the labels in the data. The id/name
# maps are stored in the model config as well.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning settings, gathered in one place before being handed to
# TrainingArguments.
training_options = dict(
    output_dir="./results",
    num_train_epochs=40,
    per_device_train_batch_size=2,
    logging_dir='./logs',
    logging_steps=10,      # training loss is reported every 10 steps
    eval_strategy="no",    # no evaluation set is passed to the Trainer below
    report_to="none",
)
training_args = TrainingArguments(**training_options)

# The Trainer receives only the model, the arguments and the training set.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

In [None]:
# Run fine-tuning; the returned summary is echoed as the cell's output.
train_result = trainer.train()
train_result

Step,Training Loss
10,0.7553
20,0.4525
30,0.1786
40,0.0688
50,0.0392
60,0.0061
70,0.0064
80,0.0028
90,0.002
100,0.0014


TrainOutput(global_step=560, training_loss=0.02744445334537886, metrics={'train_runtime': 68.9992, 'train_samples_per_second': 15.652, 'train_steps_per_second': 8.116, 'total_flos': 7770068117280.0, 'train_loss': 0.02744445334537886, 'epoch': 40.0})

In [None]:
def predict_sentiment(text):
    """Classify ``text`` and return its sentiment label.

    Uses the notebook-level ``tokenizer``, ``model`` and ``label_encoder``.

    Args:
        text: Sentence to classify; passed to the tokenizer as-is.

    Returns:
        The label that ``label_encoder`` maps the highest-scoring class id to.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Move input tensors to the same device as the model
    inputs = {key: val.to(model.device) for key, val in inputs.items()}

    # The model may still be in training mode after fine-tuning, which keeps
    # dropout active and makes predictions vary between calls. Switch to eval
    # mode for the forward pass, then restore whatever mode was set before.
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for inference; skip building the graph.
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    pred = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([pred])[0]

# Test examples: print the predicted label for each sentence, in order.
test_sentences = [
    "This is the best product I've used!",  # Expected: positive
    "Wow! its an internship at IITG",       # Expected: positive
    "It doesn't work at all.",              # Expected: negative
    "It's okay, nothing special.",          # Expected: neutral (also appears verbatim in the training data)
]
for sentence in test_sentences:
    print(predict_sentiment(sentence))

positive
positive
negative
neutral
