In [None]:

import os
# Turn off Weights & Biases logging before transformers is imported.
# NOTE: transformers reports this variable as deprecated (removal planned for v5)
# and points to the `report_to` training argument as the replacement.
os.environ["WANDB_DISABLED"] = "true"  # Disable W&B logging

In [None]:
# Install dependencies (if running in Google Colab or locally)
# !pip install transformers datasets scikit-learn pandas torch

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset as HFDataset

In [None]:
# 1. Load dataset
import pandas as pd

# Read the review CSV into a DataFrame and preview its first rows.
csv_path = "/content/Bengali Review Dataset.csv"
df = pd.read_csv(csv_path)

print("First 5 rows:", df.head(), sep="\n")

First 5 rows:
                              Reviews Sentiment
0     অসাধারণ নিশো বস্ আর অমি ভাইকেও।  positive
1   "এত মোটা বাশ নিতে পারছি না বাবা "  negative
2                  নাটক আসলেই অসাধারণ  positive
3                     ফালতু একটা নাটক  negative
4         ধুমপান সাস্থর জন্য ক্ষতিকর।  negative


In [None]:
# Column names as they appear in the CSV header: 'Reviews' holds the review
# text and 'Sentiment' holds the class label.
text_column = 'Reviews'  # Change if your column name is different
label_column = 'Sentiment'  # Change if your column name is different

In [None]:
# 2. Encode labels
from sklearn.preprocessing import LabelEncoder
# Learn the label vocabulary from the sentiment column, then store the
# resulting integer codes next to the original strings.
label_encoder = LabelEncoder()
label_encoder.fit(df[label_column])
df['label_encoded'] = label_encoder.transform(df[label_column])

In [None]:
from sklearn.model_selection import train_test_split

# 3. Train-test split
# Stratify on the encoded label so the 80/20 split keeps the same class
# proportions in both parts; a plain random split can skew the evaluation
# set when the classes are imbalanced.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label_encoded'],
)

In [None]:
# 4. Load tokenizer
from transformers import AutoTokenizer
# Hub identifier of the pretrained checkpoint; the same name is reused later
# when the classification model is loaded, so tokenizer and model stay matched.
model_name = "sagorsarker/bangla-bert-base"
# Fetch the tokenizer (vocabulary + config) that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [None]:
from datasets import Dataset as HFDataset

# 5. Tokenization function
def tokenize_function(examples):
    """Tokenize the review texts of one batch.

    Args:
        examples: Batch mapping that contains the review strings under
            the notebook-level ``text_column`` key.

    Returns:
        The output of the notebook-level ``tokenizer`` for those texts, with
        every sequence truncated and padded to exactly 128 tokens.
    """
    texts = examples[text_column]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Convert pandas DataFrame → Hugging Face Dataset
def _to_tokenized_dataset(frame):
    """Build a tokenized Hugging Face dataset from one split of the data.

    Args:
        frame: DataFrame holding the ``text_column`` and ``'label_encoded'``
            columns.

    Returns:
        A dataset with an integer ``labels`` column plus the fields produced
        by ``tokenize_function``; the raw text column is dropped.
    """
    # Work on a fresh two-column frame instead of assigning into the split
    # itself: the split frames come from train_test_split and writing to them
    # can trigger pandas' SettingWithCopyWarning.
    columns = frame[[text_column, 'label_encoded']].rename(columns={'label_encoded': 'labels'})
    columns = columns.astype({'labels': int})

    # After shuffling, the frame carries a non-default index; without
    # preserve_index=False it would be stored as a stray '__index_level_0__'
    # column in the dataset.
    dataset = HFDataset.from_pandas(columns, preserve_index=False)
    dataset = dataset.map(tokenize_function, batched=True)

    # Keep only the columns required by the Trainer
    return dataset.remove_columns([text_column])


train_dataset = _to_tokenized_dataset(train_df)
test_dataset = _to_tokenized_dataset(test_df)

Map:   0%|          | 0/9445 [00:00<?, ? examples/s]

Map:   0%|          | 0/2362 [00:00<?, ? examples/s]

In [None]:
# 6. Load model
from transformers import AutoModelForSequenceClassification
# Store the label names in the model config so that saved checkpoints and
# inference pipelines report the original sentiment names instead of the
# generic 'LABEL_0', 'LABEL_1', ... placeholders.
id2label = {index: str(name) for index, name in enumerate(label_encoder.classes_)}
label2id = {name: index for index, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 7. Training arguments
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    eval_strategy="epoch",           # evaluate once per epoch (formerly `evaluation_strategy`)
    save_strategy="epoch",           # must match eval_strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,     # restore the best checkpoint when training finishes
    # Disable external logging integrations (W&B etc.) explicitly; this is the
    # replacement that transformers recommends for the deprecated
    # WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
from transformers import Trainer

# 8. Define Trainer
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Evaluation output from the Trainer, exposing ``predictions``
            and ``label_ids``.
            Assumes ``predictions`` is a single array of logits with one row
            per example — TODO confirm if the model head is ever swapped.

    Returns:
        Dict with a single ``"accuracy"`` entry (fraction of correct predictions).
    """
    predicted_ids = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in recent transformers releases;
    # `processing_class=` is the replacement argument.
    processing_class=tokenizer,
    # Without this, evaluation reports the loss only.
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# 9. Train model
# Fine-tunes for the number of epochs set in training_args; with the
# strategies configured there, evaluation and checkpointing run once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1467,0.156554
2,0.1201,0.154665
3,0.0389,0.1919


TrainOutput(global_step=1773, training_loss=0.11524549175371104, metrics={'train_runtime': 895.262, 'train_samples_per_second': 31.65, 'train_steps_per_second': 1.98, 'total_flos': 1863812938406400.0, 'train_loss': 0.11524549175371104, 'epoch': 3.0})

In [None]:
# 10. Evaluate model
# Runs evaluation on the eval_dataset handed to the Trainer (the test split)
# and returns a dict of metrics, which is printed below.
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.15466530621051788, 'eval_runtime': 17.4683, 'eval_samples_per_second': 135.216, 'eval_steps_per_second': 8.472, 'epoch': 3.0}


In [None]:
# 11. Test with a custom review
sample_text = "এই সিনেমাটি ভালো লাগেনি ।"
inputs = tokenizer(sample_text, return_tensors="pt", padding=True, truncation=True, max_length=128)

# Put the model and the inputs on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode: disables dropout so the prediction is deterministic
inputs = {key: value.to(device) for key, value in inputs.items()}

# No gradients are needed for a prediction; skipping them saves memory and time
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring logit → class index → original sentiment string
predicted_class = torch.argmax(outputs.logits, dim=1).item()
print(sample_text)
print("Predicted Sentiment:", label_encoder.inverse_transform([predicted_class])[0])

এই সিনেমাটি ভালো লাগেনি ।
Predicted Sentiment: negative


🔹 Model Used

Bangla BERT Base: "sagorsarker/bangla-bert-base" (not to be confused with the separate BanglaBERT model, "csebuetnlp/banglabert")

Task: Sentiment classification (labels taken from your dataset's Sentiment column)

🔹 Optimizer

Automatically handled by Hugging Face Trainer (uses AdamW internally)

Learning rate: 2e-5

Weight decay: 0.01

🔹 Training Setup

Epochs: 3

Batch size: 16 (train & eval)

Train/test split: 80% train, 20% test

Device: GPU if available



🔹 Evaluation

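Done using trainer.evaluate() on the test dataset. Note that this same 20% split is also used for per-epoch validation and best-checkpoint selection during training, so the reported numbers do not come from an untouched hold-out set.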

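Metrics returned: evaluation loss plus runtime/throughput statistics (default Trainer behavior). Accuracy is not reported by default — it is only returned when a `compute_metrics` function is passed to the Trainer.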


🔹 Prediction Method

Tokenize new Bangla text with AutoTokenizer

Pass to trained model → take argmax over logits

Map predicted index back to sentiment label using LabelEncoder