# **Fine-Tuning a Sentiment Analysis Model for Urdu IMDb Reviews**

[Read the accompanying blog post](https://medium.com/@rameeshamalik.143/fine-tuning-sentiment-analysis-model-for-urdu-imdb-reviews-using-transformers-7710a715d233)

In [None]:
# Mount Google Drive at /content/drive; the dataset and the saved model used
# by later cells live under /content/drive/MyDrive/NLP/Q2.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install Libraries

In [None]:
# Install essential libraries
# (the leading "!" is the notebook shell escape, so this line only runs inside
# IPython/Colab, not as plain Python)
!pip install torch transformers streamlit datasets


Collecting streamlit
  Downloading streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloadi

# Dataset

In [None]:

# Verify the dataset
import pandas as pd
# Read the training CSV from Drive and preview its first rows.
csv_path = '/content/drive/MyDrive/NLP/Q2/imdb_urdu_reviews_train.csv'
data = pd.read_csv(csv_path)
preview = data.head()
print(preview)


                                              review sentiment
0  میں نے اسے 80 کی دہائی کے وسط میں ایک کیبل گائ...  positive
1  چونکہ میں نے 80 کی دہائی میں انسپکٹر گیجٹ کارٹ...  negative
2  ایک ایسے معاشرے کی حالت کے بارے میں تعجب کرتا ...  positive
3  مفید البرٹ پیون کی طرف سے ایک اور ردی کی ٹوکری...  negative
4  یہ کولمبو ہے جس کی ہدایتکاری اپنے کیریئر کے اب...  positive


# Model and Tokenizer Setup

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the multilingual BERT tokenizer and a matching 2-label classifier.
# Both come from the same checkpoint so vocabulary and embeddings agree.
checkpoint = "bert-base-multilingual-cased"  # Replace with specific model if needed
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# Check model architecture
print(model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

# Dataset Class

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from tqdm import tqdm

class UrduIMDbDataset(Dataset):
    """Map-style dataset that tokenizes one review per access.

    Args:
        texts: Reviews, either a pandas Series (any index) or a plain
            sequence that supports integer indexing, e.g. a list.
        labels: Iterable of sentiment strings parallel to ``texts``.
            ``'positive'`` becomes 1; every other value becomes 0.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors="pt")`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length each review is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        # Binary encoding: 'positive' -> 1, anything else -> 0
        self.labels = [1 if label == 'positive' else 0 for label in labels]
        if len(self.labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized review and its label at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (leading
            dimension removed via ``squeeze(0)``) and ``'label'`` as a
            ``torch.long`` scalar tensor.
        """
        texts = self.texts
        # A sliced pandas Series keeps its original index labels, so positional
        # access has to go through .iloc; plain sequences are indexed directly.
        text = texts.iloc[idx] if hasattr(texts, 'iloc') else texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops the leading dimension so that the DataLoader's
            # own batching adds the batch dimension back.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),  # Ensure labels are integers
        }


# Split dataset: the first 800 rows are used for training and every remaining
# row for validation (no shuffling before the split).
train_size = 800
batch_size = 16

reviews = data['review']
sentiments = data['sentiment']
train_texts, val_texts = reviews[:train_size], reviews[train_size:]
train_labels, val_labels = sentiments[:train_size], sentiments[train_size:]

train_dataset = UrduIMDbDataset(train_texts, train_labels, tokenizer)
val_dataset = UrduIMDbDataset(val_texts, val_labels, tokenizer)

# Only the training batches are shuffled; validation order is kept as-is.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Fine-Tuning the Model

In [None]:


# Fine-tuning process: three passes over train_loader, one optimizer step per batch.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated in favour of it. eps and weight_decay are spelled out with the
# values the transformers implementation used by default, so the update rule
# stays as close as possible to the previous one.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(3):  # Fine-tune for 3 epochs
    model.train()  # training mode (dropout active)
    loop = tqdm(train_loader, leave=True)
    loop.set_description(f'Epoch {epoch}')  # constant for the whole epoch
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass; passing labels makes the model return the loss as well
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show the latest batch loss on the progress bar
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 50/50 [00:22<00:00,  2.22it/s, loss=0.688]
Epoch 1: 100%|██████████| 50/50 [00:18<00:00,  2.76it/s, loss=0.721]
Epoch 2: 100%|██████████| 50/50 [00:18<00:00,  2.73it/s, loss=0.742]


# Evaluation

In [None]:
    # Evaluation: one pass over val_loader, reporting mean loss and accuracy.
    # NOTE(review): this cell is uniformly indented. IPython strips a common
    # leading indent, but the cell would need dedenting to run as a plain script.
    model.eval()  # Set model to evaluation mode
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():  # Disable gradient computation during evaluation
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            batch_size = labels.size(0)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            # outputs.loss is averaged over the batch, so weight it by the
            # batch size; otherwise a smaller final batch would count as much
            # as a full one in the overall mean.
            val_loss += outputs.loss.item() * batch_size

            # Calculate accuracy
            preds = logits.argmax(dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += batch_size

    # Per-sample averages over the whole validation set
    avg_val_loss = val_loss / total_predictions
    val_accuracy = correct_predictions / total_predictions
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


Validation Loss: 0.6961, Validation Accuracy: 0.5026


# Saving and Loading the Model

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# Drive directory so they can be reloaded together.
save_dir = '/content/drive/MyDrive/NLP/Q2/urdu_imdb_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('/content/drive/MyDrive/NLP/Q2/urdu_imdb_model/tokenizer_config.json',
 '/content/drive/MyDrive/NLP/Q2/urdu_imdb_model/special_tokens_map.json',
 '/content/drive/MyDrive/NLP/Q2/urdu_imdb_model/vocab.txt',
 '/content/drive/MyDrive/NLP/Q2/urdu_imdb_model/added_tokens.json',
 '/content/drive/MyDrive/NLP/Q2/urdu_imdb_model/tokenizer.json')

# Prediction

In [None]:
# Load model and tokenizer (if you need to load it after saving)
# Go through the Auto* classes instead of calling from_pretrained on the
# existing `model` / `tokenizer` objects, so reloading does not depend on
# those objects having been created earlier in the session.
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/NLP/Q2/urdu_imdb_model')
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/NLP/Q2/urdu_imdb_model')
# Same device selection as in the fine-tuning cell, repeated so this cell
# does not rely on that cell having run.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Example inference function
def predict_sentiment(text):
    """Classify a single review as "positive" or "negative".

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The text is
    padded/truncated to 128 tokens and the class with the highest logit wins
    (1 -> "positive", anything else -> "negative").
    """
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only, so skip gradient tracking
    with torch.no_grad():
        class_id = model(ids, attention_mask=mask).logits.argmax(dim=1).item()

    if class_id == 1:
        return "positive"
    return "negative"

# Run the classifier on one sample Urdu sentence and show the result
text = "کہانی کافی بورنگ تھی"
predicted_sentiment = predict_sentiment(text)
print(predicted_sentiment)  # Output: "negative"

negative


# Classification Report

In [None]:
from sklearn.metrics import classification_report

# Collect all predictions and labels for validation set
all_preds = []
all_labels = []

model.eval()  # make sure the model is in evaluation mode, whatever cell ran last
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Only the logits are needed here, so the labels stay on the CPU and
        # are not passed to the model (no loss is computed).
        logits = model(input_ids, attention_mask=attention_mask).logits
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['label'].tolist())

# Print classification report. Class ids follow the dataset encoding
# (0 = negative, 1 = positive); listing both ids explicitly keeps the row
# names aligned even if one class never appears.
print(classification_report(
    all_labels,
    all_preds,
    labels=[0, 1],
    target_names=['negative', 'positive'],
))


              precision    recall  f1-score   support

           0       0.50      1.00      0.67     19667
           1       0.61      0.00      0.01     19533

    accuracy                           0.50     39200
   macro avg       0.55      0.50      0.34     39200
weighted avg       0.55      0.50      0.34     39200

