# Fine-Tuning BERT for Text Classification

# Dataset Preparation
## Load the CSV

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW # Import AdamW from torch.optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
file_path = '/content/drive/MyDrive/BertDataset/comments.csv'
df = pd.read_csv(file_path)

## check

In [4]:
# prompt: print top 5 rows

print(df.head())


                                                text     label
0  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...  Positive
1  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...  Positive
2  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...  Positive
3  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...  Positive
4  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...  Positive


In [5]:
print(f"Total rows in CSV: {len(df)}")
print(df.head())

Total rows in CSV: 97554
                                                text     label
0  ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...  Positive
1  أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...  Positive
2  هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...  Positive
3  خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...  Positive
4  ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...  Positive


In [6]:
# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

## Preprocess Text

In [7]:
# Lowercase all labels
df['label'] = df['label'].str.lower()


In [8]:
# Clean text - remove only specified symbols
def clean_text(text):
    return re.sub(r'[?&#$%@*^]', '', str(text))

df['cleaned_text'] = df['text'].apply(clean_text)

# Convert labels to numerical values
label_map = {'positive': 0, 'mixed': 1, 'negative': 2}
df['label_encoded'] = df['label'].map(label_map)

In [9]:
# Shuffle the DataFrame
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [10]:
# Check if cleaning produced valid text
print("Sample texts after cleaning:")
print(df['cleaned_text'].head())
print(f"Empty texts after cleaning: {df['cleaned_text'].isnull().sum()}")

Sample texts after cleaning:
0    مرضي. قرب المكان من الحرم استقبالك بالقهوه وبا...
1    مخيب للأمل. السعر فقط. النظافة سيئة جدا والموا...
2    تعرفت علي تفاصيل لم تكن معلومة بالنسبه لي عن ع...
3    لا يمكن أن يكون من الكتب التي تقرأ مره واحدة ....
4         جيد جدا . ما افطرت. عدم وجود مواقف للسيارة،،
Name: cleaned_text, dtype: object
Empty texts after cleaning: 0


In [11]:
print("Unique labels in dataset:", df['label'].unique())

Unique labels in dataset: ['mixed' 'negative' 'positive' nan]


In [12]:
print("NaN in label_encoded:", df['label_encoded'].isna().sum())

NaN in label_encoded: 3


In [13]:
# Remove rows with NaN values in 'label_encoded' column
df = df.dropna(subset=['label_encoded'])

# Verify if there are any NaN values left
print("NaN values in 'label_encoded' after dropping rows:", df['label_encoded'].isna().sum())


NaN values in 'label_encoded' after dropping rows: 0


In [14]:
test_text = "هل هذا مثال؟ #اختبار"
cleaned = re.sub(r'[?&#$%@*^]', '', test_text)
print(f"Before: {test_text}\nAfter: {cleaned}")

Before: هل هذا مثال؟ #اختبار
After: هل هذا مثال؟ اختبار


##Split Data (70% Train, 30% Test

In [15]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['cleaned_text'],
    df['label_encoded'],
    test_size=0.3,
    random_state=42,
    stratify=df['label_encoded']  # Maintain class balance
)

# BERT Model Setup

## Choose Model

In [16]:
# Load Arabic BERT model and tokenizer
model_name = 'asafaya/bert-base-arabic'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenization

##Determine max_len

In [17]:
all_lengths = [len(tokenizer.encode(text)) for text in df['cleaned_text']]
max_len = max(all_lengths) + 2  # Add 2 for [CLS] and [SEP]
print(f"Using max_len: {max_len}")

Using max_len: 442


In [18]:
train_encodings = tokenizer(
    train_texts.tolist(),
    truncation=True,
    padding='max_length',
    max_length=max_len,
    return_tensors='pt'
)

test_encodings = tokenizer(
    test_texts.tolist(),
    truncation=True,
    padding='max_length',
    max_length=max_len,
    return_tensors='pt'
)

# Training Configuration
## Batch Size Selection

In [19]:
# Create PyTorch datasets
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(train_labels.tolist(), dtype=torch.long) # Ensure labels are LongTensor
)

test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    torch.tensor(test_labels.tolist(), dtype=torch.long) # Ensure labels are LongTensor
)

In [20]:
# Determine batch size (start with 8 and increase)
batch_size = 8
gradient_accumulation_steps = 8

## Hyperparameters

In [21]:
# Create data loaders
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=batch_size
)

# Train Setup

In [22]:
# prompt: print labels to check format

print(df['label'].unique())
print(df['label_encoded'].unique())


['mixed' 'negative' 'positive']
[1. 2. 0.]


In [23]:
# prompt: check if lables 1d shape
# Check if labels are 1D
print(f"Shape of train_labels: {train_labels.shape}")
print(f"Shape of test_labels: {test_labels.shape}")

if train_labels.ndim == 1 and test_labels.ndim ==1:
    print("Labels are 1D arrays")
else:
    print("Labels are NOT 1D arrays")


Shape of train_labels: (68285,)
Shape of test_labels: (29266,)
Labels are 1D arrays


In [24]:
# prompt: convert float lables into integeres, print new lables

# Convert float labels to integers
df['label_encoded'] = df['label_encoded'].astype(int)

# Print the new labels
print(df['label_encoded'])


0        1
1        2
2        0
3        0
4        0
        ..
97549    0
97550    1
97551    2
97552    0
97553    0
Name: label_encoded, Length: 97551, dtype: int64


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_dataloader) // gradient_accumulation_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

In [26]:
#Training loop (1 epoch)
model.train()
for epoch in range(1):
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        # Convert labels to one-hot encoding
        labels = batch[2]
        one_hot_labels = torch.nn.functional.one_hot(labels, num_classes=3).float().to(device)
        inputs['labels'] = one_hot_labels

        outputs = model(**inputs)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps
        loss.backward()
        total_loss += loss.item()

        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_train_loss:.4f}")

Epoch 1, Average Loss: 0.0514


#  Evaluation

In [29]:
#Training loop (1 epoch)
model.train()
for epoch in range(1):
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]  # Use the original class indices as labels
        }

        # Remove the one-hot encoding conversion

        outputs = model(**inputs)  # Model handles loss calculation internally
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps
        loss.backward()
        total_loss += loss.item()

        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_train_loss:.4f}")

ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 3]))

## Confusion Matrix

In [30]:
# Confusion matrix
cm = confusion_matrix(true_labels, predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_map.keys(),
            yticklabels=label_map.keys())
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


ValueError: zero-size array to reduction operation fmin which has no identity

<Figure size 800x600 with 0 Axes>

# Test on Custom Sentences

In [31]:
custom_sentences = [
    "الفيلم كان رائعاً!",                   # Positive
    "الخدمة مقبولة لكن بطيئة",              # Mixed
    "أسوأ تجربة في حياتي"                   # Negative
]

cleaned_sentences = [clean_text(s) for s in custom_sentences]
encoded_inputs = tokenizer(
    cleaned_sentences,
    truncation=True,
    padding='max_length',
    max_length=max_len,
    return_tensors='pt'
).to(device)

with torch.no_grad():
    outputs = model(**encoded_inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)

reverse_label_map = {v: k for k, v in label_map.items()}
for i, sentence in enumerate(custom_sentences):
    print(f"Sentence: {sentence}")
    print(f"Predicted sentiment: {reverse_label_map[predictions[i].item()]}")
    print("-" * 50)

Sentence: الفيلم كان رائعاً!
Predicted sentiment: negative
--------------------------------------------------
Sentence: الخدمة مقبولة لكن بطيئة
Predicted sentiment: mixed
--------------------------------------------------
Sentence: أسوأ تجربة في حياتي
Predicted sentiment: negative
--------------------------------------------------


##