In [None]:
!pip install transformers==4.51.3 pandas scikit-learn torch numpy tqdm



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm
import numpy as np
# Read the labelled examples from disk. Later cells rely on the 'text' and
# 'label' columns of this frame.
csv_path = 'train_data.csv'
df = pd.read_csv(csv_path)


In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index that was saved along
# with the CSV — confirm against the data source); it is not used as a feature.
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,text,label
0,"*in a friendly, approachable manner* Absolutel...",haiku
1,The main difference between a net profit share...,meta
2,"Okay, let me walk you through the key differen...",haiku
3,Let me walk you through the key points about t...,meta
4,"Okay, let me walk you through the key differen...",haiku


In [None]:
# Discard rows that are missing either the text or its label.
df = df.dropna(subset=['text', 'label'])

# Ensure all texts are strings
texts = df['text'].astype(str).tolist()

# Encode the class names as integer ids. Series.map returns NaN for any value
# that is absent from the mapping, which would silently turn the labels into
# floats containing NaN (not valid class ids). Fail fast and report the
# offending values instead.
label_ids = df['label'].map({'haiku': 0, 'meta': 1})
if label_ids.isna().any():
    unknown = sorted(df.loc[label_ids.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values: {unknown}")
labels = label_ids.astype(int).tolist()


# Split the data (80% train / 20% validation, fixed seed for a repeatable split)
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Seed torch before the model is built: the classification head is newly
# initialised (it is not part of the pretrained checkpoint), and later cells
# shuffle batches. Same seed value as the split above; this improves
# run-to-run repeatability but does not guarantee bit-identical GPU results.
torch.manual_seed(42)

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize both splits with the same settings: sequences are truncated to at
# most 512 tokens and padded to the longest sequence within each call.
tokenizer_options = {'truncation': True, 'padding': True, 'max_length': 512}
train_encodings = tokenizer(train_texts, **tokenizer_options)
val_encodings = tokenizer(val_texts, **tokenizer_options)

# Bundle (input_ids, attention_mask, label) tensors into torch datasets.
# The tuple order matters: the training and validation loops index batches
# positionally as batch[0], batch[1], batch[2].
train_tensors = (
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels),
)
train_dataset = TensorDataset(*train_tensors)

val_tensors = (
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels),
)
val_dataset = TensorDataset(*val_tensors)

# Batch iterators: only the training data is shuffled; validation keeps its
# original order.
val_loader = DataLoader(val_dataset, batch_size=16)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)


In [None]:
# Move model to GPU when one is available, otherwise keep it on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Directory that receives the fine-tuned weights and the tokenizer files.
output_dir = '/fine_tuned_bert_Majidi'

# Training loop.
# The checkpoint on disk is refreshed whenever the mean validation loss
# improves, so the saved model is the best epoch rather than simply the last
# one (validation loss can start rising again while training continues).
num_epochs = 12
best_val_loss = float('inf')
best_epoch = None  # 1-based epoch whose weights are currently on disk
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validation'):
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[2]}
            outputs = model(**inputs)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == inputs['labels']).sum().item()
            total += inputs['labels'].size(0)

    # Loss is averaged over batches; accuracy is computed over samples.
    mean_val_loss = val_loss / len(val_loader)
    val_accuracy = correct / total
    print(f'Validation Loss: {mean_val_loss:.4f}, Accuracy: {val_accuracy:.4f}')

    # Persist the weights only when this epoch beats the best loss seen so far.
    if mean_val_loss < best_val_loss:
        best_val_loss = mean_val_loss
        best_epoch = epoch + 1
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

# Fallback: if no epoch ever improved on the initial value (e.g. zero epochs
# or a NaN loss), still write the current weights so the output directory
# always exists for the cells that load from it.
if best_epoch is None:
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

print("Fine-tuning complete. Model saved.")

Epoch 1/12: 100%|██████████| 210/210 [05:05<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0478, Accuracy: 0.9881


Epoch 2/12: 100%|██████████| 210/210 [05:04<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0214, Accuracy: 0.9929


Epoch 3/12: 100%|██████████| 210/210 [05:04<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.00it/s]


Validation Loss: 0.0196, Accuracy: 0.9941


Epoch 4/12: 100%|██████████| 210/210 [05:05<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0308, Accuracy: 0.9952


Epoch 5/12: 100%|██████████| 210/210 [05:05<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0178, Accuracy: 0.9941


Epoch 6/12: 100%|██████████| 210/210 [05:05<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0184, Accuracy: 0.9941


Epoch 7/12: 100%|██████████| 210/210 [05:05<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0180, Accuracy: 0.9941


Epoch 8/12: 100%|██████████| 210/210 [05:04<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0188, Accuracy: 0.9941


Epoch 9/12: 100%|██████████| 210/210 [05:04<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0194, Accuracy: 0.9941


Epoch 10/12: 100%|██████████| 210/210 [05:04<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0200, Accuracy: 0.9941


Epoch 11/12: 100%|██████████| 210/210 [05:04<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0207, Accuracy: 0.9941


Epoch 12/12: 100%|██████████| 210/210 [05:04<00:00,  1.45s/it]
Validation: 100%|██████████| 53/53 [00:26<00:00,  2.01it/s]


Validation Loss: 0.0212, Accuracy: 0.9941
Fine-tuning complete. Model saved.


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score
import torch
from torch.utils.data import DataLoader, TensorDataset

# Reload the persisted checkpoint so the metrics below describe what is
# actually stored on disk rather than the in-memory training objects.
model_path = '/fine_tuned_bert_Majidi'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Put the model on the available device and switch to inference mode
# (eval() disables dropout).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# Re-encode the validation texts with the reloaded tokenizer and rebuild the
# (input_ids, attention_mask, label) dataset and its loader.
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
val_features = [torch.tensor(val_encodings[key]) for key in ('input_ids', 'attention_mask')]
val_dataset = TensorDataset(*val_features, torch.tensor(val_labels))
val_loader = DataLoader(val_dataset, batch_size=16)

# Collect the predicted and true class ids for every validation example.
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        # No labels are passed here: only the logits are needed.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = logits.argmax(dim=1)

        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

# Aggregate metrics; precision/recall/F1 are averaged across the classes,
# weighted by each class's support.
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1 = (
    score_fn(all_labels, all_preds, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class breakdown; the names follow the 0 -> haiku, 1 -> meta encoding.
report = classification_report(all_labels, all_preds, target_names=['haiku', 'meta'])
print("\nClassification Report:\n", report)


Validation Accuracy: 0.9941
Precision: 0.9941
Recall: 0.9941
F1 Score: 0.9941

Classification Report:
               precision    recall  f1-score   support

       haiku       0.99      1.00      0.99       415
        meta       1.00      0.99      0.99       426

    accuracy                           0.99       841
   macro avg       0.99      0.99      0.99       841
weighted avg       0.99      0.99      0.99       841

