<a href="https://colab.research.google.com/github/Sanchit9587/Pokemon_Hack2_Guild_App/blob/NER%26NLP/FineTuning_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers==4.41.2
!pip install torch==2.3.0
!pip install scikit-learn==1.5.0
!pip install accelerate==0.30.1

Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m112.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m123.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers


In [3]:
import json
import glob
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np


In [7]:
print("Step 1: Setting up configuration...")

# --- Configuration Block ---
# You can modify these hyperparameters
MODEL_CHECKPOINT = "bert-large-cased"
MAX_LENGTH = 256  # Max sequence length
BATCH_SIZE = 4    # Batch size for training, kept small for T4 GPU
ACCUMULATION_STEPS = 4 # Gradient accumulation steps to simulate a larger batch size (4 * 4 = 16)
EPOCHS = 5        # Number of training epochs
LEARNING_RATE = 3e-5
SEED = 42         # Seed for torch RNGs (classifier-head init, DataLoader shuffling)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch once, up front, so repeated runs are comparable.
torch.manual_seed(SEED)

# --- Google Drive Path ---
DRIVE_MOUNT_PATH = "/content/drive"
SAVE_PATH = os.path.join(DRIVE_MOUNT_PATH, "MyDrive/PokemonNERModel")

# Mount Drive BEFORE creating SAVE_PATH. On a fresh runtime, creating the
# directory first would populate /content/drive with local folders, and
# drive.mount() refuses a mount point that already contains files.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable inside Colab; elsewhere SAVE_PATH is
    # simply created as a local directory.
    print("Not in a Colab environment; the save path will be a local directory.")
else:
    drive.mount(DRIVE_MOUNT_PATH)

# Create necessary directories (now inside the mounted Drive when in Colab)
os.makedirs(SAVE_PATH, exist_ok=True)


print(f"Device: {DEVICE}")
print(f"Model will be saved to: {SAVE_PATH}")
print("Configuration complete.")

Step 1: Setting up configuration...
Device: cuda
Model will be saved to: /content/drive/MyDrive/PokemonNERModel
Configuration complete.


In [8]:
print("\nStep 2: Mounting Google Drive...")
try:
    # google.colab is only importable inside a Colab runtime.
    from google.colab import drive
except ImportError:
    print("Not in a Colab environment. Skipping Google Drive mount.")
else:
    # Kept outside the try block so that a genuine failure while mounting is
    # raised as-is instead of being misreported as "not in Colab".
    drive.mount(DRIVE_MOUNT_PATH)
    print("Google Drive mounted successfully.")


Step 2: Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [9]:
print("\nStep 3: Loading and preparing data...")

# The user specified the dataset file path
DATA_FILE_PATH = "/content/dataset.json"

# Load the single JSON file specified by the user.
# Fail fast when it is missing: continuing with a placeholder example would
# overwrite the saved tag mappings with placeholder tags and then fail anyway
# in train_test_split (one example cannot be split 80/20).
try:
    # Explicit encoding so non-ASCII names decode the same on every platform.
    with open(DATA_FILE_PATH, 'r', encoding='utf-8') as f:
        all_data = json.load(f)
except FileNotFoundError:
    print("\n!!! ERROR !!!")
    print(f"Dataset file not found at '{DATA_FILE_PATH}'.")
    print("Please make sure you have uploaded 'dataset.json' to the '/content/' directory in Colab.")
    raise

if not all_data:
    raise ValueError(f"Dataset file '{DATA_FILE_PATH}' contains no examples.")
print(f"Successfully loaded {len(all_data)} examples from '{DATA_FILE_PATH}'.")


# --- Define Labels ---
# Create mappings for our specific NER tags. Each example's 'annotations' is a
# list of [word, tag] pairs; sorting makes the tag -> id assignment deterministic.
unique_tags = sorted({tag for example in all_data for _, tag in example['annotations']})
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}
NUM_LABELS = len(unique_tags)

# Save the mappings to the drive for later use during inference.
# NOTE: JSON object keys are always strings, so the integer keys of id2tag are
# written as "0", "1", ...; convert them back with int() after loading.
with open(os.path.join(SAVE_PATH, 'tag_mappings.json'), 'w', encoding='utf-8') as f:
    json.dump({'tag2id': tag2id, 'id2tag': id2tag}, f)

print("Label mappings created and saved.")
print(f"Tags: {tag2id}")

# --- Split the Dataset ---
# Using percentages as requested: 80% train, 10% validation, 10% test
# (20% is held out first, then split evenly into validation and test).
train_data, temp_data = train_test_split(all_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print("Data split complete:")
print(f" - Training examples: {len(train_data)}")
print(f" - Validation examples: {len(val_data)}")
print(f" - Testing examples: {len(test_data)}")


Step 3: Loading and preparing data...
Successfully loaded 199 examples from '/content/dataset.json'.
Label mappings created and saved.
Tags: {'B-ENEMY_SPECIES': 0, 'B-FRIENDLY_SPECIES': 1, 'O': 2}
Data split complete:
 - Training examples: 159
 - Validation examples: 20
 - Testing examples: 20


In [10]:
print("\nStep 4: Setting up tokenizer and datasets...")
# Load the fast tokenizer for MODEL_CHECKPOINT. The dataset class below calls
# word_ids() on the tokenizer output to align word-level tags with sub-word tokens.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)

class PokemonNERDataset(Dataset):
    """Dataset that converts word-level NER annotations into model-ready tensors.

    Every element of ``data`` is a dict whose ``'annotations'`` entry is a
    sequence of ``[word, tag]`` pairs. Items are tokenized on access, padded or
    truncated to ``max_len``, and returned with a ``labels`` tensor aligned to
    the sub-word tokens.
    """

    def __init__(self, data, tokenizer, tag2id, max_len):
        """Store the examples, the tokenizer, the tag -> id map and the sequence length."""
        self.data, self.tokenizer = data, tokenizer
        self.tag2id, self.max_len = tag2id, max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and attach aligned labels.

        Only the first sub-word token of each word carries that word's tag id.
        Tokens without a word (where ``word_ids()`` yields ``None``) and the
        remaining sub-word tokens of a word are labelled ``-100``.
        """
        annotations = self.data[idx]['annotations']
        words = [pair[0] for pair in annotations]
        tags = [pair[1] for pair in annotations]

        # The words are already split, so the tokenizer only breaks them into sub-words.
        encoding = self.tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # Pair every position's word index with the index of the position before
        # it (None before the first position) to detect where a new word starts.
        word_ids = encoding.word_ids()
        preceding = [None] + word_ids[:-1]
        aligned = [
            self.tag2id[tags[current]]
            if current is not None and current != before
            else -100
            for current, before in zip(word_ids, preceding)
        ]

        # return_tensors="pt" adds a leading batch dimension; drop it again.
        for key in ("input_ids", "attention_mask", "token_type_ids"):
            encoding[key] = encoding[key].squeeze()
        encoding["labels"] = torch.LongTensor(aligned)

        return encoding

# --- Create Dataset instances ---
# Build the training and validation datasets with the same tokenizer, label
# mapping and sequence length.
train_dataset, val_dataset = (
    PokemonNERDataset(split, tokenizer, tag2id, MAX_LENGTH)
    for split in (train_data, val_data)
)

print("PyTorch Datasets created successfully.")


Step 4: Setting up tokenizer and datasets...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

PyTorch Datasets created successfully.


In [11]:
print("\nStep 5: Loading model and setting up for training...")

# --- Load the Model ---
# The pretrained encoder gets a token-classification head sized to our tag set;
# id2label/label2id are stored in the model config so they are saved with it.
model = BertForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    id2label=id2tag,
    label2id=tag2id
)
model.to(DEVICE)
print(f"'{MODEL_CHECKPOINT}' loaded successfully.")

# --- Dataloaders ---
# Only the training data is shuffled; validation order does not matter.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# --- Optimizer and Scheduler ---
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers optimizer
# used by default, so the hyperparameters are unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0
)
# The training loop steps the optimizer every ACCUMULATION_STEPS batches AND on
# the last batch of each epoch, so the number of optimizer steps per epoch is
# the ceiling (not the floor) of batches / ACCUMULATION_STEPS. Using the floor
# would make the linear schedule reach zero before training ends whenever the
# batch count is not a multiple of ACCUMULATION_STEPS.
steps_per_epoch = (len(train_dataloader) + ACCUMULATION_STEPS - 1) // ACCUMULATION_STEPS
num_training_steps = steps_per_epoch * EPOCHS
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print("Optimizer and learning rate scheduler are ready.")



Step 5: Loading model and setting up for training...


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'bert-large-cased' loaded successfully.
Optimizer and learning rate scheduler are ready.




In [12]:
print("\nStep 6: Starting training process...")

# Fine-tune for EPOCHS passes over the training data using gradient
# accumulation, and report the mean validation loss after every epoch.
for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch + 1} / {EPOCHS} ---")
    # --- Training Phase ---
    model.train()
    total_train_loss = 0
    optimizer.zero_grad() # Zero gradients at the start of the epoch

    progress_bar = tqdm(train_dataloader, desc="Training")
    for i, batch in enumerate(progress_bar):
        # Move batch to DEVICE (GPU when available, otherwise CPU)
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        # The batch includes "labels", so the model output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss

        # Scale loss for gradient accumulation: gradients from successive
        # backward() calls add up, so dividing by the window size averages them.
        loss = loss / ACCUMULATION_STEPS
        total_train_loss += loss.item() * ACCUMULATION_STEPS # Un-scale for logging
        loss.backward()

        # Optimizer step: once per full accumulation window, and also on the
        # final batch so leftover gradients are not discarded. A shorter final
        # window is still divided by ACCUMULATION_STEPS above.
        if (i + 1) % ACCUMULATION_STEPS == 0 or (i + 1) == len(train_dataloader):
            optimizer.step()
            lr_scheduler.step()  # one scheduler step per optimizer step
            optimizer.zero_grad()

        # Show the un-scaled loss of the current batch on the progress bar.
        progress_bar.set_postfix({'loss': loss.item() * ACCUMULATION_STEPS})

    # Mean of the per-batch (un-scaled) losses over the epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # --- Validation Phase ---
    # eval() switches the model to evaluation mode; no_grad() skips gradient tracking.
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validating"):
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_val_loss += loss.item()

    # Mean of the per-batch validation losses.
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Validation Loss: {avg_val_loss:.4f}")


Step 6: Starting training process...

--- Epoch 1 / 5 ---


Training: 100%|██████████| 40/40 [00:22<00:00,  1.80it/s, loss=0.108]


Average Training Loss: 0.2586


Validating: 100%|██████████| 5/5 [00:00<00:00,  5.42it/s]


Validation Loss: 0.0537

--- Epoch 2 / 5 ---


Training: 100%|██████████| 40/40 [00:22<00:00,  1.81it/s, loss=0.0398]


Average Training Loss: 0.0416


Validating: 100%|██████████| 5/5 [00:00<00:00,  5.29it/s]


Validation Loss: 0.0394

--- Epoch 3 / 5 ---


Training: 100%|██████████| 40/40 [00:22<00:00,  1.76it/s, loss=0.0223]


Average Training Loss: 0.0314


Validating: 100%|██████████| 5/5 [00:00<00:00,  5.10it/s]


Validation Loss: 0.0333

--- Epoch 4 / 5 ---


Training: 100%|██████████| 40/40 [00:23<00:00,  1.73it/s, loss=0.0256]


Average Training Loss: 0.0247


Validating: 100%|██████████| 5/5 [00:00<00:00,  5.12it/s]


Validation Loss: 0.0322

--- Epoch 5 / 5 ---


Training: 100%|██████████| 40/40 [00:22<00:00,  1.75it/s, loss=0.0113]


Average Training Loss: 0.0216


Validating: 100%|██████████| 5/5 [00:00<00:00,  5.18it/s]


Validation Loss: 0.0322


In [13]:
print("\nStep 7: Training complete. Saving model to Google Drive...")

# Persist the model as it stands after the final epoch. No per-epoch
# checkpoint is kept, so this is not necessarily the epoch with the lowest
# validation loss.
# Save the fine-tuned model weights and configuration
model.save_pretrained(SAVE_PATH)
# Save the tokenizer for easy reloading
tokenizer.save_pretrained(SAVE_PATH)

print(f"Model saved successfully to '{SAVE_PATH}'")
print("\nFine-tuning process finished!")


Step 7: Training complete. Saving model to Google Drive...
Model saved successfully to '/content/drive/MyDrive/PokemonNERModel'

Fine-tuning process finished!


In [15]:
print("\nStep 8: Evaluating model on the test set...")

# --- Import here to make this cell self-contained ---
from sklearn.metrics import classification_report

# --- Load the saved model and tokenizer from Drive ---
# This ensures we're testing the exact model that was saved
print(f"Loading fine-tuned model from '{SAVE_PATH}'...")
model = BertForTokenClassification.from_pretrained(SAVE_PATH)
tokenizer = BertTokenizerFast.from_pretrained(SAVE_PATH)
model.to(DEVICE)
print("Model and tokenizer loaded successfully.")

# --- Create Test Dataset and Dataloader ---
test_dataset = PokemonNERDataset(test_data, tokenizer, tag2id, MAX_LENGTH)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# --- Evaluation Loop ---
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        labels = batch.pop("labels") # Pop labels to avoid including them in model input

        outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)

        # Keep only positions that carry a real label (-100 marks positions the
        # dataset excluded). Boolean masking flattens row by row, so predictions
        # and labels stay aligned, and one tolist() call replaces a per-token
        # .item() round trip.
        scored = labels != -100
        all_preds.extend(id2tag[p] for p in predictions[scored].tolist())
        all_labels.extend(id2tag[t] for t in labels[scored].tolist())

# --- Print Classification Report ---
print("\n--- Classification Report ---")
# Filter out 'O' tag for a more focused report if desired, but included here for completeness.
# zero_division=0 reports 0 (without a warning) for a tag that is never predicted.
report = classification_report(all_labels, all_preds, zero_division=0)
print(report)

# --- Show Some Example Predictions ---
print("\n--- Example Predictions ---")
for i in range(min(5, len(test_data))): # Show up to 5 examples
    example = test_data[i]
    text = example['text']
    print(f"\nText: {text}")

    # Prepare input for the model. Truncate to the same MAX_LENGTH used for
    # training so an unusually long text cannot exceed the model's input limit.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    ).to(DEVICE)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Get predictions
    with torch.no_grad():
        logits = model(**inputs).logits

    predictions = torch.argmax(logits, dim=2)
    predicted_tags = [id2tag[p.item()] for p in predictions[0]]

    # One row per sub-word token. Continuation pieces of a word were labelled
    # -100 by the dataset during training, so the tag on a word's first piece
    # is the meaningful one.
    print("Token\t\tPredicted Label")
    print("-----\t\t---------------")
    for token, tag in zip(tokens, predicted_tags):
        if token not in ('[CLS]', '[SEP]', '[PAD]'):
            print(f"{token:<15}\t{tag}")

print("\nEvaluation complete.")




Step 8: Evaluating model on the test set...
Loading fine-tuned model from '/content/drive/MyDrive/PokemonNERModel'...
Model and tokenizer loaded successfully.


Testing: 100%|██████████| 5/5 [00:01<00:00,  4.13it/s]


--- Classification Report ---
                    precision    recall  f1-score   support

   B-ENEMY_SPECIES       0.88      0.88      0.88        34
B-FRIENDLY_SPECIES       0.85      0.85      0.85        26
                 O       1.00      1.00      1.00      1200

          accuracy                           0.99      1260
         macro avg       0.91      0.91      0.91      1260
      weighted avg       0.99      0.99      0.99      1260


--- Example Predictions ---

Text: The final component for the new radar system has arrived and is ready for installation. The engineering corps will begin work at dawn. Ensure the perimeter is secure for them. While you are at it, there is the small matter of the Mewtwo that is currently tearing apart the old barracks. It needs to be dealt with. Eliminate it. The Bulbasaurs in the general area are not a concern, leave them be.
Token		Predicted Label
-----		---------------
The            	O
final          	O
component      	O
for          


