# BERT Model for Named Entity Recognition (NER)

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForTokenClassification
from sklearn.metrics import accuracy_score, classification_report
import re
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from tqdm import tqdm

In [2]:
# Load dataset
df = pd.read_csv('../../Datasets/FINAL/DATASET_BERT_CHUNKED.csv')
df.head()

Unnamed: 0,text,labels,original_row_id,chunk_number,total_chunks,token_count
0,PUTUSAN Nomor 192/Pid. B/2019/PN Bkl DEMI KEAD...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...,0,0,88,342
1,"Dalam perkara ini, Terdakwa ditangkap oleh Pen...",O O O O O O O O O O O O O O O O O O O O O O O ...,0,1,88,342
2,Terdakwa serta memperhatikan Alat Bukti dan ba...,O O O O O O O O O O O O O O O O O O O O O O O ...,0,2,88,326
3,6 (enam) tahun dan 6 (enam) bulan dikurangi se...,B_PENA I_PENA I_PENA O O O O O O O O O O O O O...,0,3,88,367
4,rupiah); Setelah mendengar permohonan Terdakwa...,O O O O O O O O O O O O O O O O O O O O O O O ...,0,4,88,315


In [3]:
# Check dataset information
print(f"Dataset shape: {df.shape}")
print("\nColumns:", df.columns.tolist())
print("\nNull values:\n", df.isnull().sum())

Dataset shape: (4930, 6)

Columns: ['text', 'labels', 'original_row_id', 'chunk_number', 'total_chunks', 'token_count']

Null values:
 text               0
labels             0
original_row_id    0
chunk_number       0
total_chunks       0
token_count        0
dtype: int64


In [4]:
# Process the unique labels
unique_labels = set()
for labels in df['labels'].str.split():
    unique_labels.update(labels)
unique_labels = sorted(list(unique_labels))
print(f"Unique labels: {unique_labels}")

# Create label to id mapping
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for i, label in enumerate(unique_labels)}

print(f"\nLabel to ID mapping: {label_to_id}")

Unique labels: ['B_ARTV', 'B_CRIA', 'B_DEFN', 'B_JUDG', 'B_JUDP', 'B_PENA', 'B_PROS', 'B_PUNI', 'B_REGI', 'B_TIMV', 'B_VERN', 'I_ARTV', 'I_CRIA', 'I_DEFN', 'I_JUDG', 'I_JUDP', 'I_PENA', 'I_PROS', 'I_PUNI', 'I_REGI', 'I_TIMV', 'I_VERN', 'O']

Label to ID mapping: {'B_ARTV': 0, 'B_CRIA': 1, 'B_DEFN': 2, 'B_JUDG': 3, 'B_JUDP': 4, 'B_PENA': 5, 'B_PROS': 6, 'B_PUNI': 7, 'B_REGI': 8, 'B_TIMV': 9, 'B_VERN': 10, 'I_ARTV': 11, 'I_CRIA': 12, 'I_DEFN': 13, 'I_JUDG': 14, 'I_JUDP': 15, 'I_PENA': 16, 'I_PROS': 17, 'I_PUNI': 18, 'I_REGI': 19, 'I_TIMV': 20, 'I_VERN': 21, 'O': 22}


In [5]:
# Create dataset class
class NERDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx].split()
        
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        
        # Align the labels with tokens (handling word pieces)
        tokens = self.tokenizer.tokenize(text)
        token_labels = []
        
        text_words = text.split()
        
        # Prepare word to token map
        word_ids = []
        current_word_idx = -1
        
        for token_idx, token in enumerate(tokens):
            if token.startswith("##"):
                # This is a continuation of the previous word
                word_ids.append(current_word_idx)
            else:
                # This is a new word
                current_word_idx += 1
                word_ids.append(current_word_idx)
                
            if current_word_idx >= len(labels):
                break
                
        # Convert labels to IDs and align with tokens
        label_ids = [-100] * self.max_len  # -100 is ignored by PyTorch loss functions
        
        # Add [CLS] token label
        label_ids[0] = -100
        
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx < len(labels) and token_idx + 1 < self.max_len:
                label_ids[token_idx + 1] = label_to_id[labels[word_idx]]
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

In [8]:
# Load pretrained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')
model = BertForTokenClassification.from_pretrained(
    'indolem/indobert-base-uncased',
    num_labels=len(label_to_id)
)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [9]:
# Split the dataset
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"Training set size: {train_df.shape[0]}")
print(f"Test set size: {test_df.shape[0]}")

# Create datasets
train_dataset = NERDataset(
    train_df['text'].tolist(),
    train_df['labels'].tolist(),
    tokenizer
)

test_dataset = NERDataset(
    test_df['text'].tolist(),
    test_df['labels'].tolist(),
    tokenizer
)

# Create data loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False
)

Training set size: 3944
Test set size: 986


In [10]:
# Training function
def train(model, dataloader, optimizer, device, epoch):
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    
    for batch in progress_bar:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
    
    return total_loss / len(dataloader)

In [11]:
# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    
    true_labels = []
    predicted_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)
            
            # Remove padding and ignored tokens
            for i in range(labels.shape[0]):
                true_seq = []
                pred_seq = []
                for j in range(labels.shape[1]):
                    if labels[i, j] != -100:
                        true_seq.append(id_to_label[labels[i, j].item()])
                        pred_seq.append(id_to_label[predictions[i, j].item()])
                
                true_labels.append(true_seq)
                predicted_labels.append(pred_seq)
    
    return true_labels, predicted_labels

In [12]:
# Train the model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

for epoch in range(num_epochs):
    avg_loss = train(model, train_loader, optimizer, device, epoch)
    print(f"Epoch {epoch+1}/{num_epochs} - Average loss: {avg_loss:.4f}")

Epoch 1:   1%|          | 5/493 [00:12<21:44,  2.67s/it, loss=0.5017]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 493/493 [20:46<00:00,  2.53s/it, loss=0.2563]


Epoch 1/3 - Average loss: 0.3412


Epoch 2: 100%|██████████| 493/493 [20:28<00:00,  2.49s/it, loss=0.1789]


Epoch 2/3 - Average loss: 0.2134


Epoch 3: 100%|██████████| 493/493 [20:35<00:00,  2.51s/it, loss=0.1074]

Epoch 3/3 - Average loss: 0.1826





In [13]:
# Evaluate the model
true_labels, predicted_labels = evaluate(model, test_loader, device)

# Print performance metrics
print("F1 Score:", f1_score(true_labels, predicted_labels))
print("Precision:", precision_score(true_labels, predicted_labels))
print("Recall:", recall_score(true_labels, predicted_labels))
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

Evaluating: 100%|██████████| 124/124 [01:14<00:00,  1.66it/s]


F1 Score: 0.09770992366412214
Precision: 0.15194681861348527
Recall: 0.07200720072007201

Classification Report:
              precision    recall  f1-score   support

       _ARTV       0.00      0.00      0.00       252
       _CRIA       0.09      0.03      0.05       120
       _DEFN       0.21      0.09      0.12       927
       _JUDG       0.00      0.00      0.00       181
       _JUDP       0.05      0.04      0.04       113
       _PENA       0.03      0.03      0.03        67
       _PROS       0.06      0.02      0.03        89
       _PUNI       0.20      0.10      0.13        93
       _REGI       0.03      0.03      0.03       109
       _TIMV       0.08      0.03      0.04        63
       _VERN       0.55      0.25      0.34       208

   micro avg       0.15      0.07      0.10      2222
   macro avg       0.12      0.06      0.07      2222
weighted avg       0.16      0.07      0.10      2222



In [14]:
# Save the model
model_save_path = './models/bert_ner_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to ./models/bert_ner_model


In [15]:
# Function to predict NER for new text
def predict_ner(text, model, tokenizer, device):
    model.eval()
    
    # Tokenize the text
    encoded_input = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )
    
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)
    
    # Make prediction
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)
    
    # Convert predictions to labels
    predicted_labels = []
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    
    for token, prediction in zip(tokens, predictions[0]):
        if token in [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token]:
            continue
            
        predicted_label = id_to_label[prediction.item()]
        predicted_labels.append((token, predicted_label))
    
    return predicted_labels

In [19]:
# Test with a new document
sample_text = """
PUTUSAN Nomor 123/Pid. B/2020/PN Jkt DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA
Pengadilan Negeri Jakarta yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam 
tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa:
Nama lengkap : Budi Santoso;
Tempat lahir : Jakarta;
Umur/tanggal lahir : 35 Tahun/10 Januari 1985;
Jenis kelamin : Laki-laki;
Kebangsaan : Indonesia;
"""

predicted_entities = predict_ner(sample_text, model, tokenizer, device)

# Display results
for token, label in predicted_entities:
    if label != 'O':  # Only show named entities
        print(f"{token}: {label}")

123: B_VERN
/: I_VERN
pid: I_VERN
dalam: B_DEFN


In [17]:
# Visualize the entities in a more structured way
from IPython.display import display, HTML
import pandas as pd

def visualize_entities(text, model, tokenizer, device):
    # Get predictions
    encoded_input = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )
    
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)
    
    # Get tokens and predictions
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    token_predictions = [id_to_label[p.item()] for p in predictions[0]]
    
    # Create a dataframe for visualization
    df = pd.DataFrame({
        'Token': tokens,
        'Prediction': token_predictions
    })
    
    # Filter out special tokens
    special_tokens = [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token]
    df = df[~df['Token'].isin(special_tokens)]
    
    # Only show entities (not O)
    entity_df = df[df['Prediction'] != 'O']
    
    # Group entities
    entities = []
    current_entity = None
    current_type = None
    
    for i, row in df.iterrows():
        if row['Prediction'].startswith('B_'):
            if current_entity:
                entities.append((current_entity, current_type))
            current_entity = row['Token']
            current_type = row['Prediction'][2:]  # Remove B_ prefix
        elif row['Prediction'].startswith('I_') and current_entity and row['Prediction'][2:] == current_type:
            current_entity += " " + row['Token'].replace('##', '')
        elif row['Prediction'] == 'O':
            if current_entity:
                entities.append((current_entity, current_type))
                current_entity = None
                current_type = None
    
    if current_entity:
        entities.append((current_entity, current_type))
    
    # Create a result dataframe
    result_df = pd.DataFrame(entities, columns=['Entity', 'Type'])
    
    return result_df

# Test with sample text
result = visualize_entities(sample_text, model, tokenizer, device)
display(result)

Unnamed: 0,Entity,Type
0,123 / pid,VERN
1,dalam,DEFN


In [18]:
# Load a full document for testing
def process_document(text, model, tokenizer, device, chunk_size=400):
    words = text.split()
    chunks = [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
    
    all_entities = []
    
    for chunk in chunks:
        chunk_entities = visualize_entities(chunk, model, tokenizer, device)
        all_entities.append(chunk_entities)
    
    # Combine results
    if all_entities:
        return pd.concat(all_entities, ignore_index=True)
    else:
        return pd.DataFrame(columns=['Entity', 'Type'])

# Use a sample from the dataset
document_text = df['text'].iloc[0]

# Process the document
document_entities = process_document(document_text, model, tokenizer, device)
display(document_entities)

Unnamed: 0,Entity,Type
0,192 / pid,VERN
1,dalam perkara terdakwa : nama,DEFN
