In [1]:
# Install required packages if not already installed.
# Use %pip (not !pip) so the install targets the running kernel's environment.
# %pip install transformers torch

from transformers import AutoTokenizer
import torch
import numpy as np
from typing import List, Tuple

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
# Later cells read this module-level `tokenizer` to count tokens per chunk.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Report load status, the tokenizer's maximum input length and its special tokens
# (one item per line, same text as three separate print calls would emit).
print(
    "Tokenizer loaded successfully!",
    f"Max model input size: {tokenizer.model_max_length}",
    f"Special tokens: {tokenizer.special_tokens_map}",
    sep="\n",
)

Tokenizer loaded successfully!
Max model input size: 512
Special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


In [2]:
# Install required packages for NER implementation.
# Use %pip (not !pip) so the install targets the running kernel's environment.
# %pip install transformers torch scikit-learn seqeval pandas numpy matplotlib seaborn tqdm
# %pip install datasets accelerate

---


In [3]:
import pandas as pd

In [4]:
# Load the source dataset. Per the rendered output below, each row holds a
# 'text' string and a space-separated 'labels' string (tags such as O, B_VERN, I_VERN).
main_df = pd.read_csv('../../Datasets/FINAL/DATASET_BERT_SENTENCE.csv')
# Bare last expression so the notebook renders the frame.
main_df

Unnamed: 0,text,labels
0,PUTUSAN Nomor 192/Pid. B/2019/PN Bkl DEMI KEAD...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...
1,PUTUSAN Nomor 9/Pid. B/2019/PN Bkl DEMI KEADIL...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...
2,PUTUSAN Nomor 108/Pid. B/2020/PN Bkl DEMI KEAD...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...
3,PUTUSAN NOMOR : 294/Pid. B/2018/PN. BKL. DEMI ...,O O O B_VERN I_VERN I_VERN O O O O O O O O O O...
4,PUTUSAN Nomor 431/Pid. B/2018/PN Bkl DEMI KEAD...,O O O O O O O O O O O O O O O O O O O O O O O ...
...,...,...
230,PUTUSAN Nomor 31/Pid. B/2025/PN Bkl DEMI KEADI...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...
231,PUTUSAN Nomor 47/Pid. B/2025/PN Bkl DEMI KEADI...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...
232,PUTUSAN Nomor 76/Pid. B/2025/PN Bkl DEMI KEADI...,O O O O O O O O O O O O O O O O O O O O O O O ...
233,PUTUSAN Nomor 77/Pid. B/2025/PN Bkl DEMI KEADI...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...


In [5]:
def tokenize_and_chunk_text(text: str, labels: str, max_length: int = 512, overlap: int = 50) -> List[Tuple[str, str]]:
    """
    Split whitespace-separated text and its labels into overlapping word chunks.

    The text and the label string are split on whitespace and paired word by
    word. Chunks are cut by *word count*, not by tokenizer output: each chunk
    holds at most ``max_length // 4`` words (a heuristic that assumes roughly
    four subword tokens per word). The heuristic does not by itself guarantee
    a chunk fits in ``max_length`` tokens; callers should verify the real
    token count (this notebook does so with ``validate_chunk_tokens``).

    Args:
        text: Whitespace-separated words.
        labels: Whitespace-separated tags, one per word of ``text``.
        max_length: Token budget the chunk size is derived from. Must be >= 1.
        overlap: Number of trailing words of one chunk repeated at the start
            of the next. Must be >= 0. If it is not smaller than the chunk
            size, chunks are emitted back to back with no overlap so that the
            loop always advances.

    Returns:
        A list of ``(chunk_text, chunk_labels)`` pairs, both space-joined
        strings, in document order. Empty input yields an empty list.

    Raises:
        ValueError: If ``max_length`` is < 1 or ``overlap`` is negative.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length}")
    if overlap < 0:
        raise ValueError(f"overlap must be >= 0, got {overlap}")

    # Split text and labels into tokens/words
    words = text.split()
    label_list = labels.split()

    # Ensure words and labels have the same length
    if len(words) != len(label_list):
        print(f"Warning: Mismatch between words ({len(words)}) and labels ({len(label_list)})")
        # Truncate to minimum length to avoid index errors
        min_len = min(len(words), len(label_list))
        words = words[:min_len]
        label_list = label_list[:min_len]

    # Words per chunk: conservative ~4-tokens-per-word estimate. Floored at 1,
    # because a chunk size of 0 would emit empty chunks forever.
    effective_max_length = max(1, max_length // 4)

    chunks = []
    total_words = len(words)
    start_idx = 0
    while start_idx < total_words:
        # Calculate end index for current chunk
        end_idx = min(start_idx + effective_max_length, total_words)

        chunks.append((
            ' '.join(words[start_idx:end_idx]),
            ' '.join(label_list[start_idx:end_idx]),
        ))

        if end_idx >= total_words:
            break

        # Step back by `overlap` words for the next chunk, but never to (or
        # before) the current start: that would repeat the same chunk forever
        # when overlap >= chunk size. In that case continue without overlap.
        next_start = end_idx - overlap
        if next_start <= start_idx:
            next_start = end_idx
        start_idx = next_start

    return chunks

def validate_chunk_tokens(text: str, max_length: int = 512) -> Tuple[int, bool]:
    """
    Count the tokens `text` encodes to and check the count against a limit.

    Encoding is done with the module-level `tokenizer`, with special tokens
    added and truncation disabled, so the count reflects the full input.

    Args:
        text: Text to encode.
        max_length: Maximum allowed number of tokens.

    Returns:
        ``(token_count, is_valid)`` where ``is_valid`` is True when
        ``token_count <= max_length``.
    """
    encoded_ids = tokenizer.encode(text, add_special_tokens=True, truncation=False)
    n_tokens = len(encoded_ids)
    return n_tokens, n_tokens <= max_length


In [6]:
# Test the chunking function on a sample
print("Testing tokenization and chunking on sample data:")
print("=" * 50)

# Get a sample with long text.
# idxmax() returns an index *label*, so the row is looked up with .loc
# (label-based). .iloc is positional and only agrees with the label while the
# frame keeps its default 0..n-1 index.
sample_idx = main_df['text'].str.len().idxmax()
sample_text = main_df.loc[sample_idx, 'text']
sample_labels = main_df.loc[sample_idx, 'labels']

print(f"Original text length: {len(sample_text)} characters")
print(f"Original word count: {len(sample_text.split())} words")

# Check original token count (the full text is encoded without truncation)
original_tokens, is_valid = validate_chunk_tokens(sample_text)
print(f"Original token count: {original_tokens} (Valid: {is_valid})")

# Chunk the text
chunks = tokenize_and_chunk_text(sample_text, sample_labels, max_length=512, overlap=50)
print(f"\nNumber of chunks created: {len(chunks)}")

# Validate each chunk against the real tokenizer output
print("\nChunk validation:")
for i, (chunk_text, chunk_labels) in enumerate(chunks):
    token_count, is_valid = validate_chunk_tokens(chunk_text)
    word_count = len(chunk_text.split())
    print(f"Chunk {i+1}: {word_count} words, {token_count} tokens (Valid: {is_valid})")
    
    if i == 0:  # Show first chunk as example
        print(f"  First chunk preview: {chunk_text[:200]}...")
        print(f"  First chunk labels: {chunk_labels[:200]}...")

Token indices sequence length is longer than the specified maximum sequence length for this model (65679 > 512). Running this sequence through the model will result in indexing errors


Testing tokenization and chunking on sample data:
Original text length: 176599 characters
Original word count: 26277 words
Original token count: 65679 (Valid: False)

Number of chunks created: 337

Chunk validation:
Chunk 1: 128 words, 305 tokens (Valid: True)
  First chunk preview: PUTUSAN Nomor 48/Pid. B/2023/PN Bkl DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Bangkalan yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama men...
  First chunk labels: O O B_VERN I_VERN I_VERN O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B_DEFN I_DEFN I_DEFN I_DEFN O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O...
Chunk 2: 128 words, 285 tokens (Valid: True)
Chunk 3: 128 words, 279 tokens (Valid: True)
Chunk 4: 128 words, 318 tokens (Valid: True)
Chunk 5: 128 words, 337 tokens (Valid: True)
Chunk 6: 128 words, 303 tokens (Valid: True)
Chunk 7: 128 words, 286 tokens (Valid: True)
Chunk 8: 128 wor

In [7]:
# Process the entire dataset
print("Processing entire dataset with tokenization and chunking...")
print("=" * 60)

# One dict per output row; the DataFrame is built once after the loop.
chunked_data = []

# row_number is a 1-based counter used only for the progress message, so the
# message does not depend on the index labels being the integers 0..n-1.
# idx (the index label) is what gets stored as 'original_row_id'.
for row_number, (idx, row) in enumerate(main_df.iterrows(), start=1):
    text = row['text']
    labels = row['labels']
    
    # Check if chunking is needed (token count of the full, untruncated text)
    token_count, is_valid = validate_chunk_tokens(text)
    
    if is_valid:
        # No chunking needed: keep the row as a single chunk
        chunked_data.append({
            'text': text,
            'labels': labels,
            'original_row_id': idx,
            'chunk_number': 0,
            'total_chunks': 1,
            'token_count': token_count
        })
    else:
        # Chunking needed
        chunks = tokenize_and_chunk_text(text, labels, max_length=512, overlap=50)
        
        for chunk_idx, (chunk_text, chunk_labels) in enumerate(chunks):
            # Record the real token count of each chunk for later validation
            chunk_token_count, _ = validate_chunk_tokens(chunk_text)
            
            chunked_data.append({
                'text': chunk_text,
                'labels': chunk_labels,
                'original_row_id': idx,
                'chunk_number': chunk_idx,
                'total_chunks': len(chunks),
                'token_count': chunk_token_count
            })
    
    # Progress indicator
    if row_number % 50 == 0:
        print(f"Processed {row_number}/{len(main_df)} rows...")

# Create new DataFrame with chunked data
chunked_df = pd.DataFrame(chunked_data)

print(f"\nProcessing complete!")
print(f"Original dataset: {len(main_df)} rows")
print(f"Chunked dataset: {len(chunked_df)} rows")
print(f"Expansion factor: {len(chunked_df) / len(main_df):.2f}x")

Processing entire dataset with tokenization and chunking...
Processed 50/235 rows...
Processed 100/235 rows...
Processed 150/235 rows...
Processed 200/235 rows...

Processing complete!
Original dataset: 235 rows
Chunked dataset: 19387 rows
Expansion factor: 82.50x


In [8]:
# Analyze the chunked dataset
print("Chunked Dataset Analysis:")
print("=" * 40)

# Basic statistics.
# Note: chunked_df has one row per chunk, so these counts are chunk rows.
print(f"Total rows: {len(chunked_df)}")
print(f"Rows that needed chunking: {len(chunked_df[chunked_df['total_chunks'] > 1])}")
print(f"Rows that didn't need chunking: {len(chunked_df[chunked_df['total_chunks'] == 1])}")

# Token count statistics
print(f"\nToken Count Statistics:")
print(f"Min tokens: {chunked_df['token_count'].min()}")
print(f"Max tokens: {chunked_df['token_count'].max()}")
print(f"Mean tokens: {chunked_df['token_count'].mean():.2f}")
print(f"Median tokens: {chunked_df['token_count'].median():.2f}")

# Check if all chunks are within token limit
valid_chunks = chunked_df['token_count'] <= 512
print(f"\nAll chunks within 512 token limit: {valid_chunks.all()}")
if not valid_chunks.all():
    invalid_chunks = chunked_df[~valid_chunks]
    print(f"Invalid chunks found: {len(invalid_chunks)}")
    print(invalid_chunks[['original_row_id', 'chunk_number', 'token_count']].head())

# Display sample of chunked data
print(f"\nSample of chunked dataset:")
print(chunked_df[['original_row_id', 'chunk_number', 'total_chunks', 'token_count']].head(10))

# Show distribution of chunk counts: take total_chunks once per original row,
# then count how many original rows share each chunk count.
chunk_distribution = chunked_df.groupby('original_row_id')['total_chunks'].first().value_counts().sort_index()
print(f"\nDistribution of chunks per original row:")
# The loop variable is deliberately not called `chunks`: at notebook top level
# that would overwrite the `chunks` list produced by the chunking cells.
for n_chunks, count in chunk_distribution.items():
    print(f"  {n_chunks} chunk(s): {count} rows")

Chunked Dataset Analysis:
Total rows: 19387
Rows that needed chunking: 19387
Rows that didn't need chunking: 0

Token Count Statistics:
Min tokens: 119
Max tokens: 411
Mean tokens: 320.56
Median tokens: 321.00

All chunks within 512 token limit: True

Sample of chunked dataset:
   original_row_id  chunk_number  total_chunks  token_count
0                0             0            88          342
1                0             1            88          342
2                0             2            88          326
3                0             3            88          367
4                0             4            88          315
5                0             5            88          352
6                0             6            88          337
7                0             7            88          306
8                0             8            88          279
9                0             9            88          310

Distribution of chunks per original row:
  30 chunk(s): 2 ro

In [9]:
# Render the chunked frame: one row per chunk, with the provenance columns
# original_row_id, chunk_number, total_chunks and token_count.
chunked_df

Unnamed: 0,text,labels,original_row_id,chunk_number,total_chunks,token_count
0,PUTUSAN Nomor 192/Pid. B/2019/PN Bkl DEMI KEAD...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...,0,0,88,342
1,"Dalam perkara ini, Terdakwa ditangkap oleh Pen...",O O O O O O O O O O O O O O O O O O O O O O O ...,0,1,88,342
2,Terdakwa serta memperhatikan Alat Bukti dan ba...,O O O O O O O O O O O O O O O O O O O O O O O ...,0,2,88,326
3,6 (enam) tahun dan 6 (enam) bulan dikurangi se...,B_PENA I_PENA I_PENA O O O O O O O O O O O O O...,0,3,88,367
4,rupiah); Setelah mendengar permohonan Terdakwa...,O O O O O O O O O O O O O O O O O O O O O O O ...,0,4,88,315
...,...,...,...,...,...,...
19382,-abu; yang telah dipergunakan untuk melakukan ...,O O O O O O O O O O O O O O O O O O O O O O O ...,234,84,89,313
19383,lain; Keadaan yang meringankan : - Tidak Ada ;...,O O O O O O O O O O O O O O O O O O O O O O O ...,234,85,89,329
19384,menyebabkan luka -luka berat ” sebagaimana dal...,O O O O O O O O O O O O O O O O O O O O O O B_...,234,86,89,322
19385,"hitam liris -liris warna krem, biru ungu merah...",O O O O O O O O O O O O O O O O O O O O O O O ...,234,87,89,286


In [10]:
# Save the chunked dataset
output_path = '../../Datasets/FINAL/DATASET_BERT_CHUNKED.csv'

# Drop all rows that contain only 'O' annotation in the labels.
# NOTE: this rebinds `chunked_df`, so every later cell sees the filtered frame.
# 'chunk_number' and 'total_chunks' are NOT renumbered: they keep the values
# from before filtering, so a document's remaining chunks may have gaps.
chunked_df = chunked_df[~chunked_df['labels'].apply(lambda x: all(label == 'O' for label in x.split()))].reset_index(drop=True)


chunked_df.to_csv(output_path, index=False)



print(f"Chunked dataset saved to: {output_path}")

# Display final dataset structure
print(f"\nFinal chunked dataset:")
print(chunked_df.head())

# Show some examples of chunked vs original data
print(f"\nExample of chunking (first row that was split):")
multi_chunk_rows = chunked_df[chunked_df['total_chunks'] > 1]['original_row_id'].unique()
if len(multi_chunk_rows) > 0:
    example_row_id = multi_chunk_rows[0]
    # original_row_id stores main_df index *labels* (taken from iterrows()),
    # so the original row is fetched with label-based .loc, not positional .iloc.
    original_example = main_df.loc[example_row_id]
    chunked_examples = chunked_df[chunked_df['original_row_id'] == example_row_id]
    
    print(f"\nOriginal row {example_row_id}:")
    print(f"  Length: {len(original_example['text'])} chars, {len(original_example['text'].split())} words")
    print(f"  Preview: {original_example['text'][:200]}...")
    
    # The count below is the number of chunks that survived the filter above.
    print(f"\nSplit into {len(chunked_examples)} chunks:")
    for _, chunk in chunked_examples.iterrows():
        print(f"  Chunk {chunk['chunk_number']}: {len(chunk['text'].split())} words, {chunk['token_count']} tokens")
        print(f"    Preview: {chunk['text'][:150]}...")
        print()

Chunked dataset saved to: ../../Datasets/FINAL/DATASET_BERT_CHUNKED.csv

Final chunked dataset:
                                                text  \
0  PUTUSAN Nomor 192/Pid. B/2019/PN Bkl DEMI KEAD...   
1  Dalam perkara ini, Terdakwa ditangkap oleh Pen...   
2  Terdakwa serta memperhatikan Alat Bukti dan ba...   
3  6 (enam) tahun dan 6 (enam) bulan dikurangi se...   
4  rupiah); Setelah mendengar permohonan Terdakwa...   

                                              labels  original_row_id  \
0  O O B_VERN I_VERN I_VERN O O O O O O O O O O O...                0   
1  O O O O O O O O O O O O O O O O O O O O O O O ...                0   
2  O O O O O O O O O O O O O O O O O O O O O O O ...                0   
3  B_PENA I_PENA I_PENA O O O O O O O O O O O O O...                0   
4  O O O O O O O O O O O O O O O O O O O O O O O ...                0   

   chunk_number  total_chunks  token_count  
0             0            88          342  
1             1            88         

In [11]:
# Helper functions for working with chunked data during training

def prepare_bert_input(chunked_df, tokenizer, max_length=512, label2id=None, ignore_index=-100):
    """
    Prepare tokenized inputs for BERT training from chunked dataset.

    Args:
        chunked_df: DataFrame with 'text', 'labels', 'original_row_id' and
            'chunk_number' columns. 'text' and 'labels' are whitespace-separated
            strings with one tag per word.
        tokenizer: Tokenizer callable. It is called with truncation, padding,
            ``max_length`` and ``return_tensors='pt'``.
        max_length: Maximum sequence length passed to the tokenizer.
        label2id: Optional mapping from tag string to integer id.
            * ``None`` (default, original behavior): the text is tokenized as
              plain strings and 'labels' is a list of per-WORD tag-string
              lists. These are NOT aligned with the subword positions in
              'input_ids'.
            * mapping given: the text is tokenized as pre-split words
              (``is_split_into_words=True``) and 'labels' is a list of
              per-TOKEN id lists with the same length as each 'input_ids'
              row. This path needs a tokenizer whose result exposes
              ``word_ids(batch_index=...)`` (assumed to be a Hugging Face
              "fast" tokenizer; confirm for the tokenizer in use).
        ignore_index: Label id written at positions that carry no label in
            the aligned path: special tokens, padding, and every subword
            after the first of a word. Only used when ``label2id`` is given.

    Returns:
        Dictionary with 'input_ids', 'attention_mask', 'labels',
        'original_row_ids' and 'chunk_numbers'.

    Raises:
        KeyError: In the aligned path, if a tag is missing from ``label2id``.
    """
    texts = chunked_df['text'].tolist()
    word_tags = [label_str.split() for label_str in chunked_df['labels'].tolist()]

    if label2id is None:
        # Original behavior: tokenize raw strings, return word-level tag strings.
        encodings = tokenizer(texts,
                              truncation=True,
                              padding=True,
                              max_length=max_length,
                              return_tensors='pt')
        processed_labels = word_tags
    else:
        # Tokenize pre-split words so each subword can be traced back to the
        # word (and therefore the tag) it came from.
        encodings = tokenizer([text.split() for text in texts],
                              is_split_into_words=True,
                              truncation=True,
                              padding=True,
                              max_length=max_length,
                              return_tensors='pt')
        processed_labels = []
        for row_idx, tags in enumerate(word_tags):
            aligned = []
            previous_word_id = None
            for word_id in encodings.word_ids(batch_index=row_idx):
                if word_id is None or word_id == previous_word_id or word_id >= len(tags):
                    # Special/padding token, continuation subword, or a word
                    # with no tag: excluded from the loss via ignore_index.
                    aligned.append(ignore_index)
                else:
                    aligned.append(label2id[tags[word_id]])
                previous_word_id = word_id
            processed_labels.append(aligned)

    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': processed_labels,
        'original_row_ids': chunked_df['original_row_id'].tolist(),
        'chunk_numbers': chunked_df['chunk_number'].tolist()
    }

def reconstruct_predictions(predictions, chunked_df, overlap=50):
    """
    Reconstruct full document predictions from chunked predictions.

    Args:
        predictions: List of predictions for each chunk, in the same row order
            as ``chunked_df`` (``predictions[i]`` belongs to the i-th row).
            Each entry is a sequence that supports slicing.
        chunked_df: DataFrame with 'original_row_id' and 'chunk_number' columns.
            Its index may be anything; rows are matched to ``predictions`` by
            position.
        overlap: Number of leading items to drop from a chunk that directly
            follows the previous chunk of the same document. This must be in
            the same unit as the items in ``predictions`` (the chunker in this
            notebook overlaps by words, so word-level predictions are assumed;
            confirm against the caller).

    Returns:
        Dictionary mapping original_row_id to reconstructed predictions. A
        document with a single chunk maps to that chunk's prediction object
        unchanged; otherwise to a new list.
    """
    # Renumber rows 0..n-1 so a row's index equals its position in `predictions`,
    # even if the caller passed a filtered frame without resetting its index.
    positional_df = chunked_df.reset_index(drop=True)
    reconstructed = {}

    for original_row_id in positional_df['original_row_id'].unique():
        chunk_data = positional_df[positional_df['original_row_id'] == original_row_id].sort_values('chunk_number')

        if len(chunk_data) == 1:
            # Single chunk, no reconstruction needed
            reconstructed[original_row_id] = predictions[chunk_data.index[0]]
            continue

        # Multiple chunks, need to reconstruct
        reconstructed_pred = []
        previous_chunk_number = None
        for position, chunk_number in zip(chunk_data.index, chunk_data['chunk_number']):
            chunk_pred = predictions[position]

            if previous_chunk_number is not None and chunk_number == previous_chunk_number + 1:
                # Directly follows the previous chunk: its first `overlap`
                # items repeat the end of that chunk, so skip them.
                reconstructed_pred.extend(chunk_pred[overlap:])
            else:
                # First chunk, or the preceding chunk is missing (e.g. dropped
                # by filtering): nothing is duplicated, keep every item.
                reconstructed_pred.extend(chunk_pred)
            previous_chunk_number = chunk_number

        reconstructed[original_row_id] = reconstructed_pred

    return reconstructed

# Announce that the helpers above are defined and list them, one item per line.
print(
    "Helper functions for BERT training with chunked data created!",
    "\nYou can now use the chunked dataset for training without token overflow issues.",
    "Key functions available:",
    "- prepare_bert_input(): Prepare data for BERT training",
    "- reconstruct_predictions(): Reconstruct full predictions from chunks",
    sep="\n",
)

Helper functions for BERT training with chunked data created!

You can now use the chunked dataset for training without token overflow issues.
Key functions available:
- prepare_bert_input(): Prepare data for BERT training
- reconstruct_predictions(): Reconstruct full predictions from chunks


In [12]:
# Final analysis of chunked dataset
print("Final Chunked Dataset Analysis:")
print("=" * 50)

# Basic statistics.
# Note: chunked_df has one row per chunk, so these counts are chunk rows.
print(f"Total rows: {len(chunked_df)}")
print(f"Rows that needed chunking: {len(chunked_df[chunked_df['total_chunks'] > 1])}")
print(f"Rows that didn't need chunking: {len(chunked_df[chunked_df['total_chunks'] == 1])}")

# Token count statistics
print(f"\nToken Count Statistics:")
print(f"Min tokens: {chunked_df['token_count'].min()}")
print(f"Max tokens: {chunked_df['token_count'].max()}")
print(f"Mean tokens: {chunked_df['token_count'].mean():.2f}")
print(f"Median tokens: {chunked_df['token_count'].median():.2f}")

# Check if all chunks are within token limit
valid_chunks = chunked_df['token_count'] <= 512
print(f"\nAll chunks within 512 token limit: {valid_chunks.all()}")
if not valid_chunks.all():
    invalid_chunks = chunked_df[~valid_chunks]
    print(f"Invalid chunks found: {len(invalid_chunks)}")
    print(invalid_chunks[['original_row_id', 'chunk_number', 'token_count']].head())
else:
    print("✅ All chunks are valid!")

# Save the chunked dataset (same path as the earlier save cell; rewrites the file
# with the current contents of chunked_df)
output_path = '../../Datasets/FINAL/DATASET_BERT_CHUNKED.csv'
chunked_df.to_csv(output_path, index=False)
print(f"\n💾 Chunked dataset saved to: {output_path}")

# Show final statistics.
# The original row count is read from main_df instead of being hard-coded, so
# the summary stays correct if the source dataset changes.
original_row_count = len(main_df)
print(f"\n📊 Final Statistics:")
print(f"   Original rows: {original_row_count}")
print(f"   Chunked rows: {len(chunked_df):,}")
print(f"   Expansion factor: {len(chunked_df) / original_row_count:.1f}x")
print(f"   Average tokens per chunk: {chunked_df['token_count'].mean():.0f}")
# Mean chunk length as a share of the 512-token limit
print(f"   Token efficiency: {chunked_df['token_count'].mean() / 512 * 100:.1f}%")

print("\n🎉 Tokenization and chunking process completed successfully!")
print("The dataset is now ready for BERT training without token overflow issues.")

Final Chunked Dataset Analysis:
Total rows: 4930
Rows that needed chunking: 4930
Rows that didn't need chunking: 0

Token Count Statistics:
Min tokens: 119
Max tokens: 405
Mean tokens: 318.11
Median tokens: 323.00

All chunks within 512 token limit: True
✅ All chunks are valid!

💾 Chunked dataset saved to: ../../Datasets/FINAL/DATASET_BERT_CHUNKED.csv

📊 Final Statistics:
   Original rows: 235
   Chunked rows: 4,930
   Expansion factor: 21.0x
   Average tokens per chunk: 318
   Token efficiency: 62.1%

🎉 Tokenization and chunking process completed successfully!
The dataset is now ready for BERT training without token overflow issues.


----

In [13]:
# Reload the chunked CSV from the same path the earlier cells wrote it to.
temp = pd.read_csv('../../Datasets/FINAL/DATASET_BERT_CHUNKED.csv')

In [14]:
# Render the reloaded frame.
temp

Unnamed: 0,text,labels,original_row_id,chunk_number,total_chunks,token_count
0,PUTUSAN Nomor 192/Pid. B/2019/PN Bkl DEMI KEAD...,O O B_VERN I_VERN I_VERN O O O O O O O O O O O...,0,0,88,342
1,"Dalam perkara ini, Terdakwa ditangkap oleh Pen...",O O O O O O O O O O O O O O O O O O O O O O O ...,0,1,88,342
2,Terdakwa serta memperhatikan Alat Bukti dan ba...,O O O O O O O O O O O O O O O O O O O O O O O ...,0,2,88,326
3,6 (enam) tahun dan 6 (enam) bulan dikurangi se...,B_PENA I_PENA I_PENA O O O O O O O O O O O O O...,0,3,88,367
4,rupiah); Setelah mendengar permohonan Terdakwa...,O O O O O O O O O O O O O O O O O O O O O O O ...,0,4,88,315
...,...,...,...,...,...,...
4925,-abu; yang telah dipergunakan untuk melakukan ...,O O O O O O O O O O O O O O O O O O O O O O O ...,234,84,89,313
4926,lain; Keadaan yang meringankan : - Tidak Ada ;...,O O O O O O O O O O O O O O O O O O O O O O O ...,234,85,89,329
4927,menyebabkan luka -luka berat ” sebagaimana dal...,O O O O O O O O O O O O O O O O O O O O O O B_...,234,86,89,322
4928,"hitam liris -liris warna krem, biru ungu merah...",O O O O O O O O O O O O O O O O O O O O O O O ...,234,87,89,286
