In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/spm.model
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/config.json
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/pytorch_model.generator.bin
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/README.md
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/tf_model.h5
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/tokenizer_config.json
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/pytorch_model.bin
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/generator_config.json
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/.gitattributes
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/.git/config
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/.git/packed-refs
/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large/.git/HEAD
/kaggle/input/debertav3-large/tr

# Data Formatting

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaV2Tokenizer
import json

# Load your dataset
def _load_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is opened explicitly as UTF-8 so that decoding does not depend on
    the locale's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Two training sources are concatenated into one list of documents.
train_data1 = _load_json('/kaggle/input/pii-detection-removal-from-educational-data/train.json')
train_data2 = _load_json('/kaggle/input/pii-detect-illi-train-dataset/illi_data.json')

train_data = train_data1 + train_data2

test_data = _load_json('/kaggle/input/pii-detection-removal-from-educational-data/test.json')

# Preprocessing function to convert data into input IDs, attention masks, and labels
def preprocess_data(data, tokenizer, max_len, has_labels=True):
    """Encode each document and build a label row of length `max_len` for it.

    Args:
        data: iterable of dicts with a 'tokens' list and, optionally, a
            'labels' list of tag names.
        tokenizer: callable invoked on the space-joined tokens; its result must
            expose 'input_ids' and 'attention_mask'.
        max_len: length the tokenizer pads/truncates to, and the length of
            every label row.
        has_labels: when False, every token is given the 'O' label instead of
            reading item['labels'].

    Returns:
        Tuple `(input_ids, attention_masks, labels)` of stacked tensors, one
        row per document.

    Tag names are looked up in the module-level `label_map`; unknown tags and
    padding positions both receive the 'O' id.
    """
    id_rows, mask_rows, label_rows = [], [], []

    for record in data:
        words = record['tokens']
        outside_id = label_map['O']

        if not has_labels:
            # No annotations available: one 'O' per token.
            tag_ids = [outside_id for _ in words]
        else:
            # A missing 'labels' key is treated as an empty annotation list.
            tag_ids = [label_map.get(tag, outside_id) for tag in record.get('labels', [])]

        encoded = tokenizer(
            ' '.join(words),
            padding='max_length',
            truncation=True,
            max_length=max_len,
            return_attention_mask=True,
        )

        id_rows.append(torch.tensor(encoded['input_ids']))
        mask_rows.append(torch.tensor(encoded['attention_mask']))

        # Right-pad with 'O' up to max_len, then cut anything beyond max_len.
        padded_tags = tag_ids + [outside_id] * (max_len - len(tag_ids))
        label_rows.append(padded_tags[:max_len])

    return torch.stack(id_rows), torch.stack(mask_rows), torch.tensor(label_rows)

# Define the PIIDataset class
class PIIDataset(Dataset):
    """Map-style dataset yielding one encoded document per index.

    Each item is a dict with the keys 'input_ids', 'attention_mask' and
    'labels', taken from the same row of the three inputs.

    Args:
        input_ids: indexable collection of encoded sequences (one per row).
        attention_masks: indexable collection of masks, same length as
            `input_ids`.
        labels: iterable of label rows (lists or tensors of label ids), same
            length as `input_ids`.

    Raises:
        ValueError: if the three inputs do not contain the same number of rows.
    """

    def __init__(self, input_ids, attention_masks, labels):
        n_ids, n_masks, n_labels = len(input_ids), len(attention_masks), len(labels)
        if not (n_ids == n_masks == n_labels):
            # Fail fast instead of surfacing an IndexError from a DataLoader worker.
            raise ValueError(
                "input_ids, attention_masks and labels must have the same number of rows "
                f"(got {n_ids}, {n_masks}, {n_labels})"
            )

        self.input_ids = input_ids
        self.attention_masks = attention_masks
        # as_tensor converts list rows and reuses rows that are already tensors,
        # avoiding the copy-construct UserWarning that torch.tensor(tensor) emits.
        self.labels = [torch.as_tensor(label, dtype=torch.long) for label in labels]

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx],
        }

    def __len__(self):
        return len(self.labels)

def encode_tokens(tokenizer, text, max_length):
    """Run `tokenizer` on pre-split input with fixed-length padding/truncation.

    Args:
        tokenizer: callable accepting the text plus the keyword options below.
        text: input passed through unchanged as the first positional argument;
            flagged as already split into words.
        max_length: length every sequence is padded or truncated to.

    Returns:
        Whatever `tokenizer` returns for these options (PyTorch tensors are
        requested via return_tensors='pt').
    """
    tokenizer_options = {
        'is_split_into_words': True,
        'add_special_tokens': True,
        'max_length': max_length,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_tensors': 'pt',
    }
    return tokenizer(text, **tokenizer_options)

def label_to_id(labels, label_map, max_length):
    """Convert tag names to ids and return exactly `max_length` of them.

    Tags missing from `label_map` map to the 'O' id. Short inputs are
    right-padded with the 'O' id; long inputs are truncated.
    """
    outside_id = label_map['O']
    encoded = [label_map.get(tag, outside_id) for tag in labels]
    filler = [outside_id] * (max_length - len(labels))
    return (encoded + filler)[:max_length]

# Your label map
# Ordered tag vocabulary: a tag's position in this list is its integer id.
_LABEL_NAMES = [
    'B-NAME_STUDENT',
    'I-NAME_STUDENT',
    'B-EMAIL',
    'I-EMAIL',
    'B-USERNAME',
    'I-USERNAME',
    'B-ID_NUM',
    'I-ID_NUM',
    'B-PHONE_NUM',
    'I-PHONE_NUM',
    'B-URL_PERSONAL',
    'I-URL_PERSONAL',
    'B-STREET_ADDRESS',
    'I-STREET_ADDRESS',
    'O',  # 'O' label for non-PII tokens
]
label_map = {name: idx for idx, name in enumerate(_LABEL_NAMES)}

# Size of the classification head: one output per distinct label.
num_labels = len(label_map)

# Load the tokenizer shipped with the local DeBERTa-v3-large checkpoint.
tokenizer = DebertaV2Tokenizer.from_pretrained('/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large')

# Every encoded sequence and every label row is padded/truncated to this length.
max_length = 128

# --- Training set ---------------------------------------------------------
# NOTE(review): label rows are built per entry of `tokens`, while the encodings
# come from the tokenizer with special tokens enabled; position i of a label row
# may therefore not describe position i of input_ids - confirm the alignment.
all_tokens = [doc['tokens'] for doc in train_data]
all_labels = [doc['labels'] for doc in train_data]

train_encodings = encode_tokens(tokenizer, all_tokens, max_length=max_length)
train_labels = [label_to_id(tags, label_map, max_length) for tags in all_labels]
train_dataset = PIIDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)

# --- Test set -------------------------------------------------------------
# The test documents carry no annotations, so each gets an all-'O' label row.
test_tokens = [doc['tokens'] for doc in test_data]
test_encodings = encode_tokens(tokenizer, test_tokens, max_length=max_length)
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']
test_labels = [
    label_to_id(['O'] * len(words), label_map, max_length)
    for words in test_tokens
]
test_dataset = PIIDataset(test_input_ids, test_attention_masks, test_labels)

# --- DataLoaders ----------------------------------------------------------
# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [3]:
# import torch
# from torch.utils.data import Dataset, DataLoader
# from transformers import DebertaV2Tokenizer
# import json
# from sklearn.model_selection import train_test_split

# # Load your dataset
# with open('/kaggle/input/pii-detection-removal-from-educational-data/train.json', 'r') as f:
#     train_data1 = json.load(f)

# with open('/kaggle/input/pii-detect-illi-train-dataset/illi_data.json', 'r') as f:
#     train_data2 = json.load(f)    
    
# train_data= train_data1 + train_data2

# with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json', 'r') as f:
#     test_data = json.load(f)

# # Preprocessing function to convert data into input IDs, attention masks, and labels
# def preprocess_data(data, tokenizer, max_len, has_labels=True):
#     input_ids = []
#     attention_masks = []
#     labels = []

#     for item in data:
#         tokens = item['tokens']
        
#         if has_labels:
#             label_indices = item.get('labels', [])  # Get 'labels' if it exists, else use an empty list
#             label_indices = [label_map.get(label, label_map['O']) for label in label_indices]  # Convert labels to IDs
#         else:
#             label_indices = [label_map['O']] * len(tokens)  # Assign 'O' label to all tokens if labels are not provided

#         # Join the tokens into a single string
#         text = ' '.join(tokens)

#         # Tokenize the text and obtain the encoded representation
#         encoded = tokenizer(text,
#                             padding='max_length',
#                             truncation=True,
#                             max_length=max_len,
#                             return_attention_mask=True)
        
#         # Ensure the encoded output is padded to `max_len`
#         input_id = encoded['input_ids']
#         attention_mask = encoded['attention_mask']
        
#         # Padding the labels to the same length as `max_len`
#         label = label_indices + [label_map['O']] * (max_len - len(label_indices))
#         label = label[:max_len]  # Truncate if longer than max_len

#         input_ids.append(input_id)
#         attention_masks.append(attention_mask)
#         labels.append(label)

#     # Stack the input_ids, attention_masks, and labels
#     input_ids = torch.stack([torch.tensor(ids) for ids in input_ids])
#     attention_masks = torch.stack([torch.tensor(mask) for mask in attention_masks])
#     labels = torch.tensor(labels)

#     return input_ids, attention_masks, labels

# # Define the PIIDataset class
# class PIIDataset(Dataset):
#     def __init__(self, input_ids, attention_masks, labels):
#         self.input_ids = input_ids
#         self.attention_masks = attention_masks
#         self.labels = [torch.tensor(label, dtype=torch.long) for label in labels]

#     def __getitem__(self, idx):
#         item = {
#             'input_ids': self.input_ids[idx],
#             'attention_mask': self.attention_masks[idx],
#             'labels': self.labels[idx]
#         }
#         return item

#     def __len__(self):
#         return len(self.labels)

# def encode_tokens(tokenizer, text, max_length):
#     encoding = tokenizer(
#         text,
#         is_split_into_words=True,
#         add_special_tokens=True,
#         max_length=max_length,
#         padding='max_length',
#         truncation=True,
#         return_attention_mask=True,
#         return_tensors='pt',
#     )
#     return encoding

# def label_to_id(labels, label_map, max_length):
#     label_ids = [label_map[label] if label in label_map else label_map['O'] for label in labels]
#     label_ids += [label_map['O']] * (max_length - len(labels))
#     return label_ids[:max_length]

# # Your label map
# label_map = {
#     'B-NAME_STUDENT': 0,
#     'I-NAME_STUDENT': 1,
#     'B-EMAIL': 2,
#     'I-EMAIL': 3,
#     'B-USERNAME': 4,
#     'I-USERNAME': 5,
#     'B-ID_NUM': 6,
#     'I-ID_NUM': 7,
#     'B-PHONE_NUM': 8,
#     'I-PHONE_NUM': 9,
#     'B-URL_PERSONAL': 10,
#     'I-URL_PERSONAL': 11,
#     'B-STREET_ADDRESS': 12,
#     'I-STREET_ADDRESS': 13,
#     'O': 14  # 'O' label for non-PII tokens
# }

# # Set num_labels to the number of different labels in your label map
# num_labels = len(label_map)

# # Define your tokenizer and model
# tokenizer = DebertaV2Tokenizer.from_pretrained('/kaggle/input/deberta-v3-small/transformers/v1/1/deberta-v3-small')

# # Split the training data into training and validation subsets
# train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# # Preprocess the data
# max_length = 128

# # Create the training dataset
# train_tokens = [entry['tokens'] for entry in train_data]
# train_labels = [entry['labels'] for entry in train_data]
# train_encodings = encode_tokens(tokenizer, train_tokens, max_length=max_length)
# train_input_ids = train_encodings.input_ids
# train_attention_masks = train_encodings.attention_mask
# train_labels = [label_to_id(labels, label_map, max_length) for labels in train_labels]
# train_dataset = PIIDataset(train_input_ids, train_attention_masks, train_labels)

# # Create the test dataset
# test_tokens = [entry['tokens'] for entry in test_data]
# test_encodings = encode_tokens(tokenizer, test_tokens, max_length=max_length)
# test_input_ids = test_encodings.input_ids
# test_attention_masks = test_encodings.attention_mask
# test_labels = [label_to_id(['O']*len(entry['tokens']), label_map, max_length) for entry in test_data]
# test_dataset = PIIDataset(test_input_ids, test_attention_masks, test_labels)

# # Create the validation dataset
# val_tokens = [entry['tokens'] for entry in val_data]
# val_labels = [entry['labels'] for entry in val_data]
# val_encodings = encode_tokens(tokenizer, val_tokens, max_length=max_length)
# val_input_ids = val_encodings.input_ids
# val_attention_masks = val_encodings.attention_mask
# val_labels = [label_to_id(labels, label_map, max_length) for labels in val_labels]
# val_dataset = PIIDataset(val_input_ids, val_attention_masks, val_labels)

# # Create the DataLoaders
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=16)
# val_loader = DataLoader(val_dataset, batch_size=16)

# Train the model

In [4]:
# from transformers import AdamW
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm
# from transformers import DebertaV2ForTokenClassification
# from sklearn.model_selection import GridSearchCV
# from torch.utils.data import DataLoader
# from sklearn.metrics import fbeta_score

# # Initialize the model
# model = DebertaV2ForTokenClassification.from_pretrained(
#     '/kaggle/input/deberta-v3-small/transformers/v1/1/deberta-v3-small',
#     num_labels=num_labels,
#     ignore_mismatched_sizes=True
# )

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)

# def evaluate(model, val_loader, device):
#     model.eval()
#     true_labels, pred_labels = [], []
#     val_loss = 0.0
#     val_steps = 0
    
#     with torch.no_grad():
#         for batch in val_loader:
#             batch = {k: v.to(device) for k, v in batch.items()}
#             outputs = model(**batch)
#             logits = outputs.logits
#             label_ids = batch['labels']
#             batch_predictions = torch.argmax(logits, dim=-1)
            
#             val_loss += outputs.loss.item()
#             val_steps += 1
            
#             true_labels.extend(label_ids[label_ids != -100].cpu().numpy())
#             pred_labels.extend(batch_predictions[label_ids != -100].cpu().numpy())
    
#     val_loss /= val_steps
#     beta = 5.0
#     fbeta = fbeta_score(true_labels, pred_labels, beta=beta, average='micro')
#     return val_loss, fbeta

# # Define the training function
# def train_model(model, train_loader, val_loader, optimizer, epochs):
#     for epoch in range(epochs):
#         model.train()
#         total_loss = 0
#         progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch")
        
#         for batch in progress_bar:
#             batch = {k: v.to(device) for k, v in batch.items()}
#             outputs = model(**batch)
#             loss = outputs.loss

#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#             total_loss += loss.item()
#             progress_bar.set_postfix({"Loss": loss.item()})

#         avg_loss = total_loss / len(train_loader)
        
#         val_loss, val_fbeta = evaluate(model, val_loader, device)
#         print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}, F-beta: {val_fbeta:.4f}")
    
#     return model

# # Grid search function
# def grid_search(model, train_dataset, val_dataset, param_grid):
#     def train_model(model, train_dataset, val_dataset, params):
#         train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
#         val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)
#         optimizer = AdamW(model.parameters(), lr=params['learning_rate'])
        
#         for epoch in range(params['epochs']):
#             model.train()
#             total_loss = 0
#             progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{params['epochs']}", unit="batch")
            
#             for batch in progress_bar:
#                 batch = {k: v.to(device) for k, v in batch.items()}
#                 outputs = model(**batch)
#                 loss = outputs.loss

#                 optimizer.zero_grad()
#                 loss.backward()
#                 optimizer.step()

#                 total_loss += loss.item()
#                 progress_bar.set_postfix({"Loss": loss.item()})

#             avg_loss = total_loss / len(train_loader)
            
#             val_loss, val_fbeta = evaluate(model, val_loader, device)
#             print(f"Epoch {epoch+1}/{params['epochs']} - Average Loss: {avg_loss:.4f}, F-beta: {val_fbeta:.4f}")
        
#         return model

#     def evaluate_model(model, val_dataset, batch_size):
#         val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
#         val_loss, fbeta = evaluate(model, val_loader, device)
#         return val_loss, fbeta
    
#     best_model = None
#     best_loss = float('inf')
#     best_fbeta = 0.0
    
#     for epochs in param_grid['epochs']:
#         for learning_rate in param_grid['learning_rate']:
#             for batch_size in param_grid['batch_size']:
#                 params = {
#                     'epochs': epochs,
#                     'learning_rate': learning_rate,
#                     'batch_size': batch_size
#                 }
#                 model_copy = model.to(device)
#                 trained_model = train_model(model_copy, train_dataset, val_dataset, params)
                
#                 val_loss, val_fbeta = evaluate_model(trained_model, val_dataset, batch_size)
                
#                 if val_loss < best_loss:
#                     best_loss = val_loss
#                     best_fbeta = val_fbeta
#                     best_model = trained_model
    
#     return best_model, best_loss, best_fbeta

# # Define the hyperparameter grid
# param_grid = {
#     'epochs': [3],
#     'learning_rate': [1e-3],
#     'batch_size': [32]
# }

# # Perform grid search
# best_model, best_loss, best_fbeta = grid_search(model, train_dataset, val_dataset, param_grid)
# print(f"Best model achieved validation loss: {best_loss:.4f} and F-beta score: {best_fbeta:.4f}")

# # Save the best model
# best_model.save_pretrained('model')
# tokenizer.save_pretrained('tokenizer')

In [5]:
# Define the hyperparameter grid
# param_grid = {
#     'epochs': [3, 5, 7],
#     'learning_rate': [1e-3, 1e-4, 1e-5],
#     'batch_size': [8, 16, 32]
# }


In [6]:
from transformers import AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import DebertaV2ForTokenClassification

# Train the model
# Load the local DeBERTa-v3-large checkpoint with a token-classification head
# that has one output per entry of the label map.
model = DebertaV2ForTokenClassification.from_pretrained(
    '/kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large', 
    num_labels=num_labels, 
    ignore_mismatched_sizes=True
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize the optimizer
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the deprecated class's defaults, because
# torch's own defaults (eps=1e-8, weight_decay=0.01) differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

epochs = 3  # Set the number of epochs
for epoch in range(epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch")
    
    for batch in progress_bar:
        # Each batch is a dict of tensors (input_ids, attention_mask, labels);
        # passing labels makes the model return the loss alongside the logits.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"Loss": loss.item()})

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}")

# After training, save the model if you need to use it later
model.save_pretrained('model')
tokenizer.save_pretrained('tokenizer')

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at /kaggle/input/debertav3-large/transformers/v1/1/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 627/627 [08:58<00:00,  1.17batch/s, Loss=0.00549]


Epoch 1/3 - Average Loss: 0.0372


Epoch 2/3: 100%|██████████| 627/627 [08:57<00:00,  1.17batch/s, Loss=0.00293]


Epoch 2/3 - Average Loss: 0.0187


Epoch 3/3: 100%|██████████| 627/627 [08:57<00:00,  1.17batch/s, Loss=0.0104]


Epoch 3/3 - Average Loss: 0.0145


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spm.model',
 'tokenizer/added_tokens.json')

# Predictions for Submission

In [7]:
# Inverse lookup: integer label id -> tag name.
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

# Function to predict the labels using the model
def predict_labels(test_data, model, tokenizer, label_map, device):
    """Predict a label per sequence position and collect the non-'O' ones.

    Args:
        test_data: iterable of dicts with 'tokens' (list of strings) and
            'document' (identifier copied into each output row).
        model: token-classification model; called as
            `model(input_ids, attention_mask=...)` and expected to return an
            object with a `.logits` tensor.
        tokenizer: callable producing 'input_ids' and 'attention_mask' tensors
            for one pre-split document (truncated to 128 positions).
        label_map: label vocabulary in either orientation - id -> name (as the
            notebook passes it) or name -> id. It is normalized to id -> name
            here, so the function no longer depends on a module-level global.
        device: device the model and inputs are moved to.

    Returns:
        List of dicts with keys 'row_id', 'document', 'token' and 'label', one
        per position whose predicted label is known and is not 'O'. `row_id`
        counts emitted rows across all documents.

    NOTE(review): 'token' is the position in the model's input sequence, not
    necessarily the index into entry['tokens'] - confirm this is what the
    consumer of the output expects.
    """
    # Normalize the mapping once: string keys mean name -> id and must be inverted.
    if label_map and isinstance(next(iter(label_map)), str):
        id_to_label = {label_id: name for name, label_id in label_map.items()}
    else:
        id_to_label = dict(label_map)

    model.eval()  # Set the model to evaluation mode
    model.to(device)
    predictions = []
    row_id = 0

    with torch.no_grad():  # Disable gradient calculation
        for entry in test_data:
            encoding = tokenizer(
                entry['tokens'],
                is_split_into_words=True,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128,
                return_attention_mask=True
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            # One document per call, so only the first (sole) batch row is read.
            predicted_ids = torch.argmax(logits, dim=-1).cpu().numpy()[0]

            for idx, pred in enumerate(predicted_ids):
                label = id_to_label.get(int(pred))
                # Skip ids outside the vocabulary and the non-PII 'O' label.
                if label is None or label == 'O':
                    continue
                predictions.append({
                    'row_id': row_id,
                    'document': entry['document'],
                    'token': idx,
                    'label': label
                })
                row_id += 1

    return predictions

# Resolve the inference device first, so the reloaded model is moved to the
# device chosen in this cell rather than to a value left over from an earlier one.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Reload the fine-tuned model and tokenizer from the 'model' / 'tokenizer' directories
model = DebertaV2ForTokenClassification.from_pretrained('model')
tokenizer = DebertaV2Tokenizer.from_pretrained('tokenizer')
model.to(device)

# Predict labels for the test set
preds = predict_labels(test_data, model, tokenizer, reverse_label_map, device)

# Create a DataFrame for the predictions.
# Passing `columns` fixes the column order and still yields a frame with the
# right header when `preds` is empty (column selection on an empty frame would
# raise KeyError).
submission_columns = ['row_id', 'document', 'token', 'label']
pred_df = pd.DataFrame(preds, columns=submission_columns)

# Save the predictions to a CSV file
pred_df.to_csv('submission.csv', index=False)

In [8]:
# # Function to predict the labels using the model
# def predict_labels(test_data, model, tokenizer, label_map, device):
#     model.eval()  # Set the model to evaluation mode
#     model.to(device)
    
#     predictions = []
#     row_id = 0
    
#     with torch.no_grad():  # Disable gradient calculation
#         for entry in test_data:
#             tokens = entry['tokens']
#             encoding = tokenizer(
#                 tokens,
#                 is_split_into_words=True,
#                 return_tensors="pt",
#                 padding=True,
#                 truncation=True,
#                 max_length=128,
#                 return_attention_mask=True
#             )
            
#             input_ids = encoding['input_ids'].to(device)
#             attention_mask = encoding['attention_mask'].to(device)
            
#             outputs = model(input_ids, attention_mask=attention_mask)
#             logits = outputs.logits
            
#             predictions_batch = torch.argmax(logits, dim=-1).cpu().numpy()[0]  # Take the first batch
            
#             valid_label_ids = set(label_map.values())  # Get all valid label IDs
#             print(f"Valid label IDs: {valid_label_ids}")
#             print(f"Predictions batch: {predictions_batch}")
            
#             for idx, pred in enumerate(predictions_batch):
#                 if pred in valid_label_ids:
#                     # Exclude the 'O' label using its ID directly instead of looking up in label_map
#                     if pred != label_map['O']:
#                         predictions.append({
#                             'row_id': row_id,
#                             'document': entry['document'],
#                             'token': tokens[idx],
#                             'label': list(label_map.keys())[list(label_map.values()).index(pred)]
#                         })
            
#             row_id += 1
    
#     return predictions

# # Load the best model
# model = DebertaV2ForTokenClassification.from_pretrained('model')
# tokenizer = DebertaV2Tokenizer.from_pretrained('tokenizer')
# model.to(device)

# print(f"Test data: {test_data}")
# print(f"Label map: {label_map}")

# # Predict labels for the test set
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# preds = predict_labels(test_data, model, tokenizer, label_map, device)

# if not preds:
#     raise ValueError("No predictions were generated. Please check the input data and model.")

# # Create a DataFrame for the predictions
# pred_df = pd.DataFrame(preds)

# # Check if the required columns exist in the DataFrame
# required_columns = ['document', 'token', 'row_id', 'label']
# missing_columns = set(required_columns) - set(pred_df.columns)

# if missing_columns:
#     raise KeyError(f"None of {list(missing_columns)} are in the {list(pred_df.columns)}")
# else:
#     # Ensure the order of the columns matches the required format
#     pred_df = pred_df[required_columns]
    
#     # Save the predictions to a CSV file
#     pred_df.to_csv('submission.csv', index=False)