In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import LongformerTokenizer, LongformerModel, AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Load the biomedical NER model
ner_tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
ner_model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [3]:
ner_tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
ner_model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

# Extract all unique entity types
all_entity_types = ner_model.config.id2label.values()

# Process entity types to remove 'B-' and 'I-' prefixes and create a unique set
unique_entity_types = set()
for entity_type in all_entity_types:
    if entity_type.startswith('B-') or entity_type.startswith('I-'):
        unique_entity_types.add(entity_type[2:])
    elif entity_type != 'O':  # Ignore 'O' (Outside) tag
        unique_entity_types.add(entity_type)

# Generate start and end tokens for each unique entity type
new_tokens = []
for entity_type in unique_entity_types:
    new_tokens.extend([f'\\s_{entity_type.lower()}', f'\\e_{entity_type.lower()}'])

# Add question start and end tokens
new_tokens.extend(['\\sq', '\\eq'])

print(f"Added {len(new_tokens)} new tokens:")
for token in new_tokens:
    print(token)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Added 88 new tokens:
\s_volume
\e_volume
\s_age
\e_age
\s_dosage
\e_dosage
\s_subject
\e_subject
\s_qualitative_concept
\e_qualitative_concept
\s_history
\e_history
\s_disease_disorder
\e_disease_disorder
\s_occupation
\e_occupation
\s_family_history
\e_family_history
\s_activity
\e_activity
\s_outcome
\e_outcome
\s_severity
\e_severity
\s_frequency
\e_frequency
\s_personal_[back](biological_structure
\e_personal_[back](biological_structure
\s_area
\e_area
\s_detailed_description
\e_detailed_description
\s_biological_attribute
\e_biological_attribute
\s_non[biological](detailed_description
\e_non[biological](detailed_description
\s_diagnostic_procedure
\e_diagnostic_procedure
\s_therapeutic_procedure
\e_therapeutic_procedure
\s_height
\e_height
\s_shape
\e_shape
\s_time
\e_time
\s_weight
\e_weight
\s_sign_symptom
\e_sign_symptom
\s_color
\e_color
\s_other_entity
\e_other_entity
\s_medication
\e_medication
\s_administration
\e_administration
\s_date
\e_date
\s_duration
\e_duration
\s_la

In [4]:


# Define the Longformer-based classifier
class LongformerClassifier(nn.Module):
    def __init__(self, model_name, num_classes):
        super(LongformerClassifier, self).__init__()
        self.longformer = LongformerModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.05)
        self.fc = nn.Linear(self.longformer.config.hidden_size, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.longformer(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token representation
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        
        return logits, loss


In [5]:


# Define the dataset
class MCQDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data.iloc[idx]['processed_prompt']
        label = self.data.iloc[idx]['best_model']

        encoding = self.tokenizer.encode_plus(
            prompt,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [6]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import LongformerTokenizer, LongformerModel, AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch.utils.tensorboard import SummaryWriter
from torch.amp import autocast, GradScaler

In [7]:
import torch
from torch.optim import AdamW
from torch.amp import autocast, GradScaler
from transformers import LongformerTokenizer, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from tqdm import tqdm
import warnings
import os
import numpy as np

# Suppress warnings
warnings.filterwarnings("ignore")

def training(experiment_name, lr, train_loader, test_loader, epochs=30):
    writer = SummaryWriter(experiment_name)
    
    # Initialize the tokenizer and model
    model_name = "allenai/longformer-base-4096"
    tokenizer = LongformerTokenizer.from_pretrained(model_name)
    num_added_tokens = tokenizer.add_tokens(new_tokens)
    print(f"Number of tokens added: {num_added_tokens}")
    
    num_classes = 3  # Define num_classes here
    model = LongformerClassifier(model_name, num_classes=num_classes)
    model.longformer.resize_token_embeddings(len(tokenizer))
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Using device: {device}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    
    # Optimizer and scheduler setup
    num_training_steps = epochs * len(train_loader)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.2)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=num_training_steps)
    
    # Initialize the GradScaler for mixed precision training
    scaler = GradScaler('cuda')
    
    # Create directory for saving models
    os.makedirs(experiment_name, exist_ok=True)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        all_train_labels = []
        all_train_predictions = []
        
        # Use tqdm for progress bar
        train_iterator = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} [Train]", leave=False)
        
        for batch in train_iterator:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            
            # Use autocast for mixed precision
            with autocast('cuda'):
                logits, loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            
            # Scale the loss and call backward()
            scaler.scale(loss).backward()
            
            # Unscale the gradients, clip them, and update the parameters
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            
            # Update the scaler
            scaler.update()
            
            scheduler.step()
            
            total_train_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)
            all_train_labels.extend(labels.cpu().numpy())
            all_train_predictions.extend(predictions.cpu().numpy())
            
            # Update tqdm progress bar
            train_iterator.set_postfix({'loss': f"{loss.item():.4f}"})
        
        # Calculate training metrics
        train_accuracy = accuracy_score(all_train_labels, all_train_predictions)
        train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(all_train_labels, all_train_predictions, average='weighted', zero_division=0)
        
        writer.add_scalar('Loss/train', total_train_loss / len(train_loader), epoch)
        writer.add_scalar('Accuracy/train', train_accuracy, epoch)
        writer.add_scalar('F1/train', train_f1, epoch)
        
        print(f"\nEpoch {epoch + 1}")
        print(f"Training - Loss: {total_train_loss / len(train_loader):.4f}, Accuracy: {train_accuracy:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}")
        
        # Evaluation on test set
        model.eval()
        total_test_loss = 0
        all_test_labels = []
        all_test_predictions = []
        
        test_iterator = tqdm(test_loader, desc=f"Epoch {epoch + 1}/{epochs} [Test]", leave=False)
        
        with torch.no_grad():
            for batch in test_iterator:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)
                
                # Use autocast for mixed precision
                with autocast('cuda'):
                    logits, loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                
                total_test_loss += loss.item()
                predictions = torch.argmax(logits, dim=-1)
                all_test_labels.extend(labels.cpu().numpy())
                all_test_predictions.extend(predictions.cpu().numpy())
                
                # Update tqdm progress bar
                test_iterator.set_postfix({'loss': f"{loss.item():.4f}"})
        
        # Calculate test metrics
        test_accuracy = accuracy_score(all_test_labels, all_test_predictions)
        test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(all_test_labels, all_test_predictions, average='weighted', zero_division=0)
        
        writer.add_scalar('Loss/test', total_test_loss / len(test_loader), epoch)
        writer.add_scalar('Accuracy/test', test_accuracy, epoch)
        writer.add_scalar('F1/test', test_f1, epoch)
        
        print(f"\nTest - Loss: {total_test_loss / len(test_loader):.4f}, Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1: {test_f1:.4f}")
        
        # Print per-class metrics for testing
        class_names = ['medalpaca', 'biomistral', 'meditron']
        test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
            all_test_labels, all_test_predictions, 
            average=None, zero_division=0, labels=range(num_classes)
        )
        
        for i, class_name in enumerate(class_names):
            print(f"  {class_name} - Precision: {test_precision[i]:.4f}, Recall: {test_recall[i]:.4f}, F1: {test_f1[i]:.4f}")
        
        # Save the model after each epoch
        torch.save(model.state_dict(), f'/home/ubuntu/nlp/models/model')
        print(f"Model saved: {experiment_name}/model_epoch_{epoch+1}.pt")

    print("Training completed.")

In [8]:

# Load and preprocess the data
df = pd.read_csv('mcq_data_with_custom_ner_tags_cleaned.csv')
df =df.sample(4000)
# Prepare the data
model_name = "allenai/longformer-base-4096"
tokenizer = LongformerTokenizer.from_pretrained(model_name)
max_length = 3400  # Longformer can handle up to 4096, adjust based on your needs

# Map string labels to integers
label_map = {'medalpaca': 0, 'biomistral': 1, 'meditron': 2}
df['best_model'] = df['best_model'].map(label_map)

# Split the data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create datasets
train_dataset = MCQDataset(train_df, tokenizer, max_length)
test_dataset = MCQDataset(test_df, tokenizer, max_length)

# Create data loaders
batch_size = 3
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [9]:
# Run training
learning_rates = [1e-5]  # You can adjust this list of learning rates
for lr in learning_rates:
    training(f'runs/longformer_biomedical_experiment_lr_{str(lr)}', lr, train_loader, test_loader)

Number of tokens added: 88
Using device: cuda
GPU: Tesla T4


Epoch 1/30 [Train]:   0%|          | 0/1067 [00:00<?, ?it/s]Input ids are automatically padded to be a multiple of `config.attention_window`: 512
                                                                                    


Epoch 1
Training - Loss: 1.0648, Accuracy: 0.4672, Precision: 0.3195, Recall: 0.4672, F1: 0.3199


                                                                                 


Test - Loss: 1.0881, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_1.pt


                                                                                    


Epoch 2
Training - Loss: 1.0640, Accuracy: 0.4709, Precision: 0.3098, Recall: 0.4709, F1: 0.3047


                                                                                 


Test - Loss: 1.0631, Accuracy: 0.3600, Precision: 0.3769, Recall: 0.3600, F1: 0.1975
  medalpaca - Precision: 0.3581, Recall: 0.9965, F1: 0.5269
  biomistral - Precision: 0.5714, Recall: 0.0115, F1: 0.0225
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_2.pt


                                                                                    


Epoch 3
Training - Loss: 1.0653, Accuracy: 0.4716, Precision: 0.5480, Recall: 0.4716, F1: 0.3095


                                                                                 


Test - Loss: 1.0720, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_3.pt


                                                                                    


Epoch 4
Training - Loss: 1.0667, Accuracy: 0.4713, Precision: 0.3303, Recall: 0.4713, F1: 0.3143


                                                                                 


Test - Loss: 1.0604, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_4.pt


                                                                                    


Epoch 5
Training - Loss: 1.0647, Accuracy: 0.4728, Precision: 0.3756, Recall: 0.4728, F1: 0.3044


                                                                                 


Test - Loss: 1.0792, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_5.pt


                                                                                    


Epoch 6
Training - Loss: 1.0648, Accuracy: 0.4722, Precision: 0.3753, Recall: 0.4722, F1: 0.3034


                                                                                 


Test - Loss: 1.0634, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_6.pt


                                                                                    


Epoch 7
Training - Loss: 1.0634, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                 


Test - Loss: 1.0617, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_7.pt


                                                                                    


Epoch 8
Training - Loss: 1.0635, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                 


Test - Loss: 1.0574, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_8.pt


                                                                                    


Epoch 9
Training - Loss: 1.0571, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                 


Test - Loss: 1.0769, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_9.pt


                                                                                     


Epoch 10
Training - Loss: 1.0627, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                  


Test - Loss: 1.0647, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_10.pt


                                                                                     


Epoch 11
Training - Loss: 1.0604, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                  


Test - Loss: 1.0585, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_11.pt


                                                                                     


Epoch 12
Training - Loss: 1.0623, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                  


Test - Loss: 1.0605, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_12.pt


                                                                                     


Epoch 13
Training - Loss: 1.0650, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                  


Test - Loss: 1.0634, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_13.pt


                                                                                     


Epoch 14
Training - Loss: 1.0601, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                  


Test - Loss: 1.0599, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_14.pt


                                                                                     


Epoch 15
Training - Loss: 1.0624, Accuracy: 0.4722, Precision: 0.2230, Recall: 0.4722, F1: 0.3029


                                                                                  


Test - Loss: 1.0632, Accuracy: 0.4363, Precision: 0.1903, Recall: 0.4363, F1: 0.2650
  medalpaca - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
  biomistral - Precision: 0.4363, Recall: 1.0000, F1: 0.6075
  meditron - Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Model saved: runs/longformer_biomedical_experiment_lr_1e-05/model_epoch_15.pt


Epoch 16/30 [Train]: 100%|█████████▉| 1066/1067 [36:37<00:02,  2.06s/it, loss=1.2821]

In [None]:
!nvidia-smi

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import LongformerTokenizer
import pandas as pd
from tqdm import tqdm

# Assuming you have already defined LongformerClassifier and MCQDataset classes

def generate_predictions_csv(model_path, data_csv_path, output_csv_path, batch_size=8):
    # Load the trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LongformerClassifier("allenai/longformer-base-4096", num_classes=3)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Load and prepare the data
    df = pd.read_csv(data_csv_path)
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    max_length = 3400

    # Add new tokens if they were used during training
    tokenizer.add_tokens(new_tokens)  # Assuming new_tokens is defined in the global scope

    # Create dataset and dataloader
    dataset = MCQDataset(df, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Make predictions
    all_predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            with torch.cuda.amp.autocast():
                logits, _ = model(input_ids=input_ids, attention_mask=attention_mask)
            
            predictions = torch.argmax(logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())

    # Map predictions back to model names
    label_map = {0: 'medalpaca', 1: 'biomistral', 2: 'meditron'}
    predicted_models = [label_map[pred] for pred in all_predictions]

    # Create a new dataframe with prompts and predictions
    output_df = pd.DataFrame({
        'prompt': df['processed_prompt'],
        'predicted_best_model': predicted_models
    })

    # Save to CSV
    output_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

# Usage
model_path = r'/home/ubuntu/nlp/models/model'  # Replace with your model's path
data_csv_path = 'mcq_data_with_custom_ner_tags_cleaned.csv'  # Your input CSV
output_csv_path = 'prompts_with_predictions.csv'  # Name of the output CSV

generate_predictions_csv(model_path, data_csv_path, output_csv_path)

RuntimeError: Error(s) in loading state_dict for LongformerClassifier:
	size mismatch for longformer.embeddings.word_embeddings.weight: copying a param with shape torch.Size([50353, 768]) from checkpoint, the shape in current model is torch.Size([50265, 768]).

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import LongformerTokenizer
import pandas as pd
from tqdm import tqdm

# Assuming you have already defined LongformerClassifier and MCQDataset classes

def generate_predictions_csv(model_path, data_csv_path, output_csv_path, batch_size=8):
    # Initialize tokenizer and add custom tokens
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    
    # Define new_tokens (make sure this matches what you used during training)
    new_tokens = ['\sq', '\eq'] + [f'\\s_{entity.lower()}' for entity in ['GENE', 'DISEASE', 'CHEMICAL', 'SPECIES', 'CELL_LINE', 'DNA', 'RNA', 'CELL_TYPE', 'PROTEIN']] + [f'\\e_{entity.lower()}' for entity in ['GENE', 'DISEASE', 'CHEMICAL', 'SPECIES', 'CELL_LINE', 'DNA', 'RNA', 'CELL_TYPE', 'PROTEIN']]
    
    num_added_tokens = tokenizer.add_tokens(new_tokens)
    print(f"Number of tokens added: {num_added_tokens}")

    # Load the trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LongformerClassifier("allenai/longformer-base-4096", num_classes=3)
    
    # Resize token embeddings before loading state dict
    model.longformer.resize_token_embeddings(len(tokenizer))
    
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Load and prepare the data
    df = pd.read_csv(data_csv_path)
    max_length = 3400

    # Create dataset and dataloader
    dataset = MCQDataset(df, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Make predictions
    all_predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            with torch.cuda.amp.autocast():
                logits, _ = model(input_ids=input_ids, attention_mask=attention_mask)
            
            predictions = torch.argmax(logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())

    # Map predictions back to model names
    label_map = {0: 'medalpaca', 1: 'biomistral', 2: 'meditron'}
    predicted_models = [label_map[pred] for pred in all_predictions]

    # Create a new dataframe with prompts and predictions
    output_df = pd.DataFrame({
        'prompt': df['processed_prompt'],
        'predicted_best_model': predicted_models
    })

    # Save to CSV
    output_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

# Usage
model_path = r'/home/ubuntu/nlp/model.pth'  # Replace with your model's path
data_csv_path = 'mcq_data_with_custom_ner_tags_cleaned.csv'  # Your input CSV
output_csv_path = 'prompts_with_predictions.csv'  # Name of the output CSV

generate_predictions_csv(model_path, data_csv_path, output_csv_path)

Number of tokens added: 20


RuntimeError: Error(s) in loading state_dict for LongformerClassifier:
	size mismatch for longformer.embeddings.word_embeddings.weight: copying a param with shape torch.Size([50353, 768]) from checkpoint, the shape in current model is torch.Size([50285, 768]).

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import LongformerTokenizer, LongformerModel
import pandas as pd
from tqdm import tqdm

# Assuming you have already defined MCQDataset class

class LongformerClassifier(torch.nn.Module):
    def __init__(self, model_name, num_classes):
        super(LongformerClassifier, self).__init__()
        self.longformer = LongformerModel.from_pretrained(model_name)
        self.classifier = torch.nn.Linear(self.longformer.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.longformer(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(pooled_output)
        return logits

def resize_embeddings(model, new_num_tokens):
    old_embeddings = model.longformer.embeddings.word_embeddings
    new_embeddings = torch.nn.Embedding(new_num_tokens, old_embeddings.embedding_dim)
    new_embeddings.to(old_embeddings.weight.device)

    # Copy the original embeddings
    num_tokens_to_copy = min(old_embeddings.num_embeddings, new_num_tokens)
    new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]

    model.longformer.embeddings.word_embeddings = new_embeddings
    model.longformer.config.vocab_size = new_num_tokens

def generate_predictions_csv(model_path, data_csv_path, output_csv_path, batch_size=8):
    # Initialize tokenizer and add custom tokens
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    
    # Define new_tokens (make sure this matches what you used during training)
    new_tokens = ['\sq', '\eq'] + [f'\\s_{entity.lower()}' for entity in ['GENE', 'DISEASE', 'CHEMICAL', 'SPECIES', 'CELL_LINE', 'DNA', 'RNA', 'CELL_TYPE', 'PROTEIN']] + [f'\\e_{entity.lower()}' for entity in ['GENE', 'DISEASE', 'CHEMICAL', 'SPECIES', 'CELL_LINE', 'DNA', 'RNA', 'CELL_TYPE', 'PROTEIN']]
    
    num_added_tokens = tokenizer.add_tokens(new_tokens)
    print(f"Number of tokens added: {num_added_tokens}")

    # Load the trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LongformerClassifier("allenai/longformer-base-4096", num_classes=3)
    
    # Load state dict
    state_dict = torch.load(model_path, map_location=device)
    
    # Get the vocabulary size from the saved model
    saved_vocab_size = state_dict['longformer.embeddings.word_embeddings.weight'].size(0)
    
    # Resize model embeddings if necessary
    if saved_vocab_size != len(tokenizer):
        print(f"Resizing model embeddings from {len(tokenizer)} to {saved_vocab_size}")
        resize_embeddings(model, saved_vocab_size)
    
    # Load the state dict
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    # Load and prepare the data
    df = pd.read_csv(data_csv_path)
    max_length = 3400

    # Create dataset and dataloader
    dataset = MCQDataset(df, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Make predictions
    all_predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            
            predictions = torch.argmax(logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())

    # Map predictions back to model names
    label_map = {0: 'medalpaca', 1: 'biomistral', 2: 'meditron'}
    predicted_models = [label_map[pred] for pred in all_predictions]

    # Create a new dataframe with prompts and predictions
    output_df = pd.DataFrame({
        'prompt': df['processed_prompt'],
        'predicted_best_model': predicted_models
    })

    # Save to CSV
    output_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

# Usage
model_path = r'/home/ubuntu/nlp/model.pth'  # Replace with your model's path
data_csv_path = 'mcq_data_with_custom_ner_tags_cleaned.csv'  # Your input CSV
output_csv_path = 'prompts_with_predictions.csv'  # Name of the output CSV

generate_predictions_csv(model_path, data_csv_path, output_csv_path)

Number of tokens added: 20
Resizing model embeddings from 50285 to 50353


RuntimeError: Error(s) in loading state_dict for LongformerClassifier:
	Missing key(s) in state_dict: "classifier.weight", "classifier.bias". 
	Unexpected key(s) in state_dict: "fc.weight", "fc.bias". 

In [None]:
print(len(tokenizer))

50265


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerTokenizer, LongformerModel
import pandas as pd
from tqdm import tqdm

class MCQDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = {'medalpaca': 0, 'biomistral': 1, 'meditron': 2}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data.iloc[idx]['processed_prompt']
        label = self.data.iloc[idx]['best_model']

        encoding = self.tokenizer.encode_plus(
            prompt,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(self.label_map[label], dtype=torch.long)
        }

# [The LongformerClassifier class remains the same as in the previous response]

def generate_predictions_csv(model_path, data_csv_path, output_csv_path, batch_size=8):
    # Initialize tokenizer and add custom tokens
    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    
    # Define new_tokens (make sure this matches exactly what you used during training)
    new_tokens = ['\sq', '\eq'] + [f'\\s_{entity.lower()}' for entity in ['GENE', 'DISEASE', 'CHEMICAL', 'SPECIES', 'CELL_LINE', 'DNA', 'RNA', 'CELL_TYPE', 'PROTEIN']] + [f'\\e_{entity.lower()}' for entity in ['GENE', 'DISEASE', 'CHEMICAL', 'SPECIES', 'CELL_LINE', 'DNA', 'RNA', 'CELL_TYPE', 'PROTEIN']]
    
    num_added_tokens = tokenizer.add_tokens(new_tokens)
    print(f"Number of tokens added: {num_added_tokens}")
    print(f"Tokenizer vocabulary size after adding tokens: {len(tokenizer)}")

    # Load the trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LongformerClassifier("allenai/longformer-base-4096", num_classes=3)
    
    # Load state dict
    state_dict = torch.load(model_path, map_location=device)
    
    # Get the vocabulary size from the saved model
    saved_vocab_size = state_dict['longformer.embeddings.word_embeddings.weight'].size(0)
    print(f"Saved model vocabulary size: {saved_vocab_size}")
    
    # Resize model embeddings if necessary
    if saved_vocab_size != len(tokenizer):
        print(f"Resizing model embeddings from {len(tokenizer)} to {saved_vocab_size}")
        model.longformer.resize_token_embeddings(saved_vocab_size)
    
    # Load the state dict
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    # Load and prepare the data
    df = pd.read_csv(data_csv_path)
    max_length = 3400

    # Create dataset and dataloader
    dataset = MCQDataset(df, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Make predictions
    all_predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating predictions"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            
            predictions = torch.argmax(logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())

    # Map predictions back to model names
    label_map = {0: 'medalpaca', 1: 'biomistral', 2: 'meditron'}
    predicted_models = [label_map[pred] for pred in all_predictions]

    # Create a new dataframe with prompts and predictions
    output_df = pd.DataFrame({
        'prompt': df['processed_prompt'],
        'predicted_best_model': predicted_models
    })

    # Save to CSV
    output_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

# Usage
model_path = r'/home/ubuntu/nlp/model.pth'  # Replace with your model's path
data_csv_path = 'mcq_data_with_custom_ner_tags_cleaned.csv'  # Your input CSV
output_csv_path = 'prompts_with_predictions.csv'  # Name of the output CSV

generate_predictions_csv(model_path, data_csv_path, output_csv_path)

Number of tokens added: 20
Tokenizer vocabulary size after adding tokens: 50285
Saved model vocabulary size: 50353
Resizing model embeddings from 50285 to 50353


Generating predictions: 100%|██████████| 1255/1255 [52:17<00:00,  2.50s/it]


Predictions saved to prompts_with_predictions.csv
