In [12]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2TokenizerFast, AdamW, get_linear_schedule_with_warmup, AutoModelForTokenClassification
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pad_sequence

Step 2: Load Datasets

In [13]:
# Load datasets
train_data = pd.read_csv("train.csv")
dev_data = pd.read_csv("dev.csv")
test_data = pd.read_csv("test.csv")

# Dataset sizes
print("Train size:", train_data.shape)
print("Dev size:", dev_data.shape)
print("Test size:", test_data.shape)

Train size: (28931, 3)
Dev size: (3460, 3)
Test size: (4143, 3)


Step 3: Combine Data for Global Analysis and Visualize Tag Distribution

In [14]:
# Combine for global analysis
all_data = pd.concat([train_data.assign(split="train"),
                      dev_data.assign(split="dev"),
                      test_data.assign(split="test")])

# Tag distribution per split
tag_distribution = all_data.groupby('split')['tag'].value_counts().unstack().fillna(0)
print("Tag Distribution Across Splits:\n", tag_distribution)

Tag Distribution Across Splits:
 tag       B     I      O
split                   
dev     409   405   2646
test    451   488   3204
train  3440  3562  21929


Step 4: Load Tokenizer (GPT-2 with add_prefix_space=True)

In [15]:
# Load tokenizer with add_prefix_space=True for pre-tokenized inputs
model_name = "distilgpt2"
tokenizer = GPT2TokenizerFast.from_pretrained(model_name, add_prefix_space=True)

Step 5: Preprocess Tags (Convert 'b' to 'B')

In [16]:
# Map tags to indices
tag2id = {"O": 0, "B": 1, "I": 2}
id2tag = {v: k for k, v in tag2id.items()}

# Function to preprocess tags and convert 'b' to 'B'
def preprocess_tags(df):
    df['tag'] = df['tag'].apply(lambda x: 'B' if x == 'b' else x)
    return df

# Preprocess the tags before tokenization
train_data = preprocess_tags(train_data)
dev_data = preprocess_tags(dev_data)
test_data = preprocess_tags(test_data)

Step 6: Tokenize and Align Labels

In [17]:
# Function to tokenize and align labels
def tokenize_and_align_labels(df):
    tokenized_data = {"input_ids": [], "attention_mask": [], "labels": []}

    # Group by abstract_id to process abstracts as units
    grouped = df.groupby("abstract_id")

    for abstract_id, group in grouped:
        words = group["word"].tolist()
        tags = group["tag"].tolist()

        # Tokenize words with pre-tokenized inputs (add prefix space)
        tokenized = tokenizer(words, truncation=True, is_split_into_words=True)

        # Align labels
        labels = []
        word_ids = tokenized.word_ids()
        for word_idx in word_ids:
            if word_idx is None:
                labels.append(-100)  # Special tokens
            else:
                tag = tags[word_idx]
                labels.append(tag2id.get(tag, -100))  # Default to -100 for unexpected tags

        tokenized_data["input_ids"].append(tokenized["input_ids"])
        tokenized_data["attention_mask"].append(tokenized["attention_mask"])
        tokenized_data["labels"].append(labels)

    return tokenized_data

# Process datasets
try:
    train_tokenized = tokenize_and_align_labels(train_data)
    dev_tokenized = tokenize_and_align_labels(dev_data)
    test_tokenized = tokenize_and_align_labels(test_data)
    print("Tokenization completed successfully!")
except Exception as e:
    print("An error occurred during tokenization:", str(e))

Tokenization completed successfully!


Step 7: Create a Custom Dataset Class

In [18]:
# Create a custom Dataset class to handle tokenized data
class NERDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_masks = tokenized_data['attention_mask']
        self.labels = tokenized_data['labels']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_masks[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Convert the tokenized data into Dataset objects
train_dataset = NERDataset(train_tokenized)
dev_dataset = NERDataset(dev_tokenized)
test_dataset = NERDataset(test_tokenized)

Step 8: Load Pre-trained Model

In [19]:
# Load the pre-trained model (DistilGPT-2)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3, ignore_mismatched_sizes=True)

# Set the model to train mode
model.train()

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForTokenClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=3, bias=True)
)

Step 9: Create Custom Collate Function to Pad Sequences
python


In [20]:
# Create a custom collate function to pad sequences
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Ensure tokenizer's pad_token_id is not None
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    
    # Pad sequences
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Pad labels with -100

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

# Create DataLoader for training and validation sets using the custom collate function
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
dev_dataloader = DataLoader(dev_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

Step 10: Set Up Loss Function, Optimizer, and Scheduler

In [21]:
# Class weights (adjust based on your dataset)
class_weights = torch.tensor([1.0, 10.0, 10.0])  # 'O' : 1, 'B' : 10, 'I' : 10

# Loss function with class weights
loss_fn = CrossEntropyLoss(weight=class_weights)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Scheduler
total_steps = len(train_dataloader) * 50  
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



Step 11: Training Loop

In [22]:
# Training loop
epochs = 50

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}/{epochs}"):
        optimizer.zero_grad()

        # Forward pass
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        loss = loss_fn(logits.view(-1, model.config.num_labels), labels.view(-1))
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} - Average Training Loss: {avg_train_loss:.4f}")


Training Epoch 1/50: 100%|██████████| 18/18 [01:35<00:00,  5.28s/it]


Epoch 1 - Average Training Loss: 0.9232


Training Epoch 2/50: 100%|██████████| 18/18 [01:31<00:00,  5.09s/it]


Epoch 2 - Average Training Loss: 0.5943


Training Epoch 3/50: 100%|██████████| 18/18 [01:32<00:00,  5.13s/it]


Epoch 3 - Average Training Loss: 0.4559


Training Epoch 4/50: 100%|██████████| 18/18 [01:34<00:00,  5.26s/it]


Epoch 4 - Average Training Loss: 0.4025


Training Epoch 5/50: 100%|██████████| 18/18 [01:35<00:00,  5.30s/it]


Epoch 5 - Average Training Loss: 0.3606


Training Epoch 6/50: 100%|██████████| 18/18 [01:37<00:00,  5.40s/it]


Epoch 6 - Average Training Loss: 0.3419


Training Epoch 7/50: 100%|██████████| 18/18 [01:32<00:00,  5.14s/it]


Epoch 7 - Average Training Loss: 0.3194


Training Epoch 8/50: 100%|██████████| 18/18 [01:32<00:00,  5.14s/it]


Epoch 8 - Average Training Loss: 0.2888


Training Epoch 9/50: 100%|██████████| 18/18 [01:33<00:00,  5.17s/it]


Epoch 9 - Average Training Loss: 0.2733


Training Epoch 10/50: 100%|██████████| 18/18 [01:31<00:00,  5.07s/it]


Epoch 10 - Average Training Loss: 0.2535


Training Epoch 11/50: 100%|██████████| 18/18 [01:29<00:00,  4.99s/it]


Epoch 11 - Average Training Loss: 0.2358


Training Epoch 12/50: 100%|██████████| 18/18 [01:32<00:00,  5.14s/it]


Epoch 12 - Average Training Loss: 0.2146


Training Epoch 13/50: 100%|██████████| 18/18 [01:32<00:00,  5.14s/it]


Epoch 13 - Average Training Loss: 0.2073


Training Epoch 14/50: 100%|██████████| 18/18 [01:36<00:00,  5.35s/it]


Epoch 14 - Average Training Loss: 0.1870


Training Epoch 15/50: 100%|██████████| 18/18 [01:32<00:00,  5.14s/it]


Epoch 15 - Average Training Loss: 0.1773


Training Epoch 16/50: 100%|██████████| 18/18 [01:30<00:00,  5.02s/it]


Epoch 16 - Average Training Loss: 0.1711


Training Epoch 17/50: 100%|██████████| 18/18 [01:29<00:00,  4.95s/it]


Epoch 17 - Average Training Loss: 0.1574


Training Epoch 18/50: 100%|██████████| 18/18 [01:30<00:00,  5.06s/it]


Epoch 18 - Average Training Loss: 0.1498


Training Epoch 19/50: 100%|██████████| 18/18 [01:31<00:00,  5.08s/it]


Epoch 19 - Average Training Loss: 0.1432


Training Epoch 20/50: 100%|██████████| 18/18 [01:35<00:00,  5.33s/it]


Epoch 20 - Average Training Loss: 0.1411


Training Epoch 21/50: 100%|██████████| 18/18 [01:33<00:00,  5.20s/it]


Epoch 21 - Average Training Loss: 0.1310


Training Epoch 22/50: 100%|██████████| 18/18 [01:30<00:00,  5.01s/it]


Epoch 22 - Average Training Loss: 0.1271


Training Epoch 23/50: 100%|██████████| 18/18 [01:29<00:00,  5.00s/it]


Epoch 23 - Average Training Loss: 0.1216


Training Epoch 24/50: 100%|██████████| 18/18 [01:32<00:00,  5.13s/it]


Epoch 24 - Average Training Loss: 0.1130


Training Epoch 25/50: 100%|██████████| 18/18 [01:30<00:00,  5.05s/it]


Epoch 25 - Average Training Loss: 0.1084


Training Epoch 26/50: 100%|██████████| 18/18 [01:31<00:00,  5.09s/it]


Epoch 26 - Average Training Loss: 0.1088


Training Epoch 27/50: 100%|██████████| 18/18 [01:34<00:00,  5.24s/it]


Epoch 27 - Average Training Loss: 0.1070


Training Epoch 28/50: 100%|██████████| 18/18 [01:35<00:00,  5.31s/it]


Epoch 28 - Average Training Loss: 0.0973


Training Epoch 29/50: 100%|██████████| 18/18 [01:32<00:00,  5.16s/it]


Epoch 29 - Average Training Loss: 0.0953


Training Epoch 30/50: 100%|██████████| 18/18 [01:30<00:00,  5.02s/it]


Epoch 30 - Average Training Loss: 0.0951


Training Epoch 31/50: 100%|██████████| 18/18 [01:32<00:00,  5.14s/it]


Epoch 31 - Average Training Loss: 0.0904


Training Epoch 32/50: 100%|██████████| 18/18 [01:30<00:00,  5.02s/it]


Epoch 32 - Average Training Loss: 0.0908


Training Epoch 33/50: 100%|██████████| 18/18 [01:32<00:00,  5.11s/it]


Epoch 33 - Average Training Loss: 0.0871


Training Epoch 34/50: 100%|██████████| 18/18 [01:35<00:00,  5.31s/it]


Epoch 34 - Average Training Loss: 0.0861


Training Epoch 35/50: 100%|██████████| 18/18 [01:30<00:00,  5.05s/it]


Epoch 35 - Average Training Loss: 0.0825


Training Epoch 36/50: 100%|██████████| 18/18 [01:32<00:00,  5.13s/it]


Epoch 36 - Average Training Loss: 0.0824


Training Epoch 37/50: 100%|██████████| 18/18 [01:31<00:00,  5.08s/it]


Epoch 37 - Average Training Loss: 0.0787


Training Epoch 38/50: 100%|██████████| 18/18 [01:32<00:00,  5.14s/it]


Epoch 38 - Average Training Loss: 0.0780


Training Epoch 39/50: 100%|██████████| 18/18 [01:31<00:00,  5.11s/it]


Epoch 39 - Average Training Loss: 0.0746


Training Epoch 40/50: 100%|██████████| 18/18 [01:29<00:00,  4.97s/it]


Epoch 40 - Average Training Loss: 0.0759


Training Epoch 41/50: 100%|██████████| 18/18 [01:32<00:00,  5.12s/it]


Epoch 41 - Average Training Loss: 0.0733


Training Epoch 42/50: 100%|██████████| 18/18 [01:28<00:00,  4.91s/it]


Epoch 42 - Average Training Loss: 0.0733


Training Epoch 43/50: 100%|██████████| 18/18 [01:31<00:00,  5.10s/it]


Epoch 43 - Average Training Loss: 0.0723


Training Epoch 44/50: 100%|██████████| 18/18 [01:28<00:00,  4.93s/it]


Epoch 44 - Average Training Loss: 0.0697


Training Epoch 45/50: 100%|██████████| 18/18 [01:29<00:00,  4.98s/it]


Epoch 45 - Average Training Loss: 0.0693


Training Epoch 46/50: 100%|██████████| 18/18 [01:28<00:00,  4.89s/it]


Epoch 46 - Average Training Loss: 0.0711


Training Epoch 47/50: 100%|██████████| 18/18 [01:28<00:00,  4.94s/it]


Epoch 47 - Average Training Loss: 0.0676


Training Epoch 48/50: 100%|██████████| 18/18 [01:32<00:00,  5.17s/it]


Epoch 48 - Average Training Loss: 0.0703


Training Epoch 49/50: 100%|██████████| 18/18 [01:34<00:00,  5.23s/it]


Epoch 49 - Average Training Loss: 0.0676


Training Epoch 50/50: 100%|██████████| 18/18 [01:31<00:00,  5.06s/it]

Epoch 50 - Average Training Loss: 0.0686





Step 12: Evaluation Function

In [23]:
# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        # Collect predictions
        preds = outputs.logits.argmax(dim=-1)
        all_preds.extend(preds.cpu().numpy().flatten())
        all_labels.extend(labels.cpu().numpy().flatten())

    # Flatten the lists
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Mask out padding tokens
    mask = all_labels != -100
    all_preds = all_preds[mask]
    all_labels = all_labels[mask]

    # Print classification report
    print(classification_report(all_labels, all_preds, target_names=list(id2tag.values())))

    # Calculate and print accuracy
    accuracy = np.sum(all_preds == all_labels) / len(all_labels)
    print(f"Accuracy: {accuracy:.4f}")

# Evaluate the model on the dev set
evaluate(model, dev_dataloader)

Evaluating: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]

              precision    recall  f1-score   support

           O       0.95      0.85      0.90      2747
           B       0.61      0.81      0.69       604
           I       0.65      0.78      0.71       496

    accuracy                           0.83      3847
   macro avg       0.74      0.81      0.77      3847
weighted avg       0.86      0.83      0.84      3847

Accuracy: 0.8347





tep 13: Save the Model and Tokenizer

In [24]:
# Save the model and tokenizer
model.save_pretrained("/path/to/save/model")
tokenizer.save_pretrained("/path/to/save/tokenizer") 

('/path/to/save/tokenizer\\tokenizer_config.json',
 '/path/to/save/tokenizer\\special_tokens_map.json',
 '/path/to/save/tokenizer\\vocab.json',
 '/path/to/save/tokenizer\\merges.txt',
 '/path/to/save/tokenizer\\added_tokens.json',
 '/path/to/save/tokenizer\\tokenizer.json')

TESTING WITH REAL ABSTRACT

In [26]:
# Function to predict entities and align tokens with their tags
def predict_entities(sentence):
    # Ensure the tokenizer has a pad_token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as padding
    
    # Tokenize the sentence and get the predicted entity tags
    tokenized_input = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, is_split_into_words=False)
    input_ids = tokenized_input["input_ids"].to(model.device)
    attention_mask = tokenized_input["attention_mask"].to(model.device)
    
    # Get model outputs
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).cpu().numpy().flatten()

    # Align tokens with predicted labels
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())
    predicted_tags = [id2tag[pred] for pred in predictions]

    # Merge subwords with the same entity tag
    result = []
    previous_tag = None
    current_word = []
    
    for token, tag in zip(tokens, predicted_tags):
        clean_token = token.lstrip("Ġ")  # Remove leading 'Ġ' character
        
        # If it's a subword token (part of a larger word), we append it to the current word
        if token.startswith("Ġ"):
            if current_word:
                result.append(("".join(current_word), previous_tag))
            current_word = [clean_token]
            previous_tag = tag
        else:
            current_word.append(clean_token)

    if current_word:
        result.append(("".join(current_word), previous_tag))  # Add the last word

    return result

# Example sentence to test the model
sentence = "NLP bridges the gap between human communication and computer understanding by analyzing and interpreting language data. Sentiment analysis, a popular application of NLP, helps companies understand customer opinions from reviews and social media posts."

# Get predicted entities for the sentence
predicted_entities = predict_entities(sentence)

# Print the results without 'Ġ'
for token, tag in predicted_entities:
    print(f"{token}: {tag}")


NLP: B
bridges: I
the: O
gap: O
between: O
human: B
communication: I
and: O
computer: B
understanding: I
by: O
analyzing: O
and: O
interpreting: O
language: O
data.: O
Sentiment: B
analysis,: I
a: O
popular: O
application: O
of: O
NLP,: B
helps: O
companies: O
understand: O
customer: B
opinions: O
from: O
reviews: O
and: O
social: B
media: O
posts.: O


In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pickle

# Define id2tag and tag2id (adjust these based on your dataset)
id2tag = {0: "O", 1: "B", 2: "I"}  # Example mapping
tag2id = {v: k for k, v in id2tag.items()}

# Load the tokenizer and model architecture
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForTokenClassification.from_pretrained("distilgpt2", num_labels=len(id2tag))

# Then proceed to save the model and tokenizer
model_data = {
    "model_state_dict": model.state_dict(),
    "tokenizer": tokenizer,
    "id2tag": id2tag,
    "tag2id": tag2id
}

# Save to a .pkl file
with open("distilgpt2.pkl", "wb") as f:
    pickle.dump(model_data, f)

print("Model and tokenizer saved as 'distilgpt2.pkl'.")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer saved as 'distilgpt2.pkl'.
