## NOTES:
[GPT-2 Classification Getting Started](https://www.kaggle.com/code/andres6garzon/getting-started-nlp-classification-using-gpt-2)

[GPT-2 (Medium)](https://huggingface.co/openai-community/gpt2-medium) (model card kept for reference; the code below loads the base `gpt2` checkpoint)

In [1]:
import pandas as pd
import time
from datasets import Dataset, load_dataset
import numpy as np
import torch
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, classification_report
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)
import re

Using TensorFlow backend.





In [2]:
# Category names available for each language; the name lists are kept verbatim.
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}

# The languages are exactly the keys above; dicts preserve insertion order,
# so this yields ['java', 'python', 'pharo'].
langs = list(labels)

# Fetch every split of the dataset from the Hugging Face Hub.
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')

In [3]:
# Default maximum number of *characters* kept per example (not tokens).
truncate_length = 512


def _clean_text(text):
    """Strip markup and comment decoration from one raw comment string."""
    # remove html tags
    text = re.sub(r'<.*?>', '', text)
    # remove bullets (a whitespace character followed by '*')
    # NOTE(review): this also consumes the ' *' of a ' */' terminator, so the
    # '*/' alternative further down cannot match it and a lone '/' is left
    # behind. Kept as-is so the cleaned text stays identical to earlier runs.
    text = re.sub(r'\s\*', '', text)
    # remove curly braced sections
    text = re.sub(r'\{.*?\}', '', text)
    # remove // comments (this matches '//' anywhere, including inside URLs)
    text = re.sub(r'\s*\/\/.*', '', text)
    # remove formatting for /* */ comments
    text = re.sub(r'\/\*.|\*\/', '', text)
    # collapse every run of whitespace into a single space
    return re.sub(r'\s+', ' ', text)


def _truncate_middle(text, limit):
    """Cut the middle out of ``text`` when it is longer than ``limit`` characters."""
    if len(text) <= limit:
        return text
    keep = int(limit / 2) - 4
    if keep <= 0:
        # The limit is too small to hold head + ' ... ' + tail. The slice
        # arithmetic below would degenerate (text[-0:] is the whole string),
        # so fall back to a plain head cut.
        return text[:limit]
    return text[:keep] + ' ... ' + text[-keep:]


def preprocess(texts, labels=None, max_chars=None):
    """Clean raw comment strings and pair each one with its label.

    Args:
        texts: Indexable, sized collection of strings.
        labels: Optional collection with one label per text. When omitted,
            every record receives the placeholder label ``0``.
        max_chars: Character limit applied after cleaning. Defaults to the
            module-level ``truncate_length``. Longer texts keep their head
            and tail joined by ``' ... '``.

    Returns:
        A list of ``{'text': str, 'label': ...}`` dicts in input order.

    Raises:
        ValueError: If ``max_chars`` is not positive, or if ``labels`` is
            given and its length differs from ``texts``.
    """
    limit = truncate_length if max_chars is None else max_chars
    if limit <= 0:
        raise ValueError(f'max_chars must be positive, got {limit}')
    if labels is not None and len(labels) != len(texts):
        # Misaligned labels would silently attach the wrong class to examples.
        raise ValueError(
            f'texts and labels differ in length: {len(texts)} != {len(labels)}'
        )

    output = []
    for i in tqdm(range(len(texts))):
        output.append({
            'text': _truncate_middle(_clean_text(texts[i]), limit),
            'label': labels[i] if labels is not None else 0,
        })

    return output

## Getting Started

In [4]:
# Params
seed = 42          # random seed, applied below
batch_size = 8     # examples per DataLoader batch
max_length = 512   # tokenizer padding/truncation length, in tokens
num_labels = 7     # size of the classification head; equals len(labels['java'])
num_epochs = 1     # full passes over the training set

# Apply the seed before the model and DataLoaders are created, so weight
# initialisation, dropout and shuffling are repeatable from a fresh kernel.
set_seed(seed)

In [5]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [6]:
class GPT2_collator:
    """DataLoader ``collate_fn`` that tokenizes a list of records into one batch.

    Each record is a mapping with a ``'text'`` and a ``'label'`` entry.
    """

    def __init__(self, tokenizer, max_length):
        # Callable used to encode the texts, and the fixed length every
        # sequence is padded or truncated to.
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, sequences):
        """Encode ``sequences`` and attach their labels under ``'labels'``."""
        batch_texts = []
        batch_labels = []
        for record in sequences:
            batch_texts.append(record['text'])
            batch_labels.append(record['label'])

        # Every sequence is padded or truncated to exactly ``max_length``.
        encoded = self.tokenizer(
            text=batch_texts,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        encoded['labels'] = torch.tensor(batch_labels)

        return encoded
# =================================================================================================================================
def train(dataloader, optimizer, scheduler, device, max_batches=None):
    """Run one training pass of the global ``model`` over ``dataloader``.

    Args:
        dataloader: Sized iterable of batches. Each batch is a mapping of
            tensors that includes ``'labels'`` and is passed to the model as
            keyword arguments.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device every batch tensor is moved to.
        max_batches: Optional cap on the number of batches processed. A falsy
            value (``None`` or ``0``) means no cap.

    Returns:
        ``(predictions, true_labels, avg_epoch_loss)``: two flat lists of
        class indices and the mean per-batch loss. The loss is ``nan`` when
        no batch was processed.
    """
    global model
    model.train()
    predictions = []
    true_labels = []
    total_loss = 0.0
    batch_count = 0

    # Make the progress bar reflect the cap rather than the full loader.
    expected_batches = len(dataloader)
    if max_batches:
        expected_batches = min(expected_batches, max_batches)

    for batch in tqdm(dataloader, total=expected_batches):
        # Read the labels before the batch is moved to ``device``.
        batch_true_labels = batch['labels'].numpy().flatten().tolist()
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        # With labels supplied, the first two outputs are (loss, logits).
        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        predictions += logits.argmax(dim=-1).flatten().tolist()
        true_labels += batch_true_labels

        batch_count += 1
        if max_batches and batch_count >= max_batches:
            break

    # An empty dataloader used to raise ZeroDivisionError here.
    avg_epoch_loss = total_loss / batch_count if batch_count else float('nan')

    return predictions, true_labels, avg_epoch_loss

def validate(dataloader, device, max_batches=None):
    """Evaluate the global ``model`` on ``dataloader`` without updating weights.

    Args:
        dataloader: Sized iterable of batches. Each batch is a mapping of
            tensors that includes ``'labels'`` and is passed to the model as
            keyword arguments.
        device: Device every batch tensor is moved to.
        max_batches: Optional cap on the number of batches processed. A falsy
            value (``None`` or ``0``) means no cap.

    Returns:
        ``(predictions, true_labels, avg_epoch_loss)``: two flat lists of
        class indices and the mean per-batch loss. The loss is ``nan`` when
        no batch was processed.
    """
    global model
    model.eval()
    predictions = []
    true_labels = []
    total_loss = 0.0
    batch_count = 0

    # Make the progress bar reflect the cap rather than the full loader.
    expected_batches = len(dataloader)
    if max_batches:
        expected_batches = min(expected_batches, max_batches)

    for batch in tqdm(dataloader, total=expected_batches):
        # Read the labels before the batch is moved to ``device``.
        batch_true_labels = batch['labels'].numpy().flatten().tolist()
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)
            # With labels supplied, the first two outputs are (loss, logits).
            loss, logits = outputs[:2]
            total_loss += loss.item()

        predictions += logits.argmax(dim=-1).flatten().tolist()
        true_labels += batch_true_labels

        batch_count += 1
        if max_batches and batch_count >= max_batches:
            break

    # An empty dataloader used to raise ZeroDivisionError here.
    avg_epoch_loss = total_loss / batch_count if batch_count else float('nan')

    return predictions, true_labels, avg_epoch_loss

def predict(dataloader, device):
    """Return the predicted class index for every example in ``dataloader``.

    Any ``'labels'`` entry in a batch is ignored, so no loss is computed and
    batches without labels work as well.

    Args:
        dataloader: Sized iterable of batches, each a mapping of tensors
            passed to the global ``model`` as keyword arguments.
        device: Device every batch tensor is moved to.

    Returns:
        A flat list of predicted class indices in dataloader order.
    """
    global model
    model.eval()
    predictions_labels = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Drop labels: the old ``_, logits = outputs[:2]`` was only correct
        # when the model also returned a loss as its first output.
        inputs = {
            k: v.type(torch.long).to(device)
            for k, v in batch.items()
            if k != 'labels'
        }
        with torch.no_grad():
            outputs = model(**inputs)
        # Without labels, the logits are the first output.
        logits = outputs[0]
        predictions_labels += logits.argmax(dim=-1).flatten().tolist()
    return predictions_labels

In [7]:
# Configuration: the pretrained 'gpt2' settings with a `num_labels`-way classification head.
print('Setting config...')
model_config = GPT2Config.from_pretrained('gpt2', num_labels=num_labels)

# Tokenizer: reuse the end-of-sequence token for padding and pad on the left.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

# Model: pretrained weights plus the classification head described by the config above.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=model_config)
# Match the embedding table to the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# Tell the model which token id is used as padding.
model.config.pad_token_id = tokenizer.eos_token_id
model.to(device)

Setting config...
Loading tokenizer...
Loading model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=7, bias=False)
)

In [8]:
gpt2_collator = GPT2_collator(tokenizer, max_length=max_length)

print('Creating datasets...')

# np.argmax(..., axis=1) keeps a single class index per row (the first
# maximum). This assumes each row of 'labels' is a per-class indicator
# vector; rows with several positives lose all but the first.
# TODO confirm that single-label training is intended.
train_processed = preprocess(ds['java_train']['combo'], np.argmax(ds['java_train']['labels'], axis=1))
train_dataloader = torch.utils.data.DataLoader(train_processed, batch_size=batch_size, shuffle=True, collate_fn=gpt2_collator)


eval_processed = preprocess(ds['java_test']['combo'], np.argmax(ds['java_test']['labels'], axis=1))
# Evaluation keeps a fixed order. With shuffling, a `max_batches` cap would
# score a different random subset on every epoch.
eval_dataloader = torch.utils.data.DataLoader(eval_processed, batch_size=batch_size, shuffle=False, collate_fn=gpt2_collator)

Creating datasets...


  0%|          | 0/7614 [00:00<?, ?it/s]

  0%|          | 0/1725 [00:00<?, ?it/s]

In [9]:
# Optional cap on batches per epoch for quick smoke runs (None = full pass).
max_batches = None

# Use PyTorch's AdamW. The `AdamW` class exported by `transformers` is
# deprecated in favour of this one, so that import is no longer needed here.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.01)

# The scheduler is stepped once per processed batch, so the schedule length
# has to honour the cap for the learning rate to decay fully to zero.
steps_per_epoch = min(len(train_dataloader), max_batches) if max_batches else len(train_dataloader)
total_steps = steps_per_epoch * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Per-epoch history.
loss = []
accuracy = []
eval_loss_list = []
eval_accuracy_list = []

for epoch in tqdm(range(num_epochs)):
    # `train` returns (predictions, gold labels, mean loss); `train_labels` holds the predictions.
    train_labels, true_labels, train_loss = train(train_dataloader, optimizer, scheduler, device, max_batches=max_batches)
    train_acc = accuracy_score(true_labels, train_labels)
    print(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss}, Train Accuracy: {train_acc}')
    loss.append(train_loss)
    accuracy.append(train_acc)

    # Same return layout for `validate`; `eval_labels` holds the predictions.
    eval_labels, true_labels, eval_loss = validate(eval_dataloader, device, max_batches=max_batches)
    eval_acc = accuracy_score(true_labels, eval_labels)
    print(f'Epoch {epoch+1}/{num_epochs} - Eval Loss: {eval_loss}, Eval Accuracy: {eval_acc}')
    eval_loss_list.append(eval_loss)
    eval_accuracy_list.append(eval_acc)
    



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/952 [00:00<?, ?it/s]

Epoch 1/1 - Train Loss: 0.6668430192262998, Train Accuracy: 0.7949829261885999


  0%|          | 0/216 [00:00<?, ?it/s]

Epoch 1/1 - Eval Loss: 0.6503709780949133, Eval Accuracy: 0.8156521739130435


In [10]:
# Save the fine-tuned model and its tokenizer into the same directory.
# The second call is the cell's last expression, so its return value (the
# tuple of written tokenizer file paths) is shown as the cell output.
model.save_pretrained('./models/gpt2-java')
tokenizer.save_pretrained('./models/gpt2-java')

('./models/gpt2-java\\tokenizer_config.json',
 './models/gpt2-java\\special_tokens_map.json',
 './models/gpt2-java\\vocab.json',
 './models/gpt2-java\\merges.txt',
 './models/gpt2-java\\added_tokens.json')