## NOTES:
[GPT-2 Classification Getting Started](https://www.kaggle.com/code/andres6garzon/getting-started-nlp-classification-using-gpt-2)

[GPT-2 (Medium)](https://huggingface.co/openai-community/gpt2-medium)

In [1]:
import pandas as pd
import time
from datasets import Dataset, load_dataset
import numpy as np
import torch
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, classification_report
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)
import re

Using TensorFlow backend.





In [2]:
langs = ['java', 'python', 'pharo']
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')

In [3]:
ds['java_train'][0]

{'index': 0,
 'class': 'Abfss.java',
 'comment_sentence': 'azure blob file system implementation of abstractfilesystem.',
 'partition': 0,
 'combo': 'azure blob file system implementation of abstractfilesystem. | Abfss.java',
 'labels': [1, 0, 0, 0, 0, 0, 0]}

In [4]:
truncate_length = 256

def preprocess(dataset):
    # Create dataset object
    output = []
    
    for i in tqdm(range(len(dataset))):
        
        text = dataset[i]['combo']
        
        # remove entirety of html lists
        # text = re.sub(r'<ol>[.\s\S]*?<\/ol>', '', text)
        
        # remove html tags
        text = re.sub(r'<.*?>', '', text)
        
        # remove bullets
        text = re.sub(r'\s\*', '', text)
        
        # remove bulleted lines
        #text = re.sub(r'\n\s*\*.*', '', text)
        
        # remove curly braced sections
        text = re.sub(r'\{.*?\}', '', text)
        
        # remove // comments
        text = re.sub(r'\s*\/\/.*', '', text)
        
        # remove formatting for /* */ comments
        text = re.sub(r'\/\*.|\*\/', '', text)
        
        # remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        
        # truncate
        if (len(text) > truncate_length * 2):
            text = text[:(truncate_length-4)] + ' ... ' + text[-(truncate_length-4):]
        
        output.append({
            'text': text,
            'label': np.argmax(dataset[i]['labels'])
        })
    
    output = pd.DataFrame(output)
    output = Dataset.from_pandas(output)
    
    return output

In [5]:
length = 0
index = -1
i = 0

test_ds = preprocess(ds['java_train'])

for data in test_ds:
    if len(data['text']) > length:
        length = len(data['text'])
        index = i
    i += 1
        
print(f"Old length: {len(ds['java_train'][index]['combo'])}")
print(f"New length: {length}")
print('='*100)
print(ds['java_train'][index]['combo'])
print('='*100)
print(test_ds[index]['text'])

  0%|          | 0/7614 [00:00<?, ?it/s]

Old length: 738
New length: 511
A builder for creating immutable bimap instances, especially {@code public
   * static final} bimaps ("constant bimaps"). Example: <pre>   {@code
   *
   *   static final ImmutableBiMap<String, Integer> WORD_TO_INT =
   *       new ImmutableBiMap.Builder<String, Integer>()
   *           .put("one", 1)
   *           .put("two", 2)
   *           .put("three", 3)
   *           .build();}</pre>
   *
   * <p>For <i>small</i> immutable bimaps, the {@code ImmutableBiMap.of()} methods
   * are even more convenient.
   *
   * <p>Builder instances can be reused - it is safe to call {@link #build}
   * multiple times to build multiple bimaps in series. Each bimap is a superset
   * of the bimaps created before it. | ImmutableBiMap.java
A builder for creating immutable bimap instances, especially {@code public static final} bimaps ("constant bimaps"). Example: {@code static final ImmutableBiMap WORD_TO_INT = new ImmutableBiMap.Builder() .put("one", 1) .put("two"

In [6]:
def evaluate_model(model):
    total_flops = 0
    total_time = 0
    scores = []
    for lan in langs:
        with torch.profiler.profile(with_flops=True) as p:
            begin = time.time()
            for i in range(10):
                y_pred = model(ds[f'{lan}_test']['combo']).numpy().T
            total = time.time() - begin
            total_time = total_time + total
        total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9)
        y_true = np.array(ds[f'{lan}_test']['labels']).T
        for i in range(len(y_pred)):
            assert(len(y_pred[i]) == len(y_true[i]))
            tp = sum([true == pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
            tn = sum([true == pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
            fp = sum([true == 0 and pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
            fn = sum([true == 1 and pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = (2*tp) / (2*tp + fp + fn)
            scores.append({'lan': lan, 'cat': labels[lan][i],'precision': precision,'recall': recall,'f1': f1})
    print("Compute in GFLOPs:", total_flops/10)
    print("Avg runtime in seconds:", total_time/10)
    scores = pd.DataFrame(scores)
    return scores

## Getting Started

In [7]:
# Params
seed = 42
batch_size = 4
max_length = 512
num_labels = 7
num_epochs = 1

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [9]:
class DatasetCreator(Dataset):
    def __init__(self, ds, train):
        self.ds = ds
        self.train = train
    
    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, idx):
        if self.train:
            return {'text': self.ds[idx]['text'], 'label': self.ds[idx]['label']}
        else:
            return {'text': self.ds[idx]['text'], 'label': 0}

class GPT2_collator(object):
    def __init__(self, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        labels = [sequence['label'] for sequence in sequences]
        
        inputs = self.tokenizer(text=texts, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        inputs.update({'labels': torch.tensor(labels)})
        
        return inputs
# =================================================================================================================================
def train(dataloader, optimizer, scheduler, device, max_batches=None):
    global model
    model.train()
    predictions = []
    true_labels = []
    total_loss = 0
    batch_count = 0
    
    for batch in tqdm(dataloader, total=len(dataloader)):
        batch_true_labels = batch['labels'].numpy().flatten().tolist()
        batch = {k:v.to(device) for k,v in batch.items()}
        
        optimizer.zero_grad()
        outputs = model(**batch)
        loss, logits = outputs[:2]
        total_loss += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        batch_predictions = logits.argmax(axis=-1).flatten().tolist()
        
        predictions += batch_predictions
        true_labels += batch_true_labels
        
        batch_count += 1
        if max_batches and batch_count >= max_batches:
            break
    
    avg_epoch_loss = total_loss / batch_count
    
    return predictions, true_labels, avg_epoch_loss

def validate(dataloader, device, max_batches=None):
    global model
    model.eval()
    predictions = []
    true_labels = []
    total_loss = 0
    batch_count = 0
    
    for batch in tqdm(dataloader, total=len(dataloader)):
        batch_true_labels = batch['labels'].numpy().flatten().tolist()
        batch = {k:v.to(device) for k,v in batch.items()}
        
        with torch.no_grad():
            outputs = model(**batch)
            loss, logits = outputs[:2]
            total_loss += loss.item()
        
        batch_predictions = logits.argmax(axis=-1).flatten().tolist()
        
        predictions += batch_predictions
        true_labels += batch_true_labels
        
        batch_count += 1
        if max_batches and batch_count >= max_batches:
            break
    
    avg_epoch_loss = total_loss / batch_count
    
    return predictions, true_labels, avg_epoch_loss

def predict(dataloader, device):
    global model
    model.eval()
    predictions_labels = []
    
    for batch in tqdm(dataloader, total=len(dataloader)):
        batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            logits = outputs[0]
            predictions_labels += logits.argmax(axis=-1).flatten().tolist()
    return predictions_labels

In [10]:
print('Setting config...')
model_config = GPT2Config.from_pretrained('gpt2', num_labels=num_labels)

print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=model_config)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.eos_token_id
model.to(device)

Setting config...
Loading tokenizer...
Loading model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=7, bias=False)
)

In [11]:
gpt2_collator = GPT2_collator(tokenizer, max_length=max_length)

print('Creating datasets...')

train_dataset = DatasetCreator(preprocess(ds['java_train']), True)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_collator)

eval_dataset = DatasetCreator(preprocess(ds['java_test']), True)
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_collator)

Creating datasets...


  0%|          | 0/7614 [00:00<?, ?it/s]

  0%|          | 0/1725 [00:00<?, ?it/s]

In [12]:
optimizer = AdamW(model.parameters(), lr = 5e-5, eps = 1e-8, weight_decay=0.01)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

loss = []
accuracy = []
eval_loss_list = []
eval_accuracy_list = []

max_batches = 100

for epoch in tqdm(range(num_epochs)):
    train_labels, true_labels, train_loss = train(train_dataloader, optimizer, scheduler, device, max_batches=max_batches)
    train_acc = accuracy_score(true_labels, train_labels)
    print(f'Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss}, Train Accuracy: {train_acc}')
    loss.append(train_loss)
    accuracy.append(train_acc)
    
    eval_labels, true_labels, eval_loss = validate(eval_dataloader, device, max_batches=max_batches)
    print(eval_labels)
    eval_acc = accuracy_score(true_labels, eval_labels)
    print(f'Epoch {epoch+1}/{num_epochs} - Eval Loss: {eval_loss}, Eval Accuracy: {eval_acc}')
    eval_loss_list.append(eval_loss)
    eval_accuracy_list.append(eval_acc)
    



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1904 [00:00<?, ?it/s]

Epoch 1/1 - Train Loss: 1.5365825262665749, Train Accuracy: 0.55


  0%|          | 0/432 [00:00<?, ?it/s]

[0, 0, 3, 3, 0, 3, 3, 3, 0, 0, 3, 3, 0, 3, 3, 3, 0, 3, 3, 0, 3, 0, 3, 0, 3, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 3, 0, 0, 0, 3, 0, 3, 0, 0, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 3, 0, 0, 3, 0, 3, 0, 3, 3, 0, 3, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 3, 0, 0, 0, 0, 3, 0, 3, 3, 3, 0, 3, 0, 0, 3, 3, 0, 0, 0, 3, 3, 3, 0, 0, 0, 3, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 3, 3, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 3, 0, 3, 3, 3, 0, 3, 0, 0, 0, 3, 3, 0, 0, 3, 3, 3, 0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 