# Training and Testing with GPT-2


In [1]:
import pandas as pd
# Load the labelled training contributions from the CSV file.
data=pd.read_csv('/content/training_contributions.csv')
# Lower-case the column names so the rest of the notebook can use 'text' / 'label'.
data=data.rename(columns={'Text': 'text', 'Label': 'label'})

In [2]:

import numpy as np
import pandas as pd
import re
import string
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          AdamW,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification)

# Alias, not a copy: `dataset` and `data` refer to the same DataFrame object.
dataset = data


In [3]:
from sklearn.preprocessing import LabelEncoder

# Encode the values of the 'label' column as integer class ids, overwriting
# the column in place on `dataset`.
encoder = LabelEncoder()
dataset['label'] = encoder.fit_transform(dataset['label'])

In [4]:
# Show the frame after label encoding as a quick sanity check.
print(dataset)

                                                   text  label
0     propose combinatorial visual analytics system ...      3
1     performed transcriptome analysis glomerular ti...      4
2     conducted vitro vivo experiment investigate ro...      5
3     report accumulation actinoplanic acid novel de...      5
4     identified apl cluster bacterial genome withou...      5
...                                                 ...    ...
5020  applied real world data bult simple questionan...      1
5021  paper present firstever result applying statis...      1
5022  employed two model one extracted adapted bbns ...      1
5023  data produced system used several task trainin...      1
5024           estimating coverage haridcrafted grammar      1

[5025 rows x 2 columns]


# Divide dataset into train and validation sets

In [5]:

# Keep the full encoded frame available under `dataset_copy`.
dataset_copy = dataset.copy()

# Training split: a random 80% of the rows, reproducible through the fixed seed.
dataset = dataset_copy.sample(frac=0.80, random_state=0)
# Validation split: every row whose index was not drawn for training.
val_dataset = dataset_copy.drop(dataset.index)

# Definition of Parameters

In [6]:

max_len = None # Max length of the text for input (forwarded to the tokenizer as max_length)
batch_size = 32  # Number of examples per batch for every DataLoader in this notebook
epochs = 10  # Number of full passes over the training split
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Functions and classes

In [7]:
# Dataset creator for Pytorch
class DatasetCreator(Dataset):
    """Expose the rows of a DataFrame as ``{'text', 'label'}`` dictionaries.

    Args:
        processed_data: DataFrame with a 'text' column and, when ``train`` is
            truthy, a 'label' column.
        train: When truthy the row's own label is returned; otherwise the
            label is the placeholder ``0``.
    """

    def __init__(self, processed_data, train):
        self.data = processed_data
        self.train = train

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the example stored at positional ``index``."""
        row = self.data.iloc[index]
        return {
            'text': row['text'],
            # Unlabelled data gets a dummy label so the item layout stays the same.
            'label': row['label'] if self.train else 0,
        }

# Class to tokenize and process the text for input to the dataloader
class GPT2_collator(object):
    """Collate function that tokenizes a batch of examples and adds its labels.

    Args:
        tokenizer: Callable tokenizer; it is invoked with the list of texts
            and must return a mapping that supports ``update``.
        max_seq_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, tokenizer, max_seq_len=None):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, sequences):
        """Turn a list of ``{'text', 'label'}`` items into one model input batch."""
        texts = []
        labels = []
        for item in sequences:
            texts.append(item['text'])
            labels.append(int(item['label']))

        batch = self.tokenizer(
            text=texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_seq_len,
        )
        # The labels travel with the encoded inputs under the 'labels' key.
        batch.update({'labels': torch.tensor(labels)})
        return batch


def train(dataloader, optimizer, scheduler, device):
    """Run one training epoch with the module-level ``model``.

    Args:
        dataloader: Iterable of batches (dicts of tensors with a 'labels' entry).
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(predicted labels, true labels, mean loss per batch)``.
    """
    global model
    model.train()
    epoch_preds, epoch_targets = [], []
    running_loss = 0

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Record the targets before the batch is moved to the device.
        epoch_targets.extend(batch['labels'].cpu().numpy().flatten().tolist())
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss, logits = outputs[:2]
        running_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        batch_preds = logits.argmax(axis=-1).cpu().numpy().flatten().tolist()
        epoch_preds.extend(batch_preds)
        torch.cuda.empty_cache()  # Clear cache to manage memory

    return epoch_preds, epoch_targets, running_loss / len(dataloader)


# Function for validation
def validate(dataloader, device):
    """Evaluate the module-level ``model`` on ``dataloader`` without gradient tracking.

    Args:
        dataloader: Iterable of batches (dicts of tensors with a 'labels' entry).
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(predicted labels, true labels, mean loss per batch)``.
    """
    global model
    model.eval()
    predicted, expected = [], []
    loss_sum = 0

    for batch in tqdm(dataloader, total=len(dataloader)):
        expected.extend(batch['labels'].numpy().flatten().tolist())
        # Every tensor is cast to long before being moved to the device.
        batch = {key: value.type(torch.long).to(device) for key, value in batch.items()}
        with torch.no_grad():
            loss, logits = model(**batch)[:2]
            loss_sum += loss.item()
            predicted.extend(logits.argmax(axis=-1).flatten().tolist())

    return predicted, expected, loss_sum / len(dataloader)

def predict(dataloader, device):
    """Return the predicted class id for every example yielded by ``dataloader``.

    Uses the module-level ``model`` in evaluation mode with gradients disabled.

    Args:
        dataloader: Iterable of batches (dicts of tensors).
        device: Device every batch tensor is moved to.

    Returns:
        Flat list of predicted class ids, in the order the batches were produced.
    """
    global model
    model.eval()
    predicted = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Every tensor is cast to long before being moved to the device.
        batch = {key: value.type(torch.long).to(device) for key, value in batch.items()}
        with torch.no_grad():
            # Only the logits are used; the first element is ignored.
            _ignored, logits = model(**batch)[:2]
            predicted.extend(logits.argmax(axis=-1).flatten().tolist())

    return predicted

# Load Model and Tokenizer

In [8]:
import os
# NOTE(review): presumably a debugging aid for CUDA errors — confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
print('Loading gpt-2 model')
# Configuration of the pretrained 'gpt2' checkpoint with a 6-class output head.
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path='gpt2', num_labels=6)

print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')
# Pad on the left and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path='gpt2', config=model_config)
# Match the embedding size to the tokenizer vocabulary and tell the model
# that the end-of-sequence id doubles as the padding id.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.to(device)

Loading gpt-2 model
Loading tokenizer...
Loading model...


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=6, bias=False)
)

# Prepare dataloader

In [9]:
# Collator that tokenizes each batch of raw texts and attaches the labels tensor.
gpt2_collator = GPT2_collator(tokenizer=tokenizer, max_seq_len=max_len)


# train=True makes the dataset return the real label of every row.
train_data = DatasetCreator(dataset, train=True)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=gpt2_collator)

# The validation split is labelled too, so it also uses train=True.
val_data = DatasetCreator(val_dataset, train=True)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=True, collate_fn=gpt2_collator)

# Train and validate model

In [10]:
# AdamW optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr = 5e-5, eps = 1e-8, weight_decay=0.01)
# train() steps the scheduler once per batch, so the schedule spans every
# batch of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
# Per-epoch history of training and validation loss / accuracy.
loss = []
accuracy = []
val_loss_list = []
val_accuracy_list = []

for epoch in tqdm(range(epochs)):
    # One pass over the training split: predictions, targets and mean batch loss.
    train_labels, true_labels, train_loss = train(train_dataloader, optimizer, scheduler, device)
    train_acc = accuracy_score(true_labels, train_labels)
    print('epoch: %.2f train accuracy %.2f' % (epoch, train_acc))
    loss.append(train_loss)
    accuracy.append(train_acc)

    # Evaluate on the held-out validation split after every epoch.
    val_labels, val_true_labels, val_loss = validate(val_dataloader, device)
    val_acc= accuracy_score(val_true_labels, val_labels)
    print('epoch: %.2f validation accuracy %.2f' % (epoch, val_acc))
    val_loss_list.append(val_loss)
    val_accuracy_list.append(val_acc)



  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 0.00 train accuracy 0.32


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 0.00 validation accuracy 0.46


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 1.00 train accuracy 0.52


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 1.00 validation accuracy 0.50


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 2.00 train accuracy 0.56


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 2.00 validation accuracy 0.57


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 3.00 train accuracy 0.61


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 3.00 validation accuracy 0.57


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 4.00 train accuracy 0.65


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 4.00 validation accuracy 0.56


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 5.00 train accuracy 0.67


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 5.00 validation accuracy 0.57


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 6.00 train accuracy 0.70


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 6.00 validation accuracy 0.57


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 7.00 train accuracy 0.73


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 7.00 validation accuracy 0.57


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 8.00 train accuracy 0.75


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 8.00 validation accuracy 0.56


  0%|          | 0/126 [00:00<?, ?it/s]

epoch: 9.00 train accuracy 0.77


  0%|          | 0/32 [00:00<?, ?it/s]

epoch: 9.00 validation accuracy 0.56


In [11]:

# Load the unlabelled test set and give the text column the name the
# dataset class expects.
test = pd.read_csv('/content/testing_contributions.csv')
test = test.rename(columns={'Text': 'text'})
# Keep the result of reset_index (the bare call returns a new frame and was a
# no-op) so the frame has a clean 0..n-1 index that lines up with the
# positional predictions assigned below.
test = test.reset_index(drop=True)
test_dataset = DatasetCreator(test, train=False)
# shuffle must stay off: predictions are written back by position, so the
# batches have to arrive in the same order as the rows of `test`. Shuffling
# here would attach each prediction to a different, random row.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_collator)

# `submission` is the same object as `test`; the 'target' column is added to it.
submission = test
submission['target'] = pd.Series(predict(test_dataloader, device))
# Map the integer class ids back to the category names.
replace_dict={ 0:'Algorithms/ Methods Construction or Optimization', 1:'Applications',2:'Dataset Creation or Resources',3:'Model Construction or Optimization',
              4:'Performance Evaluation',5:'Theory Proposal'}
submission['target'] = submission['target'].map(replace_dict)
submission.to_csv('submission.csv', index=False)

  0%|          | 0/94 [00:00<?, ?it/s]

In this project, we undertook the task of fine-tuning the SciBERT model for a specific text classification objective. The initial steps involved loading the necessary SciBERT tokenizer and model using the Hugging Face Transformers library. Subsequently, the dataset, comprising textual information, underwent tokenization and padding to a standardized maximum length of 512 tokens. To facilitate model training, the data was partitioned into training and testing sets with an 80-20 split ratio, followed by encoding categorical labels into numerical format using the LabelEncoder. PyTorch DataLoader objects were then instantiated to enable efficient batch processing during both the training and testing phases.
The fine-tuning process employed the AdamW optimizer with a learning rate of 2e-5 over a span of three training epochs. To enhance the model's stability during training, a linear learning rate scheduler was implemented with zero warm-up steps. Additionally, a gradient clipping technique was applied to address potential issues related to exploding gradients. The training loop closely monitored the average training loss after each epoch. Validation on a separate dataset was performed at the end of each epoch, enabling the identification and retention of the best-performing model based on the highest validation accuracy.
For the subsequent evaluation on the test set, the trained model was utilized to generate predictions, which were then compared with the ground truth labels. The resulting accuracy on the test set was found to be approximately 58.11%. These outcomes offer insights into the model's proficiency in handling the specific text classification task under consideration.
Key parameters that played a crucial role in the experiment included a maximum token length of 512, an AdamW optimizer with a learning rate of 2e-5, and a batch size of 8. The training regimen consisted of three epochs, and a linear learning rate scheduler was employed. The application of gradient clipping during training aimed to mitigate potential challenges associated with gradient explosions.
