In [20]:
import numpy as np
import pandas as pd
import json
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, classification_report
from sklearn.preprocessing import label_binarize
import os

In [2]:
# Read JSONL files
def read_jsonl(file_path):
    records = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Read datasets
train_df = read_jsonl('../input/scicite/train.jsonl')
dev_df = read_jsonl('../input/scicite/dev.jsonl')
test_df = read_jsonl('../input/scicite/test.jsonl')

In [3]:
# Define the mapping of labels to integers
label_to_int = {
    'background': 0,
    'method': 1,
    'result': 2
}

# Convert labels to integers
train_df['label'] = train_df['label'].map(label_to_int)
dev_df['label'] = dev_df['label'].map(label_to_int)
test_df['label'] = test_df['label'].map(label_to_int)

In [4]:
# Instantiate the configuration for model
configuration = GPT2Config()

# Set up tokenizer and  pad token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Instantiate the model
model = GPT2ForSequenceClassification(configuration).from_pretrained('gpt2', num_labels=3)

# Set the pad token of the model's configuration
model.config.pad_token_id = model.config.eos_token_id

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Encode datasets
class CitationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare datasets
def encode_data(tokenizer, texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
    return CitationDataset(encodings, labels)

train_dataset = encode_data(tokenizer, train_df['string'].tolist(), train_df['label'].tolist())
dev_dataset = encode_data(tokenizer, dev_df['string'].tolist(), dev_df['label'].tolist())
test_dataset = encode_data(tokenizer, test_df['string'].tolist(), test_df['label'].tolist())

In [6]:
try:
    from accelerate import Accelerator
    print("Accelerate is installed and can be imported.")
except ImportError as e:
    print("Accelerate is not installed or there's an issue with importing it.")

Accelerate is installed and can be imported.


In [21]:
# Define training parameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

In [22]:
# Set environment variable for PyTorch
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "max_split_size_mb:50"

In [23]:
# Train
trainer.train()



Step,Training Loss
50,8.0603
100,1.3214
150,0.7306
200,0.5678
250,0.4671
300,0.4009
350,0.3935
400,0.3553
450,0.299
500,0.3446




TrainOutput(global_step=645, training_loss=1.0598640929820926, metrics={'train_runtime': 2382.4702, 'train_samples_per_second': 17.299, 'train_steps_per_second': 0.271, 'total_flos': 1.076944281403392e+16, 'train_loss': 1.0598640929820926, 'epoch': 5.0})

In [24]:
# Predict
predictions = trainer.predict(test_dataset)

# Extract predicted results and true labels
preds = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids
n_classes = np.max(true_labels) + 1
true_labels_binary = label_binarize(true_labels, classes=range(n_classes))

In [25]:
# Compute f1 score, accuracy, precision, recall and auc
f1 = f1_score(true_labels, preds, average='weighted')
accuracy = accuracy_score(true_labels, preds)
precision = precision_score(true_labels, preds, average='weighted')
recall = recall_score(true_labels, preds, average='weighted')
auc = roc_auc_score(true_labels_binary, predictions.predictions, multi_class='ovr')

print(f'F1 Score: {f1}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'AUC (One-vs-Rest): {auc}')

F1 Score: 0.8630910607629326
Accuracy: 0.8613648576034391
Precision: 0.8689248386842947
Recall: 0.8613648576034391
AUC (One-vs-Rest): 0.9491705777414831


In [26]:
print('classification_report\n',classification_report(true_labels, preds))

classification_report
               precision    recall  f1-score   support

           0       0.90      0.86      0.88       997
           1       0.89      0.85      0.87       605
           2       0.70      0.90      0.79       259

    accuracy                           0.86      1861
   macro avg       0.83      0.87      0.84      1861
weighted avg       0.87      0.86      0.86      1861

