# Transformers - fine-tuning GPT-2 on CORD-19 abstracts

### Install Hugging Face's transformers, then import it and the other Python dependencies

In [0]:
!git clone https://github.com/huggingface/transformers

import os
os.chdir('/content/transformers')

!pip install .
!pip install -r ./examples/requirements.txt

os.chdir('/content/transformers/examples')

!pip install dict_to_obj

In [0]:
import torch
import random
import collections
import numpy as np
import run_generation
import tensorflow as tf
import run_language_modeling
from dict_to_obj import DictToObj
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, TFGPT2LMHeadModel

### Download Abstracts and GPT-2, start fine-tuning

In [0]:
!wget -nc -O /content/abstracts_test.txt https://raw.githubusercontent.com/PubChimps/CORD-19/master/abstractstest.txt
!wget -nc -O /content/abstracts_train.txt https://raw.githubusercontent.com/PubChimps/CORD-19/master/abstractstrain.txt

In [0]:
!python run_language_modeling.py \
    --output_dir='./output' \
    --model_type=gpt2 \
    --model_name_or_path=gpt2 \
    --num_train_epochs=1.0 \
    --do_train \
    --evaluate_during_training \
    --train_data_file=/content/abstracts_train.txt \
    --do_eval \
    --eval_data_file=/content/abstracts_test.txt \
    --per_gpu_train_batch_size=2 \
    --per_gpu_eval_batch_size=2 \
    --block_size=512 \
    --gradient_accumulation_steps=5

### Generate Abstracts

In [0]:
# Directory the fine-tuning cell writes to (--output_dir='./output').
# NOTE(review): this loads whatever model was saved at the top level of that
# directory; point MODEL_DIR at a `checkpoint-<step>` sub-directory instead if a
# specific intermediate checkpoint is wanted.
MODEL_DIR = './output'
PROMPT = 'Abstract\n\nCovid-19'
SEED = 0

# Sampling below is stochastic; fix the TensorFlow seed so a re-run of the
# notebook reproduces the same generations.
tf.random.set_seed(SEED)

# The tokenizer has to exist before the model is loaded, because its EOS id is
# passed to the model as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# from_pt=True converts the PyTorch weights saved by the training script into
# the TensorFlow model class.
model = TFGPT2LMHeadModel.from_pretrained(
    MODEL_DIR,
    pad_token_id=tokenizer.eos_token_id,
    from_pt=True,
)

input_ids = tokenizer.encode(PROMPT, return_tensors='tf')

# Top-k / nucleus sampling: three independent continuations of the prompt,
# each at most 150 tokens long.
sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=150,
    top_k=50,
    top_p=0.92,
    num_return_sequences=3,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))