In [1]:
import pandas as pd
from datasets import Dataset
import pickle
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM
import torch
import numpy as np
import torch.nn as nn


import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

pl.seed_everything(42)

import warnings
warnings.filterwarnings("ignore")

from evaluate import load
sari = load("sari")

from readability import Readability

Seed set to 42


In [2]:
data_location = './data/wikilarge/'
#training_args = TrainingArguments("test=trainer", evaluation_strategy="epoch")#TrainingArguments(output_dir=f"{data_location}training_args")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

In [3]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy='steps',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=torch.cuda.is_available(),
    output_dir='./out',
    logging_steps=100,
    save_steps=3000,
    eval_steps=10000,
    warmup_steps=2000,
    gradient_accumulation_steps=1,
    save_total_limit=3,
    include_inputs_for_metrics = True,
)


In [4]:
grade_ratio = pd.read_csv(f'{data_location}grade_ratio_wiki_train.csv')
# source texts
with open(f'{data_location}wiki_train.src', 'r', encoding='utf-8') as f:
    train_src = f.readlines()
train_src = pd.DataFrame(train_src, columns=['source'])
# target texts
with open(f'{data_location}wiki_train.tgt', 'r', encoding='utf-8') as f:
    train_tgt = f.readlines()
train_tgt = pd.DataFrame(train_tgt, columns=['target'])
train_texts = pd.concat([train_src, grade_ratio['abs_src_FKGL_Grade'], train_tgt, grade_ratio['abs_tgt_FKGL_Grade']], axis=1)
train_texts.rename(columns={'abs_src_FKGL_Grade': 'source_grade', 'abs_tgt_FKGL_Grade': 'target_grade'}, inplace=True)
train_texts['souce'] = train_texts['source'].replace(r'\n',' ', regex=True)
train_texts['target'] = train_texts['target'].replace(r'\n',' ', regex=True)

In [5]:
train_texts = pd.read_pickle(f'{data_location}train_texts.pkl')
train_texts.iloc[0]['source']

'Heinrich Luitpold Himmler (7 October 1900 - 23 May 1945) was Chief of the German Police and Minister of the Interior.\n'

In [6]:
grade_groups = train_texts.groupby(['target_grade'])
grade_groups.get_group(0)

Unnamed: 0,source,source_grade,target,target_grade
5,"Though founded in 1887, under Jack Hyles' lead...",13,Dr. Jack Hyles\n,0
121,"On January 27, 2008, at the NHL All-Star Game ...",7,Records\n,0
130,Gone the times when nations battled for this' ...,8,gone the days when strife and discord.\n,0
152,May 17 & ndash; The conflict between Toyotomi ...,10,Ghent falls to the Spanish.\n,0
172,Some subjects that are discussed have criminal...,11,(see and).\n,0
...,...,...,...,...
216813,"Dubnium (,) is a chemical element with the sym...",10,It has the symbol Db.\n,0
216828,WWE Hell in a Cell is a professional wrestling...,13,Hell\n,0
216833,He died of a heart attack in 1968 and was hono...,8,He died from a heart attack in 1968.\n,0
216839,"In English, the name is sometimes spelled Bela...",10,-)\n,0


In [7]:
# create custom dataset where each grade group is a separate dataset, including source, target, and target grade
datasets = {}
for i, (grade, group) in enumerate(grade_groups):
    datasets[i] = Dataset.from_pandas(group[['source', 'target', 'target_grade']]).train_test_split(test_size=0.2)

datasets[6]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__'],
        num_rows: 800
    })
    test: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__'],
        num_rows: 200
    })
})

In [8]:
datasets[6]['train'][0]

{'source': 'Thus, taekwondo may be loosely translated as "the art of the foot and fist" or "the art of kicking and punching."\n',
 'target': 'Taekwondo can therefore be translated as "way of the foot and of the fist".\n',
 'target_grade': 6,
 '__index_level_0__': 6800}

In [9]:
def tokenize_function(examples):
    return tokenizer(text=examples["source"], text_target=examples['target'], padding="max_length", max_length=64, truncation=True, return_tensors="pt")

tokenized_dataset_6 = datasets[6].map(tokenize_function, batched=True)
tokenized_dataset_6

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
for feature in tokenized_dataset_6['train'][1]:
    if feature not in {'attention_mask', 'target_grade', 'text', 'text_target'}:
        print([tokenized_dataset_6['train'][1][feature]])

['The female alternatives are Khatun and Khanum.\n']
['The female alternative are Khatun and Khanum.\n']
[4463]
[[464, 4048, 14693, 389, 509, 5183, 403, 290, 11356, 388, 13, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]
[[464, 4048, 5559, 389, 509, 5183, 403, 290, 11356, 388, 13, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]


In [12]:
def compute_metrics(prediction):
    source_ids = prediction.input_ids
    labels_ids = prediction.label_ids
    pred_ids = prediction.predictions
    

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    source_str = tokenizer.batch_decode(source_ids, skip_special_tokens=True)

    return sari.compute(sources=source_str, references=label_str, predictions=pred_str)



trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_6['train'],
    eval_dataset=tokenized_dataset_6['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


trainer.train()
trainer.save_model('./models/gpt2_wikilarge_6')

KeyboardInterrupt: 

In [28]:
input_text = '''
Quantum mechanics is a fundamental theory in physics that provides a description of the physical properties 
of nature at the scale of atoms and subatomic particles. It is the foundation of all quantum physics including 
quantum chemistry, quantum field theory, quantum technology, and quantum information science
'''

trained_model = AutoModelForCausalLM.from_pretrained('./models/gpt2_wikilarge_6')
inputs = tokenizer([input_text], padding='max_length',
                       max_length=60, truncation=True, return_tensors='pt')

trained_model.config.decoder_start_token_id = tokenizer.cls_token_id
trained_model.config.eos_token_id = tokenizer.sep_token_id
trained_model.config.pad_token_id = tokenizer.pad_token_id
#trained_model.config.vocab_size = model.config.encoder.vocab_size

output = trained_model.generate(inputs['input_ids'],
                        max_length=100,
                        min_length=30,
                        num_beams=1,
                        length_penalty=0.8,
                        temperature=1.0,
                        early_stopping=True,
                        top_k=50,
                        do_sample=False)

text = tokenizer.batch_decode(output, skip_special_tokens=True)
print(input_text)
print(text[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



Quantum mechanics is a fundamental theory in physics that provides a description of the physical properties 
of nature at the scale of atoms and subatomic particles. It is the foundation of all quantum physics including 
quantum chemistry, quantum field theory, quantum technology, and quantum information science


Quantum mechanics is a fundamental theory in physics that provides a description of the physical properties 
of nature at the scale of atoms and subatomic particles. It is the foundation of all quantum physics including 
quantum chemistry, quantum field theory, quantum technology, and quantum information science



In [24]:
r = Readability(text[0]*3)

r.flesch_kincaid().score

20.093913043478263