In [1]:
# !pip install datasets
# !pip install pytorch_lightning
# !pip install py-readability-metrics
# !python -m nltk.downloader punkt
# !pip install evaluate
# !pip install sacremoses sacrebleu
# !pip install accelerate



In [29]:
import pandas as pd
from datasets import Dataset
import pickle
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM
import torch
import numpy as np
import torch.nn as nn


import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

pl.seed_everything(42)

import warnings
warnings.filterwarnings("ignore")

from evaluate import load
sari = load("sari")

from readability import Readability

Seed set to 42


In [30]:
# Name of the pretrained checkpoint shared by the tokenizer, model and config below.
model_name = 'gpt2'
# Directory prefix (with trailing slash) used when building data file paths.
data_location = './data/wikilarge/'

# Pick the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so padded tokenization works.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, max_new_tokens=1024)

In [31]:
# Trainer hyper-parameters, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    output_dir='./out',
    # --- batching / precision ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    fp16=torch.cuda.is_available(),  # half precision only when CUDA is available
    # --- schedule ---
    warmup_steps=2000,
    # --- evaluation ---
    evaluation_strategy='epoch',
    eval_steps=10000,  # NOTE(review): presumably unused while the strategy is 'epoch' - confirm
    predict_with_generate=True,
    include_inputs_for_metrics=True,  # compute_metrics also needs the source token ids
    # --- logging / checkpoints ---
    logging_steps=100,
    save_steps=3000,
    save_total_limit=3,
)


In [33]:
# Per-sentence grade information; only the two FKGL grade columns are used below.
grade_ratio = pd.read_csv(f'{data_location}grade_ratio_wiki_train.csv')

# source texts
with open(f'{data_location}wiki_train.src', 'r', encoding='utf-8') as f:
    train_src = pd.DataFrame(f.readlines(), columns=['source'])

# target texts
with open(f'{data_location}wiki_train.tgt', 'r', encoding='utf-8') as f:
    train_tgt = pd.DataFrame(f.readlines(), columns=['target'])

# Put source/target text side by side with their grades (rows are aligned by position).
train_texts = pd.concat(
    [train_src, grade_ratio['abs_src_FKGL_Grade'], train_tgt, grade_ratio['abs_tgt_FKGL_Grade']],
    axis=1,
)
train_texts = train_texts.rename(
    columns={'abs_src_FKGL_Grade': 'source_grade', 'abs_tgt_FKGL_Grade': 'target_grade'}
)

# Replace newlines with spaces in both text columns.
# Fix: the cleaned source text used to be written to a misspelled 'souce' column,
# which left 'source' untouched and added a stray column to the frame.
for text_column in ('source', 'target'):
    train_texts[text_column] = train_texts[text_column].replace(r'\n', ' ', regex=True)

In [6]:
# Load the cached training frame. Note this replaces any `train_texts` built earlier.
# Pickle files can execute code when loaded; only read files you produced yourself.
train_texts_pickle = f'{data_location}train_texts.pkl'
train_texts = pd.read_pickle(train_texts_pickle)
# Peek at the first source sentence.
train_texts['source'].iloc[0]

'Heinrich Luitpold Himmler (7 October 1900 - 23 May 1945) was Chief of the German Police and Minister of the Interior.\n'

In [7]:
# Group by the column name itself rather than a one-element list: with a list
# grouper, recent pandas yields 1-tuple group keys and warns when get_group()
# is given a scalar. A scalar grouper keeps the keys as plain grade values.
grade_groups = train_texts.groupby('target_grade')
# Show the rows whose target grade is 0.
grade_groups.get_group(0)

Unnamed: 0,source,source_grade,target,target_grade
5,"Though founded in 1887, under Jack Hyles' lead...",13,Dr. Jack Hyles\n,0
121,"On January 27, 2008, at the NHL All-Star Game ...",7,Records\n,0
130,Gone the times when nations battled for this' ...,8,gone the days when strife and discord.\n,0
152,May 17 & ndash; The conflict between Toyotomi ...,10,Ghent falls to the Spanish.\n,0
172,Some subjects that are discussed have criminal...,11,(see and).\n,0
...,...,...,...,...
216813,"Dubnium (,) is a chemical element with the sym...",10,It has the symbol Db.\n,0
216828,WWE Hell in a Cell is a professional wrestling...,13,Hell\n,0
216833,He died of a heart attack in 1968 and was hono...,8,He died from a heart attack in 1968.\n,0
216839,"In English, the name is sometimes spelled Bela...",10,-)\n,0


In [8]:
# Build one train/test DatasetDict per target grade, keeping source, target and target grade.
# The dict is keyed by the grade itself (previously by enumeration position, which only
# coincides with the grade while grades are contiguous and start at 0).
datasets = {}
for grade, group in grade_groups:
    # pandas yields a 1-tuple key when the grouper was passed as a list; unwrap it.
    grade_key = grade[0] if isinstance(grade, tuple) else grade
    grade_dataset = Dataset.from_pandas(group[['source', 'target', 'target_grade']])
    # Explicit seed so that re-running this cell reproduces the same split.
    datasets[grade_key] = grade_dataset.train_test_split(test_size=0.2, seed=42)

datasets[6]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__'],
        num_rows: 20313
    })
    test: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__'],
        num_rows: 5079
    })
})

In [9]:
# Inspect the first training example of the dataset stored under key 6.
datasets[6]['train'][0]

{'source': 'Although he died young, he became one of the pioneers of the Romantic movement.\n',
 'target': 'He was one of the people who started the Romantic movement.\n',
 'target_grade': 6,
 '__index_level_0__': 191750}

In [10]:
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of source/target pairs into model inputs and labels.

    Args:
        examples: Batch mapping holding parallel lists of strings under the
            "source" and "target" keys (what ``Dataset.map(..., batched=True)``
            passes in).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The source encoding (``input_ids``, ``attention_mask``) with an extra
        ``labels`` entry: the target token ids, where padded positions are
        replaced by -100.
    """
    length_kwargs = dict(padding="max_length", max_length=max_length, truncation=True)
    model_inputs = tokenizer(text=examples["source"], **length_kwargs)
    target_encoding = tokenizer(text_target=examples["target"], **length_kwargs)

    # Padded label positions are set to -100, the value compute_metrics already
    # treats as "ignore". The attention mask (not the token id) decides what is
    # padding, because this notebook uses the EOS token as the pad token.
    model_inputs["labels"] = [
        [token if keep else -100 for token, keep in zip(token_ids, mask)]
        for token_ids, mask in zip(
            target_encoding["input_ids"], target_encoding["attention_mask"]
        )
    ]
    # NOTE(review): input_ids hold the source and labels hold the target. For a
    # decoder-only model the two are usually one concatenated sequence - confirm
    # this layout is intended.
    return model_inputs

# Run tokenize_function over both splits in batches; the keys it returns are added as new columns.
tokenized_dataset_6 = datasets[6].map(tokenize_function, batched=True)
tokenized_dataset_6

Map:   0%|          | 0/20313 [00:00<?, ? examples/s]

Map:   0%|          | 0/5079 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20313
    })
    test: Dataset({
        features: ['source', 'target', 'target_grade', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5079
    })
})

In [11]:
# Batch collator handed to the trainer below; built around the tokenizer used for preprocessing.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Print every column of one tokenized training row, except the ones listed here.
skipped_columns = {'attention_mask', 'target_grade', 'text', 'text_target'}
preview_row = tokenized_dataset_6['train'][1]
for column_name, column_value in preview_row.items():
    if column_name in skipped_columns:
        continue
    print([column_value])

["During his playing career he played for Cowdenbeath and Heart of Midlothian. He won 16 caps for Scotland and was part of his country's 1990 World Cup squad in Italy.\n"]
['During his playing career he played for Cowdenbeath F.C., Heart of Midlothian and won 16 caps for the Scotland national team.\n']
[180767]
[[7191, 465, 2712, 3451, 339, 2826, 329, 10417, 6559, 1350, 776, 290, 8894, 286, 7215, 75, 849, 666, 13, 679, 1839, 1467, 11022, 329, 8838, 290, 373, 636, 286, 465, 1499, 338, 6303, 2159, 5454, 8244, 287, 8031, 13, 198, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256

In [None]:
def compute_metrics(prediction):
    """Compute SARI for a batch of generated simplifications.

    Args:
        prediction: The evaluation output handed over by the trainer. Either an
            object exposing ``predictions``, ``label_ids`` and ``inputs``
            attributes, or a plain ``(predictions, label_ids, inputs)`` tuple.
            That is also the order in which the trainer's prediction object
            iterates, so the previous ``source, pred, labels`` unpacking mixed
            up all three.

    Returns:
        Whatever ``sari.compute`` returns for the decoded sources, predictions
        and references.

    Raises:
        ValueError: If the source token ids are missing (the trainer only
            provides them when ``include_inputs_for_metrics`` is enabled).
    """
    if hasattr(prediction, "predictions"):
        pred_ids = prediction.predictions
        labels_ids = prediction.label_ids
        source_ids = prediction.inputs
    else:
        pred_ids, labels_ids, source_ids = prediction

    if source_ids is None:
        raise ValueError(
            "compute_metrics needs the source token ids; "
            "enable include_inputs_for_metrics in the training arguments"
        )
    # Some models return a tuple whose first element holds the token ids.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    def _decode(token_ids):
        """Decode ids to text after swapping the -100 filler for the pad id."""
        token_ids = np.asarray(token_ids)
        # np.where builds a new array, so the caller's arrays are not modified.
        token_ids = np.where(token_ids == -100, tokenizer.pad_token_id, token_ids)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    pred_str = _decode(pred_ids)
    label_str = _decode(labels_ids)
    source_str = _decode(source_ids)

    # SARI takes a list of references per prediction; there is one reference each here.
    references = [[reference] for reference in label_str]
    return sari.compute(sources=source_str, predictions=pred_str, references=references)



# Wire the model, data, collator and metric function into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # grade-6 splits produced by the tokenization step
    train_dataset=tokenized_dataset_6['train'],
    eval_dataset=tokenized_dataset_6['test'],
    compute_metrics=compute_metrics,
)


# Run training, then write the resulting model to the directory the inference cell loads from.
trainer.train()
trainer.save_model('./models/gpt2_wikilarge_6')

Epoch,Training Loss,Validation Loss


In [None]:
input_text = '''
Quantum mechanics is a fundamental theory in physics that provides a description of the physical properties
of nature at the scale of atoms and subatomic particles. It is the foundation of all quantum physics including
quantum chemistry, quantum field theory, quantum technology, and quantum information science
'''

# Load the fine-tuned checkpoint written by the training cell.
trained_model = AutoModelForCausalLM.from_pretrained('./models/gpt2_wikilarge_6')

# The GPT-2 tokenizer defines no CLS/SEP tokens, so the previous
# `eos_token_id = tokenizer.sep_token_id` assignment stored None and generation
# had no end-of-sequence id. Use the tokenizer's real EOS/pad ids instead.
trained_model.config.eos_token_id = tokenizer.eos_token_id
trained_model.config.pad_token_id = tokenizer.pad_token_id

# A single prompt needs no padding. Padding it on the right to a fixed length
# would make the model continue from pad tokens rather than from the text.
inputs = tokenizer([input_text], max_length=60, truncation=True, return_tensors='pt')

# Deterministic beam search. `temperature` and `top_k` were dropped because they
# only apply when sampling (do_sample=True).
output = trained_model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=100,
    min_length=30,
    num_beams=4,
    length_penalty=0.8,
    early_stopping=True,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# For a causal LM the returned sequence is the prompt followed by the continuation.
text = tokenizer.batch_decode(output, skip_special_tokens=True)
print(input_text)
print(text[0])

In [None]:
# Download the NLTK "punkt" data via a shell command run from the notebook.
!python -m nltk.downloader punkt
# The decoded text is repeated three times before scoring.
# NOTE(review): presumably to satisfy a minimum-length requirement of the readability library - confirm.
r = Readability(text[0]*3)

# Flesch-Kincaid score of the (repeated) text.
r.flesch_kincaid().score