In [1]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from itertools import chain
import pandas as pd
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# List the contents of the ./data directory (IPython shell escape)
!ls ./data

ai_wikipedia.html   embeddings.csv	   sorted_distances.csv
CVE-2020-29583.txt  finetune_gpt2	   text.csv
distances.csv	    shakespeare_small.txt  xss.txt


In [3]:
# Collect every .txt file found directly inside the data directory
data_path = Path("./data")
file_paths = list(data_path.glob("*.txt"))
file_paths

[PosixPath('data/shakespeare_small.txt'),
 PosixPath('data/CVE-2020-29583.txt'),
 PosixPath('data/xss.txt')]

In [4]:
# Read all the file contents into a list (one string per file, in file_paths order).
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the .txt files are UTF-8 encoded; confirm for new data files.
file_data = [path.read_text(encoding="utf-8") for path in file_paths]

In [5]:
# Print each file's contents followed by a separator line
for contents in file_data:
    print(contents, '===========================', sep='\n')

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
# Build a Dataset with a single "text" column holding one entry per file
text_columns = {"text": file_data}
dataset = Dataset.from_dict(text_columns)

In [7]:
# Display the raw "text" column of the dataset
dataset["text"]

Column(["First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in

In [8]:
# Tokenizer and model are both loaded from the same pretrained GPT-2 checkpoint
checkpoint = 'gpt2'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# No pad token is set on this tokenizer, so reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token

# Load the causal language model
model = AutoModelForCausalLM.from_pretrained(checkpoint)



In [9]:
# Tokenization callback to be applied to the dataset with .map
def tokenizer_function(examples):
    """Tokenize the 'text' entry of a batch with the notebook-level `tokenizer`.

    Args:
        examples: mapping that provides a 'text' entry to tokenize.

    Returns:
        The result of calling `tokenizer` on `examples['text']`.
    """
    return tokenizer(examples['text'])

In [10]:
# Tokenize the whole dataset by mapping tokenizer_function over it in batches
tokenized_dataset = dataset.map(function=tokenizer_function, batched=True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (15178 > 1024). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 3/3 [00:00<00:00, 130.35 examples/s]


In [11]:
# Inspect the columns and row count of the tokenized dataset
tokenized_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 3
})

In [12]:
# Inspect the original (untokenized) dataset for comparison
dataset

Dataset({
    features: ['text'],
    num_rows: 3
})

In [13]:
# Remove the original dataset's column names from the tokenized_dataset
original_columns = dataset.column_names
tokenized_dataset = tokenized_dataset.remove_columns(original_columns)

In [14]:
# Inspect the tokenized dataset after the column removal
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 3
})

In [15]:
# Create a preprocessing function to group the texts together in chunks of
# `block_size` tokens (256 by default)
def group_texts(examples, block_size=256):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Every column in `examples` is flattened into one long list and then cut into
    consecutive chunks of `block_size` items. The trailing remainder that does not
    fill a whole block is dropped, so a batch with fewer than `block_size` tokens
    yields empty lists for every column.

    Args:
        examples: mapping from column name to a list of per-example lists
            (for instance "input_ids" and "attention_mask"). It must contain
            an "input_ids" column.
        block_size: number of items per output block. Defaults to 256.

    Returns:
        dict with the same columns as `examples`, each holding a list of
        `block_size`-long chunks, plus a "labels" column that is a copy of the
        chunked "input_ids".

    Raises:
        ValueError: if `block_size` is not a positive integer.
        KeyError: if `examples` has no "input_ids" column.
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")

    # Concatenate all the texts together for each column
    concatenated_examples = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }

    # Total number of tokens in the batch; "input_ids" is used as the reference
    # column because the labels are derived from it
    total_length = len(concatenated_examples["input_ids"])

    # Drop the small remainder that does not fill a complete block
    total_length = (total_length // block_size) * block_size

    # Split every column into chunks of block_size
    result = {
        key: [tokens[start : start + block_size] for start in range(0, total_length, block_size)]
        for key, tokens in concatenated_examples.items()
    }

    # Set the "labels" equal to the "input_ids" (independent copies of each chunk)
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [16]:
# Regroup the tokenized rows into fixed-size blocks by mapping group_texts in batches.
# NOTE: this rebinds `dataset`, which until now referred to the raw text dataset.
dataset = tokenized_dataset.map(function=group_texts, batched=True)

Map: 100%|██████████| 3/3 [00:00<00:00, 214.33 examples/s]


In [17]:
# Inspect the chunked dataset (columns and number of blocks)
dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 67
})

In [18]:
# Data collator used for training: mlm=False disables masked language modeling,
# and return_tensors="pt" requests PyTorch tensors because the model is PyTorch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    return_tensors="pt",
    mlm=False,
)

In [19]:
# Display the collator's configuration
data_collator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [20]:
# Training configuration: output goes under data/finetune_gpt2 and save_strategy="no"
# is set for checkpoint saving.
# NOTE(review): only the *eval* batch size is set here, yet the Trainer in this
# notebook receives no eval dataset and the training log reports a per-device batch
# size of 8 -- confirm whether per_device_train_batch_size was intended.
training_args = TrainingArguments(
    save_strategy="no",
    per_device_eval_batch_size=1,
    output_dir="data/finetune_gpt2",
)

In [21]:
# Assemble the Trainer from the model, the training arguments, the chunked
# training dataset and the language-modeling data collator
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [22]:
# Run the trainer: fine-tunes `model` on the chunked dataset. As the last
# expression in the cell, the returned TrainOutput is displayed below.
trainer.train()

***** Running training *****
  Num examples = 67
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 27
  Number of trainable parameters = 124439808
  0%|          | 0/27 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 27/27 [00:09<00:00,  3.18it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 27/27 [00:09<00:00,  2.76it/s]

{'train_runtime': 9.7957, 'train_samples_per_second': 20.519, 'train_steps_per_second': 2.756, 'train_loss': 3.91307039614077, 'epoch': 3.0}





TrainOutput(global_step=27, training_loss=3.91307039614077, metrics={'train_runtime': 9.7957, 'train_samples_per_second': 20.519, 'train_steps_per_second': 2.756, 'train_loss': 3.91307039614077, 'epoch': 3.0})

In [None]:
import torch

# Specify an input string
input_string = "Cross-Site Scripting is a vulnerability that"

# Put model on GPU if available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Training leaves the model in training mode; switch to evaluation mode so that
# dropout is disabled and generation is deterministic
model.eval()

# Tokenize the input string and move the resulting tensors to the model's device
encoded_input = tokenizer(input_string, return_tensors="pt").to(device)
input_ids = encoded_input.input_ids

# Generate model output_ids.
# attention_mask and pad_token_id are passed explicitly because the pad token is
# the same as the EOS token, so generate() cannot infer the mask on its own.
outputs = model.generate(
    input_ids,
    attention_mask=encoded_input.attention_mask,
    pad_token_id=tokenizer.pad_token_id,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=1,
    remove_invalid_values=True,
)

In [24]:
# Turn the first generated sequence back into text, leaving out special tokens
first_sequence = outputs[0]
output_text = tokenizer.decode(first_sequence, skip_special_tokens=True)

In [25]:
# Display the generated text
output_text

'Cross-Site Scripting is a vulnerability that can be exploited by attackers to execute arbitrary code.\n'