<a href="https://colab.research.google.com/github/Slebbon/TextGeneration_Projet_PSL_EnC/blob/main/GPT2_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━

In [1]:
from datasets import load_dataset, DatasetDict

  ## Parameters & Setup

In [2]:
pretrained_model = "distilbert/distilgpt2"

In [3]:
#setup parameters regarding GPU availibility on the machine and recycle used memory
import torch;
import gc;

is_gpu_available = torch.cuda.is_available()
device = 'cuda' if is_gpu_available else 'cpu'
if is_gpu_available:
    print("GPU available for notebook")
    torch.cuda.empty_cache()
    print("GPU Memory cleaned")
else:
    print("No GPU available for notebook")

gc.collect()


GPU available for notebook
GPU Memory cleaned


0

## Dataset

In [4]:
dataset = load_dataset("/content/", data_files={
    'train': f'train.csv',
    'validation': f'validation.csv',
    'test': f'test.csv'
})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Text', 'Author'],
        num_rows: 21054
    })
    validation: Dataset({
        features: ['Text', 'Author'],
        num_rows: 4513
    })
    test: Dataset({
        features: ['Text', 'Author'],
        num_rows: 4512
    })
})


  ## Tokenization

In [5]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(pretrained_model)
# Get the maximum context size
max_length = model.config.max_position_embeddings
print(f"Maximum context size: {max_length}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Maximum context size: 1024


## Get % of data

In [7]:
train_10 = dataset['train'].train_test_split(test_size=0.50)['test']
dataset['train'] = train_10

In [8]:
valid_10 = dataset['validation'].train_test_split(test_size=0.50)['test']
dataset['validation'] = valid_10

In [9]:
test_10 = dataset['test'].train_test_split(test_size=0.50)['test']
dataset['test'] = test_10

## Tokenize

In [6]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

def tokenize_function(examples):
    return tokenizer(examples["Text"],max_length=max_length)


# Apply the tokenization function to the entire dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=10,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/4513 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
sample_token = tokenizer.encode("Live long and prosper.")
print(sample_token)

[18947, 890, 290, 13983, 13]


## Data Collator

In [8]:
#We need to create data collator to manage the batches, we can use DataCollatorForLanguageModeling
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token = "<pad>"
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)
# Iterate over the generator
out = data_collator([tokenized_dataset["train"][i] for i in range(1)])
for key in out:
    print(f"{key} shape: {out[key].shape}")



input_ids shape: torch.Size([1, 9])
attention_mask shape: torch.Size([1, 9])
labels shape: torch.Size([1, 9])


## Setup the Trainer

In [9]:
#Now we train the model using the Trainer API
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    'outputs',
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-3,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=is_gpu_available,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
    save_steps = 500,
    eval_steps=500
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

## Evaluate the Performance of the Base Model

In [10]:
import math

#Calculate and report on perplexity
initial_results = trainer.evaluate()
print(initial_results)
#log the results to file
#logger.info(f"Baseline {pretrained_model} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")
print(f"Baseline {pretrained_model} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")


{'eval_loss': 5.541857719421387, 'eval_runtime': 8.539, 'eval_samples_per_second': 528.516, 'eval_steps_per_second': 66.167}
Baseline distilbert/distilgpt2 Results: Perplexity: 255.15


In [11]:
#setup our test prompts
test_prompt = "What is the meaning of life?"
test_prompt2 = "Where did that planet go??"
test_prompt3 = "What is the best way to cook a turkey?"

In [12]:
#Use the model in a pipeline to generate text.
from transformers import pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)


result = text_generator(test_prompt, max_length=100, num_return_sequences=1,temperature=1)
print(f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}")

Baseline distilbert/distilgpt2 generated result: What is the meaning of life?...What is the meaning of life? What you experience that way as a child? What you experience in your life is what you experience on a regular basis? (One important aspect of this study is what is most interesting is how you see the relationship between life and gender)


The researchers from the National Institute of Standards and Technology study. They are now studying the effects of the gender binary at birth, as well as in many key studies from around the world. The researchers used a set of


In [13]:
result = text_generator(test_prompt2, max_length=100, num_return_sequences=1,temperature=1)
print(f"Baseline {pretrained_model} generated result: {test_prompt2}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt2}...{result[0]['generated_text']}")

Baseline distilbert/distilgpt2 generated result: Where did that planet go??...Where did that planet go??????
In fact, all of the stars in our universe are orbiting a few kilometers away, and there are more than 100 million planets across your Universe today. It is a mystery whether you‿d do it because you didn‿d know how you did it, or was it not planned to, but to, and to, and especially, your entire life. If so, what do you do now?
In order to tell the truth, here


In [14]:
result = text_generator(test_prompt3, max_length=100, num_return_sequences=1,temperature=1)
print(f"Baseline {pretrained_model} generated result: {test_prompt3}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt3}...{result[0]['generated_text']}")

Baseline distilbert/distilgpt2 generated result: What is the best way to cook a turkey?...What is the best way to cook a turkey? I really want things to be the best quality that you can find. So, I used to make turkey at Chipotle, but it‬s pretty slow, so it‬s really easy to do it in a traditional way. If this ain't been the case then it is definitely a little different. I did this for Thanksgiving and hope my turkey stays where I am today that I'm going for it, so take it home if you


## Fine-Tune the Model

In [15]:
trainer.train()

Step,Training Loss,Validation Loss
500,5.9979,5.591518
1000,5.5114,5.37015
1500,5.329,5.199179
2000,5.1438,5.054142
2500,5.0288,4.977328
3000,4.5705,5.018639
3500,4.4143,4.991149
4000,4.3833,4.871944
4500,4.3397,4.834546
5000,4.3076,4.777462


Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 50, 'do_sample': True}
Non-default generation parameters: {'max_length': 5

TrainOutput(global_step=7896, training_loss=4.385359527371454, metrics={'train_runtime': 751.3917, 'train_samples_per_second': 84.06, 'train_steps_per_second': 10.509, 'total_flos': 715670365372416.0, 'train_loss': 4.385359527371454, 'epoch': 3.0})

## Evaluate the Performance of the Fine-Tuned Model

In [16]:
#Calculate and report on perplexity
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
eval_results['perplexity'] = perplexity

#logger.info(f"Fine-tuned {finetuned_modelname} Results: Perplexity: {perplexity:.2f}")
print(f"Fine-tuned results: Perplexity: {perplexity:.2f}")



Fine-tuned results: Perplexity: 118.80


In [17]:
eval_results

{'eval_loss': 4.777462005615234,
 'eval_runtime': 7.6922,
 'eval_samples_per_second': 586.697,
 'eval_steps_per_second': 73.451,
 'epoch': 3.0,
 'perplexity': 118.8024471491827}

In [18]:
#Prompt Test 1
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
result = text_generator(test_prompt, max_length=200, num_return_sequences=1,temperature=1)
print(f"Fine-tuned generated result: {test_prompt}...{result[0]['generated_text']}")
#logger.info(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt}...{result[0]['generated_text']}")

Fine-tuned generated result: What is the meaning of life?...What is the meaning of life? What's that. Please you? What's there! Come, look, here? look ye? no? no? I'll rail! we're a true! My friend, I do. You? What's a fool! I'll not good answer? O, then? how you. Now, nor a fool, there these? If the lie? Good. But, you know, by a good. What we? If it's anon's we's a man. What's not sure? What a word? What we'll I mean? I think so. But when when she's a lie? and the devil's the horse? no, I will we'll tell? How dost thou art thou art here's the matter to the gentleman? I said he's, and we can't take this, the letter? look you. You must I do us there's that's there's not? I think, thou'ld stand?


In [19]:
#Prompt Test 2
result = text_generator(test_prompt2, max_length=200, num_return_sequences=1,temperature=1)
print(f"Fine-tuned generated result: {test_prompt2}...{result[0]['generated_text']}")
#logger.info(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt2}...{result[0]['generated_text']}")

Fine-tuned generated result: Where did that planet go??...Where did that planet go?? Now at the bottom of this hour hence. What's that? I do? look how they are? Why? Nothing's here? How? I, she? Let's not?'s?" See. So, let my wife? oh me good!
@HillaryClinton!_Good, so. You're with all. So sad, you look? I think you see you're doing that? We're doing nothing, what they looked they have the thing, a great state? Why do? She? I think she was she was a lot of it! What a bad judgement see that. Good morrow, I said you see, and there was a little bit of time in November 8 points from thence, sir, they're rebuilding the media, sir a lot of your new Washington Times/them, when if ever to have said I said what I can it, and I wouldn't be talking very nice girl, she can she said it's


In [28]:
#Prompt Test 3
result = text_generator(test_prompt3, max_length=200, num_return_sequences=1,temperature=1)
print(f"Fine-tuned generated result: {test_prompt3}...{result[0]['generated_text']}")
#logger.info(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt3}...{result[0]['generated_text']}")

Fine-tuned generated result: What is the best way to cook a turkey?...What is the best way to cook a turkey? O you great spirit, there! No problem! https://t.co/ZXJlN4K2GxMv1wjA5S0. He did not come better look so proud??" So true! He would not to fix it. https://t.co/lNZtwZgVb"A"He6eV1yYt.2MtCWRy6KHLVT1jZl3W" So great to do.2K2y9"@s0D"  @politico? #MakeAmericaGreatAgain" https://t.JjW"  https://t.cooper at the beginningZl2Bj1yH3uMq1w9LqD2t.co/ZyD0Dms2Xl3lWjrqRrP9l4


In [20]:
from os import mkdir
mkdir('/content/final_model')

In [22]:
trainer.save_model('/content/final_model')

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


In [23]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
!cp -r /content/final_model /content/drive/MyDrive

# Test model

In [25]:
model = AutoModelForCausalLM.from_pretrained('/content/final_model')

In [26]:
tokenizer = AutoTokenizer.from_pretrained('/content/final_model')

In [32]:
tokenizer.pad_token

'<pad>'

In [27]:
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [28]:
text_generator('Hi, how are you?')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hi, how are you? Sir Toby, tell me how I can. The same, that you do! I hope, it will be too long. Thank you! #NewYorkValues #Trump2016\nRegisterT_ https://t.co'}]

In [30]:
text_generator('What are you doing?')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'What are you doing? I mean but they are going. They should not give them gold, but by our new, in my bed--so? It shall be too far more! Thanks. @MichaelCohen212! https://t.co'}]

In [31]:
text_generator('To be or not to be')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "To be or not to be denied yourself, but, nor the king's a fool. When dinner comes. O! you too. I love me. Good Lord Timon. Now to have no credibility!    I love them.' What"}]

In [33]:
text_generator('Make America')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Make America great again! Thank you. Thank you. #AmericaFirst https://t.co/3YDZWOl2O http://t.co/yG4KEzbQq https://t.co/e'}]