In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers datasets accelerate -q

In [None]:
from datasets import load_dataset

# Load a small 1% subset of the OpenWebText training split.
# Percent bounds in a split slice are whole numbers, so the slice is written
# as "1%" to match the intended subset size (the earlier "0.1%" spec did not).
# NOTE: trust_remote_code=True lets the dataset's own loading script run
# locally; only keep it for datasets you trust.
dataset = load_dataset("openwebtext", split="train[:1%]", trust_remote_code=True)

# Show the first record as a quick sanity check of the loaded data.
print(dataset[0])


In [None]:
from transformers import GPT2Tokenizer

MAX_SEQ_LENGTH = 512  # documents longer than this many tokens are truncated

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad_token


def tokenize_function(example):
    """Tokenize a batch of documents, truncating each to MAX_SEQ_LENGTH tokens.

    Args:
        example: Batch passed in by ``dataset.map(..., batched=True)``; its
            "text" entry holds the raw documents.

    Returns:
        The tokenizer output for the batch, left unpadded.
    """
    # No padding here: padding every document to the full 512 tokens makes
    # every stored example and every training step full-length. The language
    # modeling data collator pads each batch to its longest member instead.
    return tokenizer(example["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


# Replace the raw "text" column with the tokenizer's output columns.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


In [None]:
from transformers import GPT2LMHeadModel

# Start from the pretrained "gpt2" checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Size the token embeddings to the tokenizer's length.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)


In [None]:
from transformers import TrainingArguments

# Checkpointing: output location, save frequency, and how many to keep.
checkpoint_options = dict(
    output_dir="./gpt2-finetuned-openwebtext",
    save_steps=500,
    save_total_limit=1,
)

# Logging and progress reporting.
logging_options = dict(
    logging_dir='./logs',
    logging_steps=10,            # Show progress every 10 steps
    logging_first_step=True,     # Show log for first step too
    report_to="none",            # Avoid integration with WandB or TensorBoard
    disable_tqdm=False,          # Enable tqdm progress bar in notebook
)

# Batch size, run length, and numeric precision.
run_options = dict(
    per_device_train_batch_size=2,
    num_train_epochs=1,
    fp16=False,                  # True if using GPU with FP16 support
)

training_args = TrainingArguments(**checkpoint_options, **logging_options, **run_options)


In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch builder for training; mlm=False turns off masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
from transformers import Trainer

# The tokenizer is deliberately not passed to Trainer. The `tokenizer=` keyword
# is deprecated in recent transformers releases (replaced by `processing_class`),
# and the unpinned install may pull a release that no longer accepts it.
# Nothing is lost for training: the data collator already holds the tokenizer
# for padding, and the tokenizer is saved explicitly once training finishes.
# Side effect: intermediate checkpoints will not contain tokenizer files.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()


In [None]:
# Write the fine-tuned model and the tokenizer into the same directory so
# both can be loaded back from one path.
save_dir = "./gpt2-finetuned-openwebtext"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the directory the fine-tuned model
# and tokenizer were saved to.
model_dir = "./gpt2-finetuned-openwebtext"
generator = pipeline("text-generation", model=model_dir, tokenizer=model_dir)

# Generate one continuation of the prompt and print its text.
prompt = "In a distant future where AI and humans"
result = generator(prompt, max_length=100, num_return_sequences=1)
first_sequence = result[0]
print(first_sequence['generated_text'])
