In [1]:
!pip install -q transformers datasets nltk

In [22]:
import transformers
transformers.__version__

'4.53.1'

In [23]:
from datasets import Dataset
import nltk
import requests

nltk.download('punkt')
nltk.download('punkt_tab')

def load_pride_and_prejudice():
    url = "https://www.gutenberg.org/files/1342/1342-0.txt"
    raw_text = requests.get(url).text

    start = raw_text.lower().find("chapter i")
    end = raw_text.lower().find("*** end of the project gutenberg")
    text = raw_text[start:end]

    sentences = nltk.sent_tokenize(text)
    chunks = [' '.join(sentences[i:i+5]) for i in range(0, len(sentences), 5)]
    return Dataset.from_dict({"text": chunks})

dataset = load_pride_and_prejudice()
dataset


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Dataset({
    features: ['text'],
    num_rows: 938
})

In [24]:
from transformers import AutoTokenizer

model_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.pad_token = tokenizer.eos_token  t

def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/938 [00:00<?, ? examples/s]

In [25]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="./gpt2-pride-model",
    save_strategy="no",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    logging_steps=10,
    logging_dir="./logs",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [26]:
trainer.train()

Step,Training Loss
10,5.2404
20,4.3845
30,3.7201
40,3.6129
50,3.4524
60,3.2285
70,3.3137
80,3.282
90,3.1992
100,3.1235


TrainOutput(global_step=2350, training_loss=2.361356172764555, metrics={'train_runtime': 419.1451, 'train_samples_per_second': 22.379, 'train_steps_per_second': 5.607, 'total_flos': 612729815040000.0, 'train_loss': 2.361356172764555, 'epoch': 10.0})

In [27]:
model.save_pretrained("gpt2-pride-model")
tokenizer.save_pretrained("gpt2-pride-model")

('gpt2-pride-model/tokenizer_config.json',
 'gpt2-pride-model/special_tokens_map.json',
 'gpt2-pride-model/vocab.json',
 'gpt2-pride-model/merges.txt',
 'gpt2-pride-model/added_tokens.json',
 'gpt2-pride-model/tokenizer.json')

In [28]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load the model you just saved
model = AutoModelForCausalLM.from_pretrained("gpt2-pride-model")
tokenizer = AutoTokenizer.from_pretrained("gpt2-pride-model")

# Inference pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Prompt
prompt = "Elizabeth stood at the edge of the lake, wondering if the letter in her hand would change everything."
output = generator(prompt, max_length=30, temperature=0.9, top_k=50, do_sample=True)[0]['generated_text']

print("📜 Generated Text:\n", output)


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=30) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


📜 Generated Text:
 Elizabeth stood at the edge of the lake, wondering if the letter in her hand would change everything. Her eyes met
her eye again as she read, and she looked at Darcy with amazement. He
was not seen till after ten
minutes’ passed away; but she found that he was much more
very tired than she had ever seen him; and, after sitting a few
minutes, she set off again. Mrs. Gardiner was the first in calling to find Mr. Bennet there.
