In [10]:
!pip install transformers datasets torch



In [11]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
print("Libraries imported successfully")

Libraries imported successfully


In [12]:
text_data = """
Artificial intelligence is transforming the world.
Machine learning helps computers learn from data.
Deep learning uses neural networks to solve problems.
AI is used in healthcare, education, and finance.
AI improves automation and decision making.
"""
with open("train.txt", "w") as f:
    f.write(text_data)
print("Dataset file created")

Dataset file created


In [15]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")
print("GPT-2 model and tokenizer loaded")

GPT-2 model and tokenizer loaded


In [5]:
from datasets import load_dataset
dataset = load_dataset("text", data_files={"train": "train.txt"})
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print("Dataset loaded and tokenized")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Dataset loaded and tokenized


In [14]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=10
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)
trainer.train()
print("Training completed")



Step,Training Loss


Training completed


In [16]:
prompt = "Artificial intelligence will"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    inputs["input_ids"],
    max_length=80,
    num_return_sequences=1
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:\n")
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:

Artificial intelligence will be able to do things like search for and find people, and to do things like find out who's in the right place at the right time.

"We're going to be able to do things like that, and we're going to be able to do things like that, and we're going to be able to do things like that, and we're going to


In [17]:
with open("generated_text.txt", "w") as f:
    f.write(generated_text)
print("Output saved to generated_text.txt")

Output saved to generated_text.txt
