In [None]:
# 1Ô∏è‚É£ Install Libraries
!pip install accelerate==0.34.2 transformers==4.44.2 datasets==2.20.0 torch==2.3.1 -U

In [None]:


# 2️⃣ Imports and Configuration
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Checkpoint to fine-tune and the fixed token length of every training example.
MODEL_NAME = "distilgpt2"
BLOCK_SIZE = 128 # The fixed length we want for every training example

print("⏳ Loading Model and Tokenizer...")
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Set the pad_token for batching (standard for GPT-like models)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Read the id back from the tokenizer so the model config and the
    # tokenizer cannot disagree about which id means "padding".
    model.config.pad_token_id = tokenizer.pad_token_id

print("✅ Model loaded.")

In [None]:
# 3️⃣ Load Simple Text Dataset (Wikitext-2)
print("⏳ Loading Wikitext-2 dataset...")
# Loads the "wikitext-2-raw-v1" configuration; later cells use its
# "train" and "validation" splits and its "text" column.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Remove unnecessary columns and tokenize the raw text
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the notebook-level tokenizer.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        The tokenizer output for that text, requested with the special-tokens
        mask included.
    """
    raw_texts = examples["text"]
    encoded = tokenizer(raw_texts, return_special_tokens_mask=True)
    return encoded

tokenized_datasets = dataset.map(
    function=tokenize_function,
    # Drop the original columns so only the tokenizer outputs remain.
    remove_columns=dataset["train"].column_names,
    batched=True,
    num_proc=4,  # Tokenize in several worker processes for speed
)

# 4️⃣ Group Texts into Fixed Blocks
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name (e.g. "input_ids") to a list of
            per-example token lists, as handed over by a batched ``map`` call.
        block_size: Length of every output block. When omitted, the
            notebook-level ``BLOCK_SIZE`` is used.

    Returns:
        A dict with the same keys as ``examples`` whose values are lists of
        ``block_size``-long chunks, plus a "labels" key holding a copy of the
        "input_ids" chunks. Tokens that do not fill a complete final block
        are dropped.

    Raises:
        KeyError: If ``examples`` has no "input_ids" column.
    """
    # Local import keeps the function self-contained.
    from itertools import chain

    if block_size is None:
        block_size = BLOCK_SIZE

    # Flatten each column in linear time; sum(lists, []) re-copies the growing
    # accumulator on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }

    # The first column determines the total length, as in the per-column split below.
    total_length = len(next(iter(concatenated_examples.values())))

    # Drop the last partial block
    total_length = (total_length // block_size) * block_size

    # Split the long list into chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Labels are an unshifted copy of input_ids; no shifting is done here
    # (causal-LM models in transformers apply the one-token shift themselves
    # when computing the loss).
    result["labels"] = result["input_ids"].copy()
    return result

# Apply the grouping function.
# batched=True hands group_texts a batch of rows at a time, so a trailing
# partial block is dropped at the end of each batch.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4,
)

# 5️⃣ Data Collator (Ensures MLM is False for Causal LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print(f"📝 Final training sample size: {len(lm_datasets['train'])}")
print("✅ Data preparation complete.")

In [None]:
# Use a very small subset for a fast demonstration run
# (2000 examples is 250 optimizer steps at a per-device batch size of 8 on one device).
TRAIN_SUBSET_SIZE = 2000
EVAL_SUBSET_SIZE = 500

# Clamp to the split length: select() raises on out-of-range indices, which
# would happen if a larger BLOCK_SIZE left fewer blocks than requested.
train_subset = lm_datasets["train"].select(
    range(min(TRAIN_SUBSET_SIZE, len(lm_datasets["train"])))
)
eval_subset = lm_datasets["validation"].select(
    range(min(EVAL_SUBSET_SIZE, len(lm_datasets["validation"])))
)

# 6️⃣ Define Training Arguments (Minimal settings)
OUTPUT_DIR = "./simple_clm_results"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,          # Only 1 epoch
    per_device_train_batch_size=8,
    eval_strategy="epoch",       # Evaluate at the end of the epoch; the default ("no") never uses eval_dataset
    logging_steps=50,            # Log loss every 50 steps
    save_strategy="no",          # Don't save checkpoints during this quick run
    report_to="none"
)

# 7️⃣ Initialize and Train the Trainer
print("🚀 Initializing Trainer and starting fine-tuning...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=data_collator,
)

# Fine-tunes `model` in place; the same object is reused for inference afterwards.
trainer.train()

print("✅ Fine-tuning complete!")

# 8️⃣ Test Inference
# Training leaves the model in training mode; switch to eval mode so dropout
# is disabled while generating text.
model.eval()

# Create a text generation pipeline using the fine-tuned model
# (`pipeline` is imported with the other transformers names in the imports cell).
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

prompt = "The primary goal of machine learning is to"
print(f"\nInput Prompt: {prompt}\n")

# Sampling is enabled, so the generated continuation varies between runs.
output = generator(
    prompt,
    max_length=50,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.8
)

print("📝 Generated Text:")
print(output[0]['generated_text'])