In [2]:
# 1Ô∏è‚É£ Install Libraries (Same as before, good to keep)
!pip install accelerate==0.34.2 transformers==4.44.2 datasets==2.20.0 torch==2.3.1 -U

In [7]:
# 1. Uninstall the potentially conflicting packages
# '--yes' forces the uninstallation without prompting
!pip uninstall transformers accelerate datasets torch -y

# 2. Reinstall the necessary, compatible versions
# Use the exact, known-good versions:
!pip install accelerate==0.34.2 transformers==4.44.2 datasets==2.20.0 torch==2.3.1 -U

# 3. Import Check
# After the installation, you MUST restart your runtime (e.g., in Colab: Runtime -> Restart Session)
# Then, try re-running the modified code, starting with the imports.

print("‚úÖ Installation and reinstallation commands executed.")
print("üö® Please RESTART YOUR RUNTIME (e.g., Runtime > Restart session in Colab) and then run all cells again.")

In [None]:


# 2. Imports and checkpoint selection
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
# from huggingface_hub import login  # Enable if you plan to upload to the Hub later

# Hugging Face Hub identifier of the checkpoint to fine-tune.
MODEL_NAME = "gpt2"

# 3. Load the pretrained weights and the matching tokenizer
print(f"‚è≥ Loading Model: {MODEL_NAME}...")
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Batching needs a padding token. When the tokenizer does not define one,
# reuse the end-of-sequence token and mirror that choice in the model config.
needs_pad_token = tokenizer.pad_token is None
if needs_pad_token:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

print("‚úÖ Model and Tokenizer loaded.")

In [None]:
# 4. Load the dataset
from datasets import load_dataset

# DailyDialog is used as the conversational fine-tuning corpus.
# NOTE(review): recent `datasets` releases may require `trust_remote_code=True`
# for datasets that ship a loading script -- confirm whether this one needs it
# with the pinned `datasets` version before relying on an unattended run.
print("‚è≥ Loading DailyDialog dataset...")
dataset = load_dataset("daily_dialog")
print(dataset)  # Inspect the available splits and columns

# 5. Preprocessing and tokenization
MAX_LENGTH = 512          # Maximum number of tokens kept per conversation
TRAIN_SAMPLES = 5000      # Upper bound on training examples for the quick demo
VALIDATION_SAMPLES = 500  # Upper bound on validation examples for the quick demo


def tokenize_function(examples):
    """Tokenize a batch of conversations for causal language modelling.

    Args:
        examples: A batch from ``dataset.map(batched=True)``. Its ``"dialog"``
            entry is a list of conversations, each a list of utterance strings.

    Returns:
        The tokenizer output for the batch. Each conversation is first joined
        into a single space-separated string and then truncated to
        ``MAX_LENGTH`` tokens.
    """
    full_text = [" ".join(dialog_list) for dialog_list in examples["dialog"]]
    return tokenizer(full_text, truncation=True, max_length=MAX_LENGTH)


# Drop all raw columns by reading their names from the dataset instead of
# hard-coding them: map() raises if asked to remove a column that does not
# exist, and only the tokenizer outputs are needed downstream.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Use a small prefix of each split for a quick demonstration. The sizes are
# clamped to the split length so select() can never index past the end.
# Set the *_SAMPLES constants to a large number to train on the full splits.
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]
train_subset = train_split.select(range(min(TRAIN_SAMPLES, len(train_split))))
validation_subset = validation_split.select(
    range(min(VALIDATION_SAMPLES, len(validation_split)))
)

print(f"üìù Training on {len(train_subset)} samples.")

# 6. Data collator: mlm=False selects the causal (next-token) LM objective
# rather than masked language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("‚úÖ Data preparation complete.")

In [None]:
# 7. Training configuration
REPO_NAME = "gpt2-dailydialog-finetuned-demo"  # Hub repository name (only used if pushing to the Hub)
OUTPUT_DIR = "./results_dailydialog"           # Where checkpoints are written

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,              # One epoch for a fast run; increase to 3-5 for better results
    per_device_train_batch_size=4,   # Small batches to keep memory use modest
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    logging_steps=50,
    save_strategy="epoch",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in the pinned transformers
    # release and only kept as a warning-emitting alias.
    eval_strategy="epoch",
    # push_to_hub=True,        # Uncomment to push to the Hub automatically
    # hub_model_id=REPO_NAME,  # Your Hugging Face model repository name
    report_to="none",          # Disable external experiment trackers
)

# 8. Build the Trainer on the subset data and run fine-tuning
print("üöÄ Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=validation_subset,
    data_collator=data_collator,
    tokenizer=tokenizer,  # Passed so the tokenizer is available for padding and saving
)

print("üî• Starting fine-tuning...")
trainer.train()

print("‚úÖ Fine-tuning complete!")

In [None]:
# 9. Try out the fine-tuned model
# `torch` is imported here because this cell queries CUDA availability and no
# other cell in the notebook imports it.
import torch
from transformers import pipeline

# Make sure dropout is disabled before sampling from the model.
model.eval()

# Text-generation pipeline wrapping the in-memory fine-tuned model.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # First GPU if present, otherwise CPU
)

# Conversational prompt
prompt = "Customer: I need to book a flight to London next week."
print(f"Input Prompt: {prompt}\n")

# Generate a continuation of the prompt
output = generator(
    prompt,
    max_length=80,            # Total length in tokens, prompt included
    num_return_sequences=1,
    do_sample=True,           # Sample instead of greedy decoding for more varied text
    temperature=0.7,          # Lower values make the output more predictable
)

print("üìù Generated Text:")
print(output[0]['generated_text'])

# 10. Save the model and tokenizer locally
SAVE_PATH = "./gpt2-dailydialog-local"
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print(f"‚úÖ Model saved locally to: {SAVE_PATH}")