In [1]:
# SECTION 1: INSTALL & SETUP
!pip install -q transformers torch accelerate peft datasets pandas scikit-learn

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from sklearn.model_selection import train_test_split

print("âœ… Setup complete")

âœ… Setup complete


In [2]:
# SECTION 2: UPLOAD & PREPARE DATA
from google.colab import files
import io

uploaded = files.upload()
df = pd.read_csv('/content/cleaned_conversations.csv')
print(f"ðŸ“Š Loaded {len(df)} conversations")

# Split data
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
print(f"ðŸŽ¯ Train: {len(train_df)}, Val: {len(val_df)}")

Saving cleaned_conversations.csv to cleaned_conversations (1).csv
ðŸ“Š Loaded 121838 conversations
ðŸŽ¯ Train: 109654, Val: 12184


In [3]:
# SECTION 3: LOAD FAST MODEL
MODELS = {
    "distilgpt2": "distilbert/distilgpt2",        # âš¡ Fastest
    "tiny-llama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # âš¡ Balanced
}

tokenizer = AutoTokenizer.from_pretrained(MODELS["distilgpt2"])
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODELS["distilgpt2"],
    torch_dtype=torch.float16,
    device_map="auto"
)
print("âœ… Model loaded")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


âœ… Model loaded


In [8]:
# SECTION 4: TRAINING SETUP (OPTIMIZED FOR SPEED)
def format_prompt(row):
    return f"Human: {row['input']}\nAssistant: {row['response']}"

def tokenize_data(examples):
    texts = [format_prompt({'input': i, 'response': r})
             for i, r in zip(examples['input'], examples['response'])]

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding='max_length',  # Explicitly pad to max_length
        max_length=256,  # Shorter for faster training
        return_tensors="pt",
        return_attention_mask=True
    )

    # Ensure labels are also padded and truncated
    tokenized["labels"] = tokenized["input_ids"].clone()

    # Ensure all items in the dictionary are tensors
    return {k: torch.tensor(v) for k, v in tokenized.items()}


# Prepare datasets (faster processing)
train_dataset = Dataset.from_pandas(train_df).map(tokenize_data, batched=True, batch_size=1000)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_data, batched=True, batch_size=1000)

Map:   0%|          | 0/109654 [00:00<?, ? examples/s]

Map:   0%|          | 0/12184 [00:00<?, ? examples/s]

In [9]:
# SECTION 5: FAST FINE-TUNING
# LoRA Configuration - Minimal for speed
lora_config = LoraConfig(
    r=8,                    # Smaller rank
    lora_alpha=16,
    target_modules=["c_attn", "c_proj", "c_fc"], # Corrected target modules for DistilGPT2
    lora_dropout=0.01,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# Training Arguments - Optimized for speed
training_args = TrainingArguments(
    output_dir="./fine-tuned-chatbot",
    per_device_train_batch_size=8,    # Larger batches
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,    # No accumulation
    learning_rate=1e-4,               # Higher learning rate
    num_train_epochs=2,               # Fewer epochs
    logging_steps=20,
    eval_steps=50,
    save_steps=100,
    fp16=True,
    dataloader_pin_memory=False,      # Faster data loading
    report_to="none"
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

print("ðŸš€ Starting fast training...")
trainer.train()

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


ðŸš€ Starting fast training...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,7.9224
40,1.6429
60,0.4901
80,0.4468
100,0.4226
120,0.3917
140,0.381
160,0.3864
180,0.3506
200,0.3413


TrainOutput(global_step=27414, training_loss=0.2655754121534237, metrics={'train_runtime': 4035.8822, 'train_samples_per_second': 54.34, 'train_steps_per_second': 6.793, 'total_flos': 1.4524803339780096e+16, 'train_loss': 0.2655754121534237, 'epoch': 2.0})

In [10]:
# SECTION 6: SAVE & DOWNLOAD
trainer.save_model()
tokenizer.save_pretrained("./fine-tuned-chatbot")

from google.colab import files
!zip -r fine-tuned-chatbot.zip fine-tuned-chatbot/
files.download('fine-tuned-chatbot.zip')
print("âœ… Model downloaded")

  adding: fine-tuned-chatbot/ (stored 0%)
  adding: fine-tuned-chatbot/checkpoint-6500/ (stored 0%)
  adding: fine-tuned-chatbot/checkpoint-6500/tokenizer_config.json (deflated 54%)
  adding: fine-tuned-chatbot/checkpoint-6500/vocab.json (deflated 59%)
  adding: fine-tuned-chatbot/checkpoint-6500/adapter_model.safetensors (deflated 7%)
  adding: fine-tuned-chatbot/checkpoint-6500/optimizer.pt (deflated 8%)
  adding: fine-tuned-chatbot/checkpoint-6500/scheduler.pt (deflated 61%)
  adding: fine-tuned-chatbot/checkpoint-6500/scaler.pt (deflated 64%)
  adding: fine-tuned-chatbot/checkpoint-6500/trainer_state.json (deflated 78%)
  adding: fine-tuned-chatbot/checkpoint-6500/special_tokens_map.json (deflated 60%)
  adding: fine-tuned-chatbot/checkpoint-6500/training_args.bin (deflated 53%)
  adding: fine-tuned-chatbot/checkpoint-6500/tokenizer.json (deflated 82%)
  adding: fine-tuned-chatbot/checkpoint-6500/adapter_config.json (deflated 57%)
  adding: fine-tuned-chatbot/checkpoint-6500/merges

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

âœ… Model downloaded


In [11]:
# SECTION 7: TEST
def chat(message):
    prompt = f"Human: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=80,
        temperature=0.7,
        do_sample=True
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("Assistant:")[-1].strip()

print("ðŸ¤– Testing chatbot:")
print(chat("Hello!"))

ðŸ¤– Testing chatbot:
good thanks
