In [1]:
# Step 1: Install Required Dependencies
!pip install transformers datasets accelerate



In [2]:
# Step 2: Import Required Libraries
import torch
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset
import logging
from dataclasses import dataclass

In [3]:
# Step 3: Setup Basic Logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)

In [4]:
# Step 4: Define Configuration
@dataclass
class DPOConfig:
    """Configuration for DPO training"""
    model_name: str = "distilbert-base-uncased"
    max_length: int = 128
    train_batch_size: int = 16
    eval_batch_size: int = 16
    learning_rate: float = 2e-5
    num_epochs: int = 1
    weight_decay: float = 0.01
    num_samples: int = 500
    test_size: float = 0.2
    output_dir: str = "./results"
    logging_dir: str = "./logs"
    logging_steps: int = 10

In [6]:
# Step 5: Load and Prepare Dataset
def load_dataset_step():
    logger.info("Loading dataset...")
    dataset = load_dataset('imdb', split=f'train[:500]')
    logger.info(f"Loaded {len(dataset)} examples")
    return dataset

In [7]:
# Step 6: Initialize Model and Tokenizer
def init_model_tokenizer(config):
    logger.info(f"Loading model and tokenizer: {config.model_name}")
    
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    
    # Initialize model
    model = AutoModelForSequenceClassification.from_pretrained(
        config.model_name,
        num_labels=2
    )
    
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    logger.info(f"Using device: {device}")
    return model, tokenizer

In [8]:
# Step 7: Preprocess Dataset
def preprocess_dataset(dataset, tokenizer, config):
    logger.info("Tokenizing dataset...")
    
    def tokenize_function(examples):
        return tokenizer(
            examples['text'],
            truncation=True,
            padding=True,
            max_length=config.max_length
        )

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset



In [9]:
# Step 8: Split Dataset
def split_dataset(tokenized_dataset, config):
    logger.info(f"Splitting dataset with test_size={config.test_size}")
    split = tokenized_dataset.train_test_split(test_size=config.test_size)
    return split['train'], split['test']



In [10]:
# Step 9: Setup Training Arguments
def setup_training_args(config):
    return TrainingArguments(
        output_dir=config.output_dir,
        evaluation_strategy="epoch",
        learning_rate=config.learning_rate,
        per_device_train_batch_size=config.train_batch_size,
        per_device_eval_batch_size=config.eval_batch_size,
        num_train_epochs=config.num_epochs,
        weight_decay=config.weight_decay,
        logging_dir=config.logging_dir,
        logging_steps=config.logging_steps,
        report_to="none"  # Disable wandb logging
    )



In [11]:
# Step 10: Main Training Function
def train_model():
    # Initialize configuration
    config = DPOConfig()
    
    try:
        # Load dataset
        dataset = load_dataset_step()
        
        # Initialize model and tokenizer
        model, tokenizer = init_model_tokenizer(config)
        
        # Preprocess dataset
        tokenized_dataset = preprocess_dataset(dataset, tokenizer, config)
        
        # Split dataset
        train_dataset, test_dataset = split_dataset(tokenized_dataset, config)
        
        # Setup training arguments
        training_args = setup_training_args(config)
        
        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
        )
        
        # Train model
        logger.info("Starting training...")
        train_results = trainer.train()
        logger.info("Training completed")
        
        # Evaluate model
        logger.info("Evaluating model...")
        eval_results = trainer.evaluate()
        logger.info("Evaluation completed")
        
        # Print results
        print("\nTraining Results:")
        print(f"Training Loss: {train_results.training_loss:.4f}")
        print(f"Training Runtime: {train_results.metrics['train_runtime']:.2f}s")
        print("\nEvaluation Results:")
        print(f"Eval Loss: {eval_results['eval_loss']:.4f}")
        print(f"Eval Runtime: {eval_results['eval_runtime']:.2f}s")
        
        # Save model
        logger.info("Saving model and tokenizer...")
        model.save_pretrained(f"{config.output_dir}/saved_model")
        tokenizer.save_pretrained(f"{config.output_dir}/saved_model")
        logger.info("Model and tokenizer saved")
        
        return {
            "train_results": train_results,
            "eval_results": eval_results
        }
        
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise



In [12]:
# Step 11: Run Training
if __name__ == "__main__":
    results = train_model()

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.1331,0.068038



Training Results:
Training Loss: 0.2364
Training Runtime: 5.24s

Evaluation Results:
Eval Loss: 0.0680
Eval Runtime: 0.21s
