# Model Training for Data Cleaning Pipeline

This notebook covers the training and fine-tuning of models used in our data cleaning pipeline. We'll focus on:
1. Loading and preparing the data
2. Setting up the models
3. Training the development status classifier
4. Model evaluation and validation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import yaml
import logging
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Configure logging once; every later cell logs through `logger`.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Prefer the GPU whenever PyTorch reports one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(
    "Using GPU for training" if device.type == "cuda" else "Using CPU for training"
)

# Create the model-cache and log directories up front (no-op if they exist).
os.makedirs("../models/cache", exist_ok=True)
os.makedirs("../logs", exist_ok=True)

# Fixed seeds for reproducibility. The NumPy call stays last so the cell's
# final expression evaluates to None.
torch.manual_seed(42)
np.random.seed(42)

In [None]:
# Load configuration
def load_config():
    """Load and validate the model configuration file.

    Reads ``../configs/model_config.yaml`` (relative to the working
    directory), checks that all required top-level sections are present and
    that ``model_params.device`` / ``model_params.batch_size`` have the
    expected types.

    Returns:
        dict: The parsed configuration.

    Raises:
        FileNotFoundError: If the config file does not exist.
        KeyError: If a required top-level section is missing.
        ValueError: If the file is empty or not a mapping, if
            ``model_params`` is not a mapping, or if ``device`` /
            ``batch_size`` have the wrong type.
    """
    try:
        config_path = Path('../configs/model_config.yaml')
        if not config_path.exists():
            raise FileNotFoundError(f"Config file not found at {config_path}")

        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)

        # safe_load yields None for an empty document and a list/scalar for
        # non-mapping documents; fail with a clear message instead of a
        # TypeError from the membership checks below.
        if not isinstance(config, dict):
            raise ValueError(
                f"Config file {config_path} must contain a YAML mapping at the top level"
            )

        # Validate required config sections
        required_sections = ['model_params', 'optimization', 'checkpoint',
                             'logging', 'evaluation', 'early_stopping']
        for section in required_sections:
            if section not in config:
                raise KeyError(f"Missing required config section: {section}")

        # Validate model params
        model_params = config['model_params']
        if not isinstance(model_params, dict):
            raise ValueError("model_params must be a mapping")
        if not isinstance(model_params.get('device'), str):
            raise ValueError("device must be a string ('cuda' or 'cpu')")
        batch_size = model_params.get('batch_size')
        # bool is a subclass of int, so `batch_size: true` must be rejected
        # explicitly.
        if isinstance(batch_size, bool) or not isinstance(batch_size, int):
            raise ValueError("batch_size must be an integer")

        return config

    except Exception as e:
        logger.error(f"Error loading config: {str(e)}")
        raise

# Load the configuration at notebook scope; a failure is logged and re-raised
# so the cell stops instead of continuing without `config`.
try:
    config = load_config()
except Exception as load_error:
    logger.error(f"Failed to load configuration: {load_error}")
    raise
else:
    logger.info("Configuration loaded successfully!")

## 1. Data Loading and Preparation

In [None]:
def load_and_prepare_data(file_path):
    """Load the CSV at ``file_path`` and prepare it for model training.

    Steps performed:
      * verify the file exists and contains the columns ``text``,
        ``hashtags``, ``place_country_code`` and ``Developed / Developing``;
      * fill missing ``text`` / ``hashtags`` with ``''`` and missing
        ``place_country_code`` with ``'UNK'``;
      * normalise ``Developed / Developing`` (trim whitespace, title-case)
        into ``development_status``;
      * add an integer ``label`` column (``Developed`` -> 1,
        ``Developing`` -> 0) and drop rows whose status is anything else.

    Args:
        file_path: Path to the CSV file.

    Returns:
        pd.DataFrame: The cleaned frame, containing only rows with a valid
        development status.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If required columns are missing or no valid rows remain.
    """
    try:
        # Validate file path
        if not Path(file_path).exists():
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # Load data
        df = pd.read_csv(file_path)

        # Validate required columns
        required_cols = ['text', 'hashtags', 'place_country_code', 'Developed / Developing']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Basic preprocessing
        df['text'] = df['text'].fillna('').astype(str)
        df['hashtags'] = df['hashtags'].fillna('').astype(str)
        df['place_country_code'] = df['place_country_code'].fillna('UNK')  # Match config default

        label_map = {'Developed': 1, 'Developing': 0}

        # Trim before title-casing so values such as " developing " are not
        # rejected (and their rows dropped) because of stray whitespace.
        status = df['Developed / Developing'].str.strip().str.title()
        is_valid = status.isin(list(label_map))

        invalid_statuses = status[~is_valid].unique()
        if len(invalid_statuses) > 0:
            logger.warning(f"Found invalid development statuses: {invalid_statuses}")

        # Invalid / missing statuses become NaN; those rows are dropped below.
        df['development_status'] = status.where(is_valid)

        # Create binary labels for development status
        df['label'] = df['development_status'].map(label_map)

        # Drop rows with invalid labels
        df = df.dropna(subset=['label'])

        if len(df) == 0:
            raise ValueError("No valid records remaining after preprocessing")

        # The mapping yields floats whenever a NaN was present; cast so the
        # label dtype is an integer regardless of the input's cleanliness.
        return df.astype({'label': 'int64'})

    except Exception as e:
        logger.error(f"Error loading and preparing data: {str(e)}")
        raise

# Load the raw CSV and preview the first rows; any failure (including in the
# preview itself) is logged and re-raised.
try:
    df = load_and_prepare_data('../data/raw/zero_waste.csv')
    logger.info(f"Loaded {len(df)} records")
    print(f"Loaded {len(df)} records", "\nSample data:", sep="\n")
    display(df.head())
except Exception as load_error:
    logger.error(f"Failed to load and prepare data: {load_error}")
    raise

## 2. Model Setup and Data Preparation

In [None]:
# Import required libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
import torch
import logging

# Re-bind the module logger in this cell; logging.getLogger returns the same
# logger object for a given name, so this matches the logger created earlier.
logger = logging.getLogger(__name__)

class ModelTrainer:
    """Holds the development-status tokenizer/model and builds tokenized datasets.

    On construction the tokenizer and sequence-classification model named in
    ``config['model_params']['development_status']`` are loaded and the model
    is moved to ``config['model_params']['device']``.
    """

    def __init__(self, config):
        """Initialize the model trainer with configuration.

        Args:
            config: Parsed configuration mapping; ``model_params`` must
                provide ``device``, ``max_length`` and a
                ``development_status`` sub-section with ``model_name``,
                ``cache_dir`` and ``num_labels``.
        """
        self.config = config
        self.device = torch.device(config['model_params']['device'])
        self.setup_models()

    def setup_models(self):
        """Initialize the development-status tokenizer and model.

        Sets ``self.dev_status_tokenizer`` and ``self.dev_status_model``.
        Any failure is logged and re-raised.
        """
        try:
            # Development status model
            model_config = self.config['model_params']['development_status']

            # Initialize tokenizer
            self.dev_status_tokenizer = AutoTokenizer.from_pretrained(
                model_config['model_name'],
                cache_dir=model_config['cache_dir'],
                use_fast=True  # Use fast tokenizer for better performance
            )

            # Initialize model
            self.dev_status_model = AutoModelForSequenceClassification.from_pretrained(
                model_config['model_name'],
                num_labels=model_config['num_labels'],
                cache_dir=model_config['cache_dir'],
                problem_type="single_label_classification"
            ).to(self.device)

            logger.info("Models initialized successfully")

        except Exception as e:
            logger.error(f"Error initializing models: {str(e)}")
            raise

    def prepare_dataset(self, df, text_col, label_col):
        """Build a tokenized ``datasets.Dataset`` from a DataFrame.

        Args:
            df: DataFrame holding the examples.
            text_col: Name of the column with the input text.
            label_col: Name of the column with the (integer-convertible) label.

        Returns:
            The tokenized dataset: tokenizer outputs plus the ``label``
            column. The raw ``text`` column is removed.

        Raises:
            ValueError: If ``text_col`` or ``label_col`` is not in ``df``.
        """
        try:
            # Validate inputs
            if text_col not in df.columns or label_col not in df.columns:
                raise ValueError(f"Required columns {text_col} and/or {label_col} not found in dataframe")

            # Create features (column-wise zip avoids the per-row Series
            # construction of DataFrame.iterrows)
            features = [
                {'text': str(text), 'label': int(label)}
                for text, label in zip(df[text_col], df[label_col])
            ]

            # Convert to Dataset format
            dataset = Dataset.from_list(features)

            # Tokenize function
            def tokenize_function(examples):
                return self.dev_status_tokenizer(
                    examples['text'],
                    padding='max_length',
                    truncation=True,
                    max_length=self.config['model_params']['max_length'],
                    return_tensors=None  # Return as lists
                )

            # Tokenize dataset. Only the raw text is dropped: removing every
            # original column would also strip 'label', leaving the Trainer
            # without targets to compute a loss from.
            tokenized_dataset = dataset.map(
                tokenize_function,
                batched=True,
                remove_columns=['text'],
                desc="Tokenizing dataset"
            )

            logger.info(f"Dataset prepared successfully with {len(tokenized_dataset)} samples")
            return tokenized_dataset

        except Exception as e:
            logger.error(f"Error preparing dataset: {str(e)}")
            raise

# Build the notebook-wide ModelTrainer; construction loads the tokenizer and
# model, so a failure here is logged and re-raised.
try:
    trainer = ModelTrainer(config)
except Exception as init_error:
    logger.error(f"Failed to initialize model trainer: {init_error}")
    raise
else:
    print("Model trainer initialized successfully!")

## 3. Data Splitting and Tokenization

In [None]:
# Import required libraries
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, stratifying on the label so both
# splits keep the same class proportions.
train_df, eval_df = train_test_split(
    df,
    stratify=df['label'],
    test_size=0.2,
    random_state=config['model_params']['seed'],
)

# Report how many rows landed in each split.
logger.info(f"Training set size: {len(train_df)}")
logger.info(f"Evaluation set size: {len(eval_df)}")

try:
    # Tokenize both splits with the shared ModelTrainer.
    train_dataset = trainer.prepare_dataset(train_df, 'text', 'label')
    eval_dataset = trainer.prepare_dataset(eval_df, 'text', 'label')

    # Echo the tokenized sizes in the cell output.
    print(f"Training samples: {len(train_dataset)}")
    print(f"Evaluation samples: {len(eval_dataset)}")

    # Neither split may be empty.
    assert len(train_dataset) > 0, "Training dataset is empty"
    assert len(eval_dataset) > 0, "Evaluation dataset is empty"

except Exception as prep_error:
    logger.error(f"Error preparing datasets: {prep_error}")
    raise

## 4. Training Setup

In [None]:
def setup_training(config, train_dataset, eval_dataset, model=None, tokenizer=None):
    """Setup training arguments and trainer for development status classification.

    Args:
        config: Parsed configuration mapping; reads the ``model_params``,
            ``logging``, ``evaluation``, ``checkpoint`` and ``optimization``
            sections.
        train_dataset: Tokenized training dataset.
        eval_dataset: Tokenized evaluation dataset.
        model: Model to train. When omitted, the notebook-level
            ``trainer.dev_status_model`` is used.
        tokenizer: Tokenizer matching ``model``. When omitted, the
            notebook-level ``trainer.dev_status_tokenizer`` is used.

    Returns:
        transformers.Trainer: A trainer ready for ``.train()``.

    Raises:
        Exception: Any error during setup is logged and re-raised, including
            a NameError when ``model``/``tokenizer`` are omitted and no
            notebook-level ``trainer`` exists.
    """
    try:
        # Import required classes
        import inspect
        from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

        # Previously `model` and `tokenizer` were referenced without ever
        # being defined. Resolve them from the arguments, falling back to
        # the ModelTrainer instance created earlier in the notebook.
        if model is None:
            model = trainer.dev_status_model
        if tokenizer is None:
            tokenizer = trainer.dev_status_tokenizer

        # Newer transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever keyword the installed version has.
        args_params = inspect.signature(TrainingArguments.__init__).parameters
        eval_strategy_key = (
            'eval_strategy' if 'eval_strategy' in args_params else 'evaluation_strategy'
        )

        # Setup training arguments with values from config
        training_args = TrainingArguments(
            output_dir="../models/dev_status_model",
            num_train_epochs=config['model_params']['num_epochs'],
            per_device_train_batch_size=config['model_params']['batch_size'],
            per_device_eval_batch_size=config['model_params']['batch_size'],
            learning_rate=config['model_params']['learning_rate'],
            warmup_steps=config['model_params']['warmup_steps'],
            weight_decay=config['model_params']['weight_decay'],
            logging_dir=config['logging']['logging_dir'],
            logging_steps=config['logging']['logging_steps'],
            save_strategy=config['checkpoint']['save_strategy'],
            load_best_model_at_end=config['checkpoint']['load_best_model_at_end'],
            metric_for_best_model=config['evaluation']['metric_for_best_model'],
            greater_is_better=config['evaluation']['greater_is_better'],
            gradient_accumulation_steps=config['model_params']['gradient_accumulation_steps'],
            fp16=config['optimization']['use_mixed_precision'],
            gradient_checkpointing=config['optimization']['use_gradient_checkpointing'],
            max_grad_norm=config['optimization']['max_grad_norm'],
            **{eval_strategy_key: config['evaluation']['eval_strategy']}
        )

        # Likewise, Trainer's `tokenizer` keyword was superseded by
        # `processing_class` in newer releases.
        trainer_params = inspect.signature(Trainer.__init__).parameters
        tokenizer_key = (
            'processing_class' if 'processing_class' in trainer_params else 'tokenizer'
        )

        # Setup trainer with model and datasets
        model_trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=DataCollatorWithPadding(tokenizer),
            **{tokenizer_key: tokenizer}
        )

        logger.info("Training setup completed successfully")
        return model_trainer

    except Exception as e:
        logger.error(f"Error in training setup: {str(e)}")
        raise

# Build the Hugging Face trainer from the prepared datasets; a failure is
# logged and re-raised so the cell stops.
try:
    model_trainer = setup_training(config, train_dataset, eval_dataset)
except Exception as setup_error:
    logger.error(f"Failed to setup training: {setup_error}")
    raise
else:
    logger.info("Training setup completed!")

## 5. Data Analysis and Validation

In [None]:
def analyze_data_distribution(df):
    """
    Analyze the distribution of labels and data characteristics.

    Shows a count plot of ``development_status`` and a histogram of text
    lengths (in characters), then prints the class proportions and summary
    statistics of the text length. The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): DataFrame containing 'text' and 'development_status' columns

    Returns:
        None - Displays plots and prints statistics

    Raises:
        ValueError: If 'text' or 'development_status' is missing from ``df``.
    """
    # Validate input data first so a bad frame fails before any plotting
    # machinery is loaded.
    required_columns = ['text', 'development_status']
    if not all(col in df.columns for col in required_columns):
        raise ValueError("DataFrame must contain 'text' and 'development_status' columns")

    # Import required libraries if not already imported
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Create figure for label distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='development_status')
    plt.title('Distribution of Development Status Labels')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Calculate text lengths as a standalone Series: assigning a new column
    # on `df` would silently mutate the caller's DataFrame.
    text_length = df['text'].astype(str).str.len()

    # Create figure for text length distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(x=text_length, bins=50)
    plt.title('Distribution of Text Length')
    plt.xlabel('Number of Characters')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Print statistics
    print("\nDevelopment Status Distribution:")
    status_dist = df['development_status'].value_counts(normalize=True)
    for status, pct in status_dist.items():
        print(f"{status}: {pct:.2%}")

    print("\nText Length Statistics:")
    length_stats = text_length.describe()
    print(f"Mean length: {length_stats['mean']:.1f} characters")
    print(f"Median length: {length_stats['50%']:.1f} characters")
    print(f"Min length: {length_stats['min']:.0f} characters")
    print(f"Max length: {length_stats['max']:.0f} characters")

# Run the exploratory analysis; it is best-effort, so a failure is only
# reported in the cell output and does not stop the notebook.
try:
    analyze_data_distribution(df)
except Exception as analysis_error:
    print(f"Error analyzing data distribution: {analysis_error}")