In [1]:
import os

In [2]:
%pwd

'/Users/satwik/Downloads/MLproj/airlines_sentiment_classification/research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/Users/satwik/Downloads/MLproj/airlines_sentiment_classification'

In [84]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    datasets_dir: Path
    base_model_path: Path
    output_dir: Path
    model_save_path: Path
    num_train_epochs: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    warmup_steps: int
    weight_decay: float
    max_steps: int
    save_steps: int
    logging_steps: int
    


In [85]:
from pathlib import Path

CONFIG_FILE_PATH = Path("/Users/satwik/Downloads/MLproj/airlines_sentiment_classification/config/config.yaml")
PARAMS_FILE_PATH = Path("/Users/satwik/Downloads/MLproj/airlines_sentiment_classification/params.yaml")

In [86]:
print(CONFIG_FILE_PATH)

/Users/satwik/Downloads/MLproj/airlines_sentiment_classification/config/config.yaml


In [87]:
import os
os.path.join('usr', 'bin', 'spam')

'usr/bin/spam'

In [88]:
from airlinesSentiment.constants import *
from airlinesSentiment.utils.common import read_yaml, create_directories




In [94]:
class ConfigurationManager:
    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            params_filepath=PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        training = self.config.training
        params = self.params
        training_data = os.path.join(self.config.feature_engineering.datasets_dir, "datasets")
        base_model = self.config.prepare_model
        base_model = os.path.join(self.config.prepare_model.base_model_path, "prepare_model")


        #create directories if they don't exist
        create_directories([
            Path(training.root_dir),
            Path(training.model_save_path)

        ])

        training_config = TrainingConfig(
            datasets_dir=Path(training_data),  #Update this to point to feature_engineering
            base_model_path=Path(base_model),
            output_dir=Path(training.root_dir),
            model_save_path=Path(training.model_save_path),
            num_train_epochs=params.num_train_epochs,
            per_device_train_batch_size=params.per_device_train_batch_size,
            per_device_eval_batch_size=params.per_device_eval_batch_size,
            warmup_steps=params.warmup_steps,
            weight_decay=params.weight_decay,
            max_steps=params.max_steps,
            save_steps=params.save_steps,
            logging_steps=params.logging_steps
        )

        return training_config
    
    

In [90]:
from transformers import (
    BertForSequenceClassification,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoTokenizer
)

from pathlib import Path
import torch
from airlinesSentiment import logger

# model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

class ModelTraining:
    def __init__(self, config:TrainingConfig):
        self.config = config

    def load_datasets(self):
        """ Loads the train, validation and test datasets from the specified directory
        Returns:
            train_datasets, val_datasets, test_datasets: Loaded datasets"""
        datasets = Path("artifacts/feature_engineering/datasets")
        print(self.config.datasets_dir)
        print(datasets)

        train_datasets = torch.load(datasets / "train_dataset.pt", weights_only=False)
        val_datasets = torch.load(datasets / "val_dataset.pt", weights_only=False)
        test_datasets = torch.load(datasets / "test_dataset.pt", weights_only=False)

        logger.info(f"Datasets loaded from {datasets}")
        return train_datasets, val_datasets, test_datasets
    

    def train(self):
        #load datasets 
        train_dataset, val_dataset, _ = self.load_datasets()

        #initialize the model
        model = BertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



        # model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

        #set up training aruguments
        training_args = TrainingArguments(
            output_dir=self.config.outputs_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            warmup_steps=self.config.warmup_steps,
            weight_decay=self.config.weight_decay,
            max_steps=self.config.max_steps,
            save_steps=self.config.save_steps,
            evaluation_strategy="epoch",
            # load_best_model_at_end=True

        )

        # Initialize the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator
        )

        trainer.train()

        self.save_model(trainer)


    def save_model(self, trainer):
        """ Saves the trainer model and tokenizer to the specified directoty
        Args:
            trainer(Trainer): The Trainer object containing the trained model"""
        
        save_path = Path(self.config.model_save_path)
        save_path.mkdir(parents=True, exist_ok=True)

        trainer.save_model(save_path)
        logger.info(f"Model saved to {save_path}")


        # # Save the tokenizer
        # tokenizer = trainer.tokenizer
        # if tokenizer is not None:
        #     tokenizer.save_pretrained(save_path)
        #     logger.info(f"Tokenizer saved to {save_path}")

        # else:
        #     logger.warning("Tokenizer not found  in the trainer object. Only the model was saved")

        # Save the processing class(e.g Tokenizer)
        processing_class = getattr(trainer, "processing_class", None)
        if processing_class is not None:
            processing_class.save_pretrained(save_path)
            logger.info(f"Processing class (e.g tokenizer) saved to path {save_path}")

        else:
            logger.warning("Processing class not found. Only the model was saved ")

    
    def evaluate(self):
        """
        Evaluate the model on the test dataset"""

        # Load, the test dataset
        _, _, test_dataset = self.load_datasets()

        # Load the trained model
        # model = AutoModelForSequenceClassification.from_pretrained(self.config.model_save_path)
        model = AutoModelForSequenceClassification.from_pretrained(self.config.model_save_path)

        # Set up training arguments for evaluation
        training_args = TrainingArguments(
            output_dir=self.config.outputs_dir,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size
        )

        # Set up taining arguments for evaluation
        training_args = TrainingArguments(
            output_dir=self.config.outputs_dir,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size
        )

        # Initialize the Trainer for evaluation
        training_args = TrainingArguments(
            output_dir=self.config.outputs_dir,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size
        )

        # Initialize the Trainer for evaluation 

        trainer = Trainer(
            model = model,
            args = training_args
        )

        # Evaluate the model
        results = trainer.evaluate(test_dataset)

        logger.info("Evaluation Results:")
        logger.info("Evaluation Results: ", results)
        logger.info(f"     - Loss: {results['eval_loss']:.4f}")
        logger.info(f"     - Runtime: {results['eval_runtime']:.2f} seconds")
        logger.info(f"     - Samples per Second: {results['eval_samples_per_second']:.2f}")
        logger.info(f"     - Steps per Second: {results['eval_steps_per_second']:.2f}")
        logger.info(f"     - Epoch: {results.get('epoch', 'N/A')}")

        

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoTokenizer
)

# from transformers import BertForSequenceClassification


from pathlib import Path
import torch 
from airlinesSentiment import logger

class ModelTraining:
    def __init__(self, config: TrainingConfig):
        """ Initialize the ModelTraining class.

        Args:
        config (TrainingConfig): Configuration for Model Training
        """

        self.config = config

    def load_datasets(self):
        """ 
        Loads the train, validation and test datasets from the specified directory"""


        #Explicitly set the corrupt path
        datasets = Path("artifacts/feature_engineering/datasets")

        #Debug: print the datasets directory
        print("Datasets Directory:", datasets)

        #Load Datasets
        train_dataset = torch.load(datasets / "train_dataset.pt", weights_only=False)
        val_dataset = torch.load(datasets / "val_dataset.pt", weights_only=False)
        test_dataset = torch.load(datasets / "test_dataset.pt", weights_only=False)

        logger.info(f"Datasets loaded from {datasets}")
        return train_dataset, val_dataset, test_dataset
    
    def train(self):
        """ Trains the model using laoded dataset"""

        # Load datasets 
        train_dataset, val_dataset, _ = self.load_datasets()

        # Load the base model and tokenizer from the artifacts/ prepare base model folder 
        #base_model_path = Path(self.config.base_model_path)
        base_model_path = Path("artifacts/prepare_model")
        model = AutoModelForSequenceClassification.from_pretrained(base_model_path)
        tokenizer = AutoTokenizer.from_pretrained(base_model_path)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


        # set up training arguments
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            num_train_epochs=self.config.num_train_epochs,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            warmup_steps=self.config.warmup_steps,
            weight_decay=self.config.weight_decay,
            max_steps=self.config.max_steps,
            save_steps=self.config.save_steps,
            evaluation_strategy="epoch",

        )

        #Initialize the trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator
        )

        #start training
        trainer.train()

        #save the trained model

        self.save_model(trainer)

    
    def save_model(self, trainer):
        """ 
        Saves the trained model and tokenizer to the specified directory.
        Args:
            trainer(Trainer): The trainer object containing the trained model"""
        
        save_path = Path(self.config.model_save_path)
        save_path.mkdir(parents=True, exist_ok=True)

        # This is the single, correct line to save both the model and tokenizer
        trainer.save_model(save_path)
    
        logger.info(f"Model saved to {save_path}")


        # Save the model
        tokenizer = trainer.tokenizer
        if tokenizer is not None:
            tokenizer.save_pretrained(save_path)
            logger.info(f"Tokenizer saved to {save_path}")
        else:
            logger.warning("Tokenizer not found in the trainer object. Only the model was saved")

    
    def evaluate(self):
        """ 
        Evaluate the model on the test dataset."""

        # Load the test dataset
        _, _, test_dataset = self.load_datasets()

        #load the trained model from the model_save_path
        model = AutoModelForSequenceClassification.from_pretrained(self.config.model_save_path)

        #Set up training arguments for evaluation
        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size
        )

        #Initialize the Trainer for evaluation
        trainer = Trainer(
            model=model,
            args=training_args
        )

        #Evaluate the model
        results = trainer.evaluate(test_dataset)

        logger.info("Evaluation Results: ")
        logger.info(f"     - Loss: {results['eval_loss']:.4f}")
        logger.info(f"     - Runtime: {results['eval_runtime']:.2f} Seconds")
        logger.info(f"     - Samples per Second: {results['eval_samples_per_second']:.2f}")
        logger.info(f"     - Steps per Second: {results['eval_steps_per_second']:.2f}")
        logger.info(f"     - Epoch: {results.get('epoch', 'N/A')}")

In [108]:
from accelerate import PartialState
accelerator_state_kwargs = {"enabled": True, "use_configured_state": False}


# Initialize Partialstate

# from  airlinesSentiment.components.feature_engineering import SentimentDataset
if __name__ == "__main__":
    #Initialize the configuration manager
    config_manager = ConfigurationManager()

    #get the model trainig config 
    training_config = config_manager.get_training_config()

    #Initialize the ModelTraining
    model_training = ModelTraining(config=training_config)

    #Train the model
    model_training.train()

    #Evaluate the model
    model_training.evaluate()

    partial_state = PartialState()




[2025-08-11 02:35:11,021: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-08-11 02:35:11,027: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-11 02:35:11,030: INFO: common: created directory at: artifacts]
[2025-08-11 02:35:11,031: INFO: common: created directory at: artifacts/training]
[2025-08-11 02:35:11,032: INFO: common: created directory at: artifacts/training/trained_model]
Datasets Directory: artifacts/feature_engineering/datasets
[2025-08-11 02:35:11,482: INFO: 422766164: Datasets loaded from artifacts/feature_engineering/datasets]


max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1098 [00:00<?, ?it/s]

{'eval_loss': 1.030842900276184, 'eval_runtime': 19.268, 'eval_samples_per_second': 113.971, 'eval_steps_per_second': 56.986, 'epoch': 0.0}
{'train_runtime': 65.3165, 'train_samples_per_second': 0.306, 'train_steps_per_second': 0.153, 'train_loss': 1.0909061431884766, 'epoch': 0.0}
[2025-08-11 02:36:22,103: INFO: 422766164: Model and tokenizer saved to artifacts/training/trained_model]
[2025-08-11 02:36:22,118: INFO: 422766164: Tokenizer saved to artifacts/training/trained_model]
Datasets Directory: artifacts/feature_engineering/datasets
[2025-08-11 02:36:22,753: INFO: 422766164: Datasets loaded from artifacts/feature_engineering/datasets]


  0%|          | 0/1098 [00:00<?, ?it/s]

[2025-08-11 02:36:40,369: INFO: 422766164: Evaluation Results: ]
[2025-08-11 02:36:40,370: INFO: 422766164:      - Loss: 1.0390]
[2025-08-11 02:36:40,372: INFO: 422766164:      - Runtime: 16.50 Seconds]
[2025-08-11 02:36:40,372: INFO: 422766164:      - Samples per Second: 133.09]
[2025-08-11 02:36:40,374: INFO: 422766164:      - Steps per Second: 66.55]
[2025-08-11 02:36:40,375: INFO: 422766164:      - Epoch: N/A]
