In [3]:
import os
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/facial_expression_detection/research'

In [4]:
os.chdir('../')
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/facial_expression_detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the ``model_trainer`` section of the config file and the ``model``
    section of the params file.
    """
    # Value of `model_trainer.root_dir` from the config file.
    root_dir: Path
    # Directory where the per-fold model weights (`.pth`) are written.
    models: Path
    # Directory where confusion-matrix and training-curve images are written.
    figures: Path
    # Passed to `CustomImageDataset` as its second positional argument.
    dataset_folder: Path
    # Passed to `CustomImageDataset` as its first positional argument.
    dataset_labels: Path
    # NOTE(review): annotated as `dict`, but the trainer reads it with attribute
    # access (`model_params.img_in_size`, `.num_folds`, `.num_classes`,
    # `.batch_size`, `.lr`, `.num_epochs`), so the object returned by
    # `read_yaml` presumably supports that — confirm and tighten the type.
    model_params: dict

In [6]:
from src.detmood.constant import *
from src.detmood.utils.main_utils import create_directories, read_yaml

class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects.

    The three YAML documents (config, params, schema) are loaded once at
    construction time and kept on the instance as ``config``, ``params`` and
    ``schema``. The artifacts root directory named in the config file is
    created as part of construction.
    """

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH
    ):
        """Load the YAML files and create the artifacts root directory.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the hyper-parameter YAML.
            schema_file_path: Location of the schema YAML.
        """
        yaml_locations = (config_file_path, params_file_path, schema_file_path)
        # Files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(location) for location in yaml_locations
        )

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the settings for the model-training stage.

        Creates the ``models`` and ``figures`` output directories as a side
        effect.

        Returns:
            A ``ModelTrainerConfig`` populated from the ``model_trainer``
            section of the config file, with ``model_params`` set to the
            ``model`` section of the params file.
        """
        trainer_section = self.config.model_trainer
        model_section = self.params.model

        output_dirs = [trainer_section.models, trainer_section.figures]
        create_directories(output_dirs)

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            models=trainer_section.models,
            figures=trainer_section.figures,
            dataset_folder=trainer_section.dataset_folder,
            dataset_labels=trainer_section.dataset_labels,
            model_params=model_section,
        )

In [9]:
from src.detmood.constant.dataset_preparation import CustomImageDataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torchvision import transforms, models
from torchvision.models import EfficientNet_B0_Weights
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

class ModelTrainer:
    """Fine-tunes EfficientNet-B0 with stratified k-fold cross-validation.

    For every fold a fresh ImageNet-pretrained EfficientNet-B0 is given a new
    classification head, trained for ``model_params.num_epochs`` epochs and
    validated after each epoch. The weights with the lowest validation loss
    seen so far in the fold are saved to ``config.models``; a confusion matrix
    for that epoch and the per-fold loss/accuracy curves are saved to
    ``config.figures``.

    Fields read from ``config.model_params``: ``img_in_size``, ``num_folds``,
    ``num_classes``, ``batch_size``, ``lr`` and ``num_epochs``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration; no I/O happens here."""
        self.config = config

    def dataset_folds_preparation(self):
        """Build the image dataset and the stratified fold splitter.

        Returns:
            tuple: ``(dataset, skf)`` where ``dataset`` is a
            ``CustomImageDataset`` and ``skf`` is a ``StratifiedKFold`` with
            ``num_folds`` shuffled splits and a fixed ``random_state`` of 42.
        """
        transform = transforms.Compose([
            transforms.Resize((
                self.config.model_params.img_in_size,
                self.config.model_params.img_in_size
            )),
            # NOTE(review): train and validation subsets both wrap this single
            # dataset, so the random flip is also applied to validation
            # samples. Consider a separate, augmentation-free validation
            # dataset.
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            # Mean/std commonly used with ImageNet-pretrained torchvision models.
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

        dataset = CustomImageDataset(
            self.config.dataset_labels,
            self.config.dataset_folder,
            transform=transform
        )

        skf = StratifiedKFold(
            n_splits=self.config.model_params.num_folds,
            shuffle=True,
            random_state=42
        )

        return dataset, skf

    def validation(
            self,
            device,
            fold,
            model,
            criterion,
            val_loader,
            val_losses,
            val_accuracies,
            epoch,
            best_val_loss
        ):
        """Evaluate ``model`` on ``val_loader`` and checkpoint it on improvement.

        When the average validation loss is strictly lower than
        ``best_val_loss``, the model's ``state_dict`` is written to
        ``efficientnet_fold_<n>.pth`` and a confusion matrix image to
        ``cm_fold_<n>.png`` (both overwritten on later improvements).

        Args:
            device: Device the batches are moved to.
            fold: Zero-based fold index (used in file names and messages).
            model: Network to evaluate; switched to eval mode here.
            criterion: Loss function applied to ``(outputs, labels)``.
            val_loader: Iterable of ``(images, labels)`` validation batches.
            val_losses: List that the epoch's average loss is appended to.
            val_accuracies: List that the epoch's accuracy (%) is appended to.
            epoch: Zero-based epoch index (used in the log message).
            best_val_loss: Lowest validation loss seen so far in this fold.
                The value is only read; the caller is responsible for
                updating its own running minimum from the returned loss.

        Returns:
            tuple: ``(val_losses, val_accuracies, avg_val_loss, val_accuracy)``.
        """
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            print('Validation process...')
            for images, labels in tqdm(val_loader):
                images, labels = images.to(device), labels.to(device)

                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                all_preds.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = 100 * correct / total
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)

        if avg_val_loss < best_val_loss:
            model_path = os.path.join(self.config.models, f'efficientnet_fold_{fold + 1}.pth')
            torch.save(model.state_dict(), model_path)
            print(f'Saved Best Model for Fold {fold + 1} at Epoch {epoch + 1}')

            # Pin the label set so the matrix is always num_classes x num_classes
            # and lines up with the tick labels even if a class is absent from
            # this fold's labels or predictions.
            class_ids = list(range(self.config.model_params.num_classes))
            cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
            fig = plt.figure(figsize=(10, 8))
            sns.heatmap(
                cm,
                annot=True,
                fmt='d',
                cmap='Blues',
                xticklabels=class_ids,
                yticklabels=class_ids
            )
            plt.xlabel('Predicted Labels')
            plt.ylabel('True Labels')
            plt.title(f'Confusion Matrix for Fold {fold + 1}')
            plt.savefig(os.path.join(self.config.figures, f'cm_fold_{fold + 1}.png'))
            # The figure is only needed on disk; close it so open figures do
            # not pile up across epochs and folds.
            plt.close(fig)

        return val_losses, val_accuracies, avg_val_loss, val_accuracy

    def train_plot(self, range, train_matric, val_matric, train_label, val_label, fold):
        """Save a train-vs-validation curve for one metric of one fold.

        The metric name is the last word of ``train_label`` (e.g. ``'Loss'``
        for ``'Train Loss'``); it is used for the y-axis label, the title and
        the output file name ``Train_Val_<metric>_Fold_<n>.png``.

        Args:
            range: X-axis values (epoch numbers). The name shadows the builtin
                inside this method only; it is kept for call compatibility.
            train_matric: Per-epoch training values.
            val_matric: Per-epoch validation values.
            train_label: Legend label for the training curve.
            val_label: Legend label for the validation curve.
            fold: Zero-based fold index.
        """
        metric_name = str.split(train_label)[-1]

        fig = plt.figure(figsize=(12, 6))
        plt.plot(range, train_matric, label=train_label)
        plt.plot(range, val_matric, label=val_label)
        plt.xlabel('Epochs')
        plt.ylabel(metric_name)
        plt.title(f'Train/validation {metric_name} for Fold {fold + 1}')
        plt.legend()
        plt.savefig(os.path.join(self.config.figures, f'Train_Val_{metric_name}_Fold_{fold + 1}.png'))
        plt.close(fig)

    def train(self):
        """Run the full k-fold training loop and write models and figures."""
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print('Device: ', device)

        dataset, skf = self.dataset_folds_preparation()
        num_folds = self.config.model_params.num_folds

        # Wrap the split generator (not enumerate) and pass the total so the
        # fold progress bar can show real progress.
        fold_splits = tqdm(
            skf.split(dataset.data_frame, dataset.data_frame['label']),
            total=num_folds
        )
        for fold, (train_idx, val_idx) in enumerate(fold_splits):
            print(f'Fold {fold + 1}/{num_folds}')

            train_subset = Subset(dataset, train_idx)
            val_subset = Subset(dataset, val_idx)

            train_loader = DataLoader(
                train_subset,
                batch_size=self.config.model_params.batch_size,
                shuffle=True
            )
            val_loader = DataLoader(
                val_subset,
                batch_size=self.config.model_params.batch_size,
                shuffle=False
            )

            model = models.efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)
            # Read the head's input width from the pretrained layer being
            # replaced instead of hard-coding it.
            head_in_features = model.classifier[1].in_features
            model.classifier[1] = nn.Sequential(
                nn.Linear(
                    in_features=head_in_features,
                    out_features=512
                ),
                nn.ReLU(),
                nn.Linear(
                    in_features=512,
                    out_features=self.config.model_params.num_classes
                )
            )
            model.to(device)

            criterion = torch.nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=self.config.model_params.lr)

            train_losses = []
            val_losses = []
            train_accuracies = []
            val_accuracies = []
            best_val_loss = float('inf')

            for epoch in tqdm(range(self.config.model_params.num_epochs)):
                model.train()
                running_loss = 0.0
                correct_train = 0
                total_train = 0

                for images, labels in tqdm(train_loader):
                    images, labels = images.to(device), labels.to(device)

                    optimizer.zero_grad()

                    outputs = model(images)
                    loss = criterion(outputs, labels)

                    loss.backward()
                    optimizer.step()

                    running_loss += loss.item()

                    predicted = outputs.detach().argmax(dim=1)
                    total_train += labels.size(0)
                    correct_train += (predicted == labels).sum().item()

                avg_train_loss = running_loss / len(train_loader)
                train_losses.append(avg_train_loss)
                train_accuracy = 100 * correct_train / total_train
                train_accuracies.append(train_accuracy)

                val_losses, val_accuracies, avg_val_loss, val_accuracy = self.validation(
                    device,
                    fold,
                    model,
                    criterion,
                    val_loader,
                    val_losses,
                    val_accuracies,
                    epoch,
                    best_val_loss
                )
                # validation() only compares against best_val_loss; the running
                # minimum has to be carried here, otherwise every epoch would
                # be compared against infinity and overwrite the checkpoint.
                best_val_loss = min(best_val_loss, avg_val_loss)

                print(f'Epoch [{epoch+1}/{self.config.model_params.num_epochs}], '
                      f'Loss: {avg_train_loss:.4f}, '
                      f'Validation Loss: {avg_val_loss:.4f}, '
                      f'Train Accuracy: {train_accuracy:.2f}%, '
                      f'Validation Accuracy: {val_accuracy:.2f}%')

            epochs_range = range(1, self.config.model_params.num_epochs + 1)

            self.train_plot(
                epochs_range,
                train_losses,
                val_losses,
                'Train Loss',
                'Validation Loss',
                fold
            )

            self.train_plot(
                epochs_range,
                train_accuracies,
                val_accuracies,
                'Train Accuracy',
                'Validation Accuracy',
                fold
            )

            # Drop this fold's model and optimizer state before the next fold
            # builds a new model, and hand cached blocks back to the driver.
            del model, optimizer
            if device == 'cuda':
                torch.cuda.empty_cache()

            print(f'Finished fold {fold + 1}/{num_folds}\n')

        print('Training completed.')

In [14]:
# Build the trainer from the YAML-backed configuration and run k-fold training.
# Exceptions are deliberately left to propagate so the notebook shows the
# original traceback; a handler that only re-raises would add nothing.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2024-11-04 00:22:40,547: INFO: main_utils: created directory at: artifacts]
[2024-11-04 00:22:40,548: INFO: main_utils: created directory at: artifacts/model_trainer/models]
[2024-11-04 00:22:40,548: INFO: main_utils: created directory at: artifacts/model_trainer/figures]
Device:  cuda


0it [00:00, ?it/s]

Fold 1/2



  0%|          | 0/85 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s]
0it [00:00, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 5.92 GiB of which 37.94 MiB is free. Including non-PyTorch memory, this process has 5.48 GiB memory in use. Of the allocated memory 5.30 GiB is allocated by PyTorch, and 89.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)