In [1]:
import os
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/facial_expression_detection/research'

In [2]:
os.chdir('../')
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/facial_expression_detection'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of settings consumed by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the project config plus the
    parameters file and the data-validation status file.
    """

    # Value of `root_dir` from the `data_transformation` config section.
    root_dir: Path
    # Source dataset tree; traversed as <split>/<class>/<image> when flattening.
    dataset_folder: Path
    # Destination tree; receives the images as <split>/<image> and is
    # preprocessed in place.
    transformed_dataset: Path
    # Labels CSV shipped with the source dataset.
    dataset_labels_src: Path
    # Labels CSV location used by this stage (copy target and dataset input).
    dataset_labels: Path
    # Parsed parameters file. It is accessed with attribute syntax
    # (e.g. `params.model.img_in_size`), so it is presumably not a plain dict
    # but whatever mapping type `read_yaml` returns — TODO confirm.
    params: dict
    # Outcome of the data-validation stage; the transformation only runs when True.
    dataset_val_status: bool

In [4]:
from src.detmood.constant import *
from src.detmood.utils.main_utils import create_directories, read_yaml

class ConfigurationManager:
    """Loads the project YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH
    ):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_file_path: Path of the main configuration YAML.
            params_file_path: Path of the parameters YAML.
            schema_file_path: Path of the schema YAML.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        self.schema = read_yaml(schema_file_path)

        create_directories([self.config.artifacts_root])

    @staticmethod
    def _parse_validation_status(status_text: str) -> bool:
        """Interpret the last whitespace-separated token of the status file.

        `bool("False")` is True because any non-empty string is truthy, so the
        token has to be compared textually instead of being cast.

        NOTE(review): assumes the validation stage writes the literal
        ``True``/``False`` as the last token of the file — confirm against the
        writer of STATUS_FILE.

        Args:
            status_text: Raw content of the validation status file.

        Returns:
            True only when the last token equals "true" (case-insensitive);
            False for any other token and for an empty/whitespace-only file.
        """
        tokens = status_text.split()
        if not tokens:
            return False
        return tokens[-1].lower() == 'true'

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration object for the data-transformation stage.

        Reads the data-validation status file, ensures the transformed-dataset
        directory exists and packs the relevant paths and parameters.

        Returns:
            DataTransformationConfig: Settings for ``DataTransformation``.

        Raises:
            OSError: If the validation status file cannot be opened.
        """
        config = self.config.data_transformation
        dataset_val_status_file = self.config.data_validation.STATUS_FILE

        with open(dataset_val_status_file, 'r') as f:
            status = self._parse_validation_status(f.read())

        create_directories([config.transformed_dataset])

        return DataTransformationConfig(
            root_dir=config.root_dir,
            dataset_folder=config.dataset_folder,
            transformed_dataset=config.transformed_dataset,
            dataset_labels_src=config.dataset_labels_src,
            dataset_labels=config.dataset_labels,
            params=self.params,
            dataset_val_status=status
        )

In [5]:
import cv2
from tqdm import tqdm
import pandas as pd
from torchvision import transforms
from src.detmood.constant.dataset_preparation import CustomImageDataset
from sklearn.model_selection import StratifiedKFold
import shutil

class DataTransformation:
    """Flattens the dataset, preprocesses the training images and builds CV folds."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Paths, parameters and validation status for this stage.
        """
        self.config = config

    def ungroup_folder_classes(self):
        """Copy ``<split>/<class>/<image>`` files into ``<split>/<image>``.

        The copy only happens when the destination directory is empty, so an
        already populated destination is left untouched. After the images, the
        source labels CSV is copied to the stage's labels path.

        Returns:
            bool: True when this call performed the copy, False when the
            destination already had content and nothing was done.
        """
        if os.listdir(self.config.transformed_dataset):
            return False

        for split_name in os.listdir(self.config.dataset_folder):
            split_src = os.path.join(self.config.dataset_folder, split_name)
            split_dst = os.path.join(self.config.transformed_dataset, split_name)
            os.makedirs(split_dst, exist_ok=True)

            for class_name in tqdm(os.listdir(split_src)):
                class_src = os.path.join(split_src, class_name)
                # NOTE(review): files with the same name in two class folders
                # end up at the same destination path, so the later copy wins.
                for img_name in tqdm(os.listdir(class_src)):
                    shutil.copy2(os.path.join(class_src, img_name), split_dst)

        shutil.copy2(self.config.dataset_labels_src, self.config.dataset_labels)
        return True

    def equalize_histogram(self, img):
        """Equalize the brightness histogram of a BGR image.

        The image is converted to HSV, the V channel (index 2) is equalized and
        the result is converted back to BGR.

        Args:
            img: BGR image array, e.g. as returned by ``cv2.imread``.

        Returns:
            The equalized image in BGR.
        """
        img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        img_hsv[:,:,2] = cv2.equalizeHist(img_hsv[:,:,2])
        img_eq = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR)

        return img_eq

    def noise_reduction(self, img):
        """Apply a median blur whose kernel size comes from the parameters file.

        Args:
            img: Image array to filter.

        Returns:
            The filtered image.
        """
        img_filt = cv2.medianBlur(
            img,
            self.config.params.transform.noise_reduction.median_filter_size
        )

        return img_filt

    def labels_csv_transform(self):
        """Rewrite the labels CSV with every ``label`` mapped through ``MOOD_DICT``.

        Reads the source labels CSV and writes the mapped frame to the stage's
        labels path without the index column.

        Raises:
            KeyError: If a label value is not a key of ``MOOD_DICT``.
        """
        labels_df = pd.read_csv(self.config.dataset_labels_src)

        # One vectorised pass instead of a per-row `.loc` read/write; indexing
        # (rather than passing the mapping itself) keeps unknown labels an error.
        labels_df['label'] = labels_df['label'].map(lambda raw_label: MOOD_DICT[raw_label])

        labels_df.to_csv(self.config.dataset_labels, index=False)

    def dataset_folds_preparation(self):
        """Create the augmented dataset and its stratified K-fold split generator.

        Returns:
            tuple: ``(dataset, splits)`` where ``dataset`` is the
            ``CustomImageDataset`` and ``splits`` is the generator returned by
            ``StratifiedKFold.split`` over ``dataset.balanced_frame``,
            stratified on its ``label`` column.
        """
        img_size = self.config.params.model.img_in_size

        transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(15),
            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            transforms.RandomAffine(translate=(0.1, 0.1), degrees=15),
            # NOTE(review): the crop size is fixed at 224 while the resize above
            # follows `img_in_size` — confirm this is intentional.
            transforms.RandomResizedCrop((224, 224), scale=(0.8, 1.0)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

        # NOTE(review): the dataset is built on `dataset_folder`, not on
        # `transformed_dataset` where the preprocessed images are written —
        # verify against CustomImageDataset which tree is actually meant.
        dataset = CustomImageDataset(
            self.config.dataset_labels,
            self.config.dataset_folder,
            self.config.params.model.data_aug_size,
            transform=transform
        )

        skf = StratifiedKFold(
            n_splits=self.config.params.model.num_folds,
            shuffle=True,
            random_state=42
        )

        splits = skf.split(dataset.balanced_frame, dataset.balanced_frame['label'])

        return dataset, splits

    def transformation_compose(self):
        """Run the whole stage: flatten, preprocess the train images, build folds.

        Histogram equalization and median filtering overwrite the files in
        ``<transformed_dataset>/train``. They are therefore applied only when
        the images were copied by this call; running them again on an already
        processed folder would filter the same pixels repeatedly.
        ``labels_csv_transform`` is not invoked here; the labels CSV is copied
        as-is by ``ungroup_folder_classes``.

        Returns:
            tuple | None: ``(dataset, splits)`` from
            ``dataset_folds_preparation``, or None (after printing a message)
            when the dataset validation status is falsy.
        """
        if not self.config.dataset_val_status:
            print("Dataset is not valid!")
            return None

        freshly_copied = self.ungroup_folder_classes()

        if freshly_copied:
            train_dir = os.path.join(self.config.transformed_dataset, 'train')
            for img_name in tqdm(os.listdir(train_dir)):
                img_path = os.path.join(train_dir, img_name)
                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread signals an unreadable/non-image file with None.
                    tqdm.write(f"Skipping unreadable image: {img_path}")
                    continue

                img_filt = self.noise_reduction(self.equalize_histogram(img))
                cv2.imwrite(img_path, img_filt)

        return self.dataset_folds_preparation()

In [6]:
# Build the stage configuration and run the transformation end to end.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)

# transformation_compose() returns None when the validation status is falsy;
# fail with an explicit message instead of an opaque unpacking TypeError.
transformation_result = data_transformation.transformation_compose()
if transformation_result is None:
    raise RuntimeError(
        "Data transformation was skipped because the dataset validation status is not True."
    )

dataset, splits = transformation_result

[2024-11-27 20:52:38,877: INFO: main_utils: created directory at: artifacts]
[2024-11-27 20:52:38,879: INFO: main_utils: created directory at: artifacts/data_transformation/DATASET]


100%|██████████| 329/329 [00:00<00:00, 7556.64it/s]
100%|██████████| 74/74 [00:00<00:00, 7404.24it/s]
100%|██████████| 1185/1185 [00:00<00:00, 6791.07it/s]
100%|██████████| 680/680 [00:00<00:00, 8054.65it/s]
100%|██████████| 160/160 [00:00<00:00, 8075.77it/s]
100%|██████████| 478/478 [00:00<00:00, 7701.08it/s]
100%|██████████| 162/162 [00:00<00:00, 7976.86it/s]
100%|██████████| 7/7 [00:00<00:00, 15.80it/s]
100%|██████████| 1290/1290 [00:00<00:00, 9076.08it/s]
100%|██████████| 281/281 [00:00<00:00, 6837.61it/s]
100%|██████████| 4772/4772 [00:00<00:00, 8668.48it/s]
100%|██████████| 2524/2524 [00:00<00:00, 8361.49it/s]
100%|██████████| 717/717 [00:00<00:00, 7684.82it/s]
100%|██████████| 1982/1982 [00:00<00:00, 7473.11it/s]
100%|██████████| 705/705 [00:00<00:00, 7435.89it/s]
100%|██████████| 7/7 [00:01<00:00,  4.59it/s]
100%|██████████| 12271/12271 [00:05<00:00, 2191.99it/s]

{np.int64(5): np.int64(14316), np.int64(4): np.int64(14316), np.int64(1): np.int64(14316), np.int64(6): np.int64(14316), np.int64(2): np.int64(14316), np.int64(3): np.int64(14316), np.int64(7): np.int64(14316)}





In [8]:
# Number of rows in `dataset.balanced_frame`, the frame the stratified folds are drawn from.
len(dataset.balanced_frame)

100212