In [1]:
import os
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/Cell_Segmentation_YOLO-v8/research'

In [2]:
# Move up to the project root (the folder that contains the `src` package imported
# below) only when we are not already there, so that re-running this cell does not
# climb one more directory each time.
if not os.path.isdir('src'):
    os.chdir('../')
%pwd

'/mnt/cb03386d-9344-47b1-82f9-868fbb64b4ae/python_projects/Cell_Segmentation_YOLO-v8'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the config file and consumed by
    ``DataTransformation``.
    """
    # Stage output folder; ``dataset.yaml`` is written here.
    root_dir: Path
    # Folder holding one sub-folder per sample (``images/`` and ``masks/`` inside).
    data_path: Path
    # Destination for training images and their ``.txt`` label files.
    train_path: Path
    # Destination for the images/labels moved out of ``train_path`` for validation.
    validation_path: Path
    # Passed as ``test_size`` to ``train_test_split`` when carving out validation data.
    val_size: float
    # When true, augmentation runs first and only augmented samples are exported.
    apply_aug: bool
    # Number of augmented copies generated per source sample.
    aug_size: int

In [4]:
from src.cellseg.constant import *
from src.cellseg.utils.main_utils import create_directories, read_yaml

class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_file_path = CONFIG_FILE_PATH,
        params_file_path = PARAMS_FILE_PATH,
        schema_file_path = SCHEMA_FILE_PATH
    ):
        """Load config, params and schema, then make sure the artifacts root exists.

        Args:
            config_file_path: Location of the main configuration YAML.
            params_file_path: Location of the parameters YAML.
            schema_file_path: Location of the schema YAML.
        """
        # The generator is consumed left to right, so the files are read in the
        # order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_file_path, params_file_path, schema_file_path)
        )

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create its output folders.

        Returns:
            A ``DataTransformationConfig`` filled from the ``data_transformation``
            section of the loaded configuration.
        """
        section = self.config.data_transformation

        output_dirs = [section.root_dir, section.train_path, section.validation_path]
        create_directories(output_dirs)

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            train_path=section.train_path,
            validation_path=section.validation_path,
            val_size=section.val_size,
            apply_aug=section.apply_aug,
            aug_size=section.aug_size,
        )

In [26]:
from src.cellseg import logger
import shutil
import cv2
from sklearn.model_selection import train_test_split
import albumentations as A
from tqdm import tqdm
import yaml
from pathlib import Path


class DataTransformation:
    """Turn the raw per-sample folders into a YOLO segmentation dataset.

    Input layout read from ``config.data_path`` (one folder per sample)::

        <sample>/images/<sample>.png
        <sample>/masks/<any name>      # one grayscale mask file per object

    The stages are separate methods; :meth:`sequence_transformation` chains them.
    """

    # Side length, in pixels, of the square images produced by augmentation.
    AUG_IMAGE_SIZE = 256

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration.

        Args:
            config: Settings object providing ``root_dir``, ``data_path``,
                ``train_path``, ``validation_path``, ``val_size``, ``apply_aug``
                and ``aug_size``.
        """
        self.config = config

    @staticmethod
    def _load_masks(masks_dir):
        """Read every file in ``masks_dir`` as a grayscale image.

        Files OpenCV cannot decode are skipped with a warning instead of being
        passed on as ``None``.
        """
        masks = []
        for mask_name in os.listdir(masks_dir):
            mask_path = os.path.join(masks_dir, mask_name)
            mask = cv2.imread(mask_path, 0)
            if mask is None:
                # cv2.imread reports failure by returning None rather than raising.
                logger.warning(f"Skipping unreadable mask: {mask_path}")
                continue
            masks.append(mask)
        return masks

    @staticmethod
    def _polygon_to_yolo_line(contour, width, height, class_id=0):
        """Format one contour as a YOLO segmentation label line.

        Args:
            contour: Sequence of points shaped like OpenCV contours, i.e. each
                element is ``[[x, y]]`` in pixel coordinates.
            width: Width in pixels of the image the contour was found in.
            height: Height in pixels of that image.
            class_id: Class index written at the start of the line.

        Returns:
            ``"<class_id> x1 y1 x2 y2 ..."`` with x divided by ``width`` and y by
            ``height``, or an empty string when the contour has fewer than three
            points and therefore does not describe a polygon.
        """
        if len(contour) < 3:
            return ''

        fields = [str(class_id)]
        for point in contour:
            x, y = point[0][0], point[0][1]
            fields.append(str(float(x) / width))
            fields.append(str(float(y) / height))
        return ' '.join(fields)

    def data_augmentation(self):
        """Write ``aug_size`` augmented copies of every sample next to the originals.

        Each copy is stored as a new sample folder ``<sample>_<k>`` inside
        ``config.data_path`` with the same ``images/`` + ``masks/`` layout.

        Raises:
            FileNotFoundError: If a sample's image cannot be read.
        """
        logger.info("Data augmentation started!")
        # os.listdir returns a list, so folders created below are not re-visited.
        for sample in tqdm(os.listdir(self.config.data_path)):
            sample_dir = os.path.join(self.config.data_path, sample)
            if not os.path.isdir(sample_dir):
                # Stray files in the data folder are not samples.
                continue

            img_path = os.path.join(sample_dir, 'images', sample + '.png')
            bgr_image = cv2.imread(img_path, 1)
            if bgr_image is None:
                raise FileNotFoundError(f"Could not read image: {img_path}")
            image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

            # Crop to the largest top-left square before resizing so the aspect
            # ratio is preserved; the crop size depends on the image, hence the
            # pipeline is rebuilt per sample.
            crop_dim = min(image.shape[0], image.shape[1])
            transform = A.Compose([
                A.Crop(x_min=0, y_min=0, x_max=crop_dim, y_max=crop_dim, p=1.0),
                A.Resize(height=self.AUG_IMAGE_SIZE, width=self.AUG_IMAGE_SIZE, p=1.0),
                A.RandomBrightnessContrast(brightness_limit=0.18, contrast_limit=0.18, p=0.5),
                A.RandomGamma(gamma_limit=(95, 105), p=0.5),
                A.Rotate(limit=120, p=0.7, border_mode=cv2.BORDER_REFLECT),
                A.HorizontalFlip(p=0.8),
                A.VerticalFlip(p=0.8)
            ])

            masks_list = self._load_masks(os.path.join(sample_dir, 'masks'))

            for aug_idx in range(self.config.aug_size):
                augmented = transform(image=image, masks=masks_list)
                aug_name = sample + '_' + str(aug_idx)
                aug_dir = os.path.join(self.config.data_path, aug_name)

                aug_images_dir = os.path.join(aug_dir, 'images')
                os.makedirs(aug_images_dir, exist_ok=True)
                # cv2.imwrite expects BGR channel order, so undo the RGB
                # conversion done after loading; otherwise red and blue are
                # swapped in the saved file.
                cv2.imwrite(
                    os.path.join(aug_images_dir, aug_name + '.png'),
                    cv2.cvtColor(augmented['image'], cv2.COLOR_RGB2BGR)
                )

                aug_masks_dir = os.path.join(aug_dir, 'masks')
                os.makedirs(aug_masks_dir, exist_ok=True)
                for mask_idx, mask in enumerate(augmented['masks']):
                    cv2.imwrite(
                        os.path.join(aug_masks_dir, sample + '_mask_' + str(mask_idx) + '.png'),
                        mask
                    )

        logger.info("Data augmentation finished!")

    def data_to_YOLO_formating(self):
        """Export samples to ``config.train_path`` as image + YOLO label pairs.

        With ``apply_aug`` enabled only folders whose name contains ``'_'`` are
        exported (the names produced by :meth:`data_augmentation`); their image is
        moved and the folder is deleted afterwards. Otherwise every folder is
        exported and the image is copied.
        """
        logger.info("YOLO formating started!")
        # NOTE(review): the '_' marker assumes original sample folder names never
        # contain an underscore - confirm against the dataset.
        marker = '_' if self.config.apply_aug else ''

        for sample in tqdm(os.listdir(self.config.data_path)):
            if marker not in sample:
                continue

            sample_dir = os.path.join(self.config.data_path, sample)
            img_path = os.path.join(sample_dir, 'images', sample + '.png')

            if self.config.apply_aug:
                shutil.move(img_path, self.config.train_path)
            else:
                shutil.copy2(img_path, self.config.train_path)

            label_lines = []
            for mask in self._load_masks(os.path.join(sample_dir, 'masks')):
                # Normalise by the real mask size instead of a fixed constant so
                # un-augmented images of any resolution stay inside [0, 1].
                height, width = mask.shape[:2]

                # RETR_EXTERNAL: only outer boundaries, so holes inside an object
                # are not exported as separate instances.
                contours, _ = cv2.findContours(
                    mask,
                    cv2.RETR_EXTERNAL,
                    cv2.CHAIN_APPROX_SIMPLE
                )

                for contour in contours:
                    line = self._polygon_to_yolo_line(contour, width, height)
                    if line:
                        label_lines.append(line)

            with open(os.path.join(self.config.train_path, sample + '.txt'), 'w') as file:
                file.write(''.join(line + '\n' for line in label_lines))

            if self.config.apply_aug:
                shutil.rmtree(sample_dir)
        logger.info("YOLO formating finished!")

    def train_validation_separation(self):
        """Move a ``val_size`` share of the exported pairs into ``validation_path``."""
        logger.info("Train/validation split started!")

        # Sorted because os.listdir order is arbitrary; without it the fixed
        # random_state would not give the same split on every run.
        img_list = sorted(
            name for name in os.listdir(self.config.train_path) if name.endswith('.png')
        )

        if not img_list:
            # train_test_split raises on an empty list; nothing to move anyway.
            logger.warning("No images found in train folder; skipping split.")
            logger.info("Train/validation split finished!")
            return

        _, val_list = train_test_split(
            img_list,
            test_size=self.config.val_size,
            random_state=42,
            shuffle=True
        )

        for img in val_list:
            # splitext keeps stems that themselves contain dots intact.
            stem = os.path.splitext(img)[0]
            img_path = os.path.join(self.config.train_path, img)
            ann_path = os.path.join(self.config.train_path, stem + '.txt')

            shutil.move(img_path, self.config.validation_path)
            shutil.move(ann_path, self.config.validation_path)

        logger.info("Train/validation split finished!")

    def dataset_yaml_creation(self):
        """Write ``dataset.yaml`` (train/val/test paths, class list) into ``root_dir``."""
        yaml_content = {
            # str() so the dump also works when the config holds pathlib.Path
            # objects, which yaml.safe_dump cannot represent.
            'train': str(self.config.train_path),
            'val': str(self.config.validation_path),
            'test': 'artifacts/data_ingestion/test',
            'nc': 1,
            'names': ['Cell']
        }

        yaml_file = yaml.safe_dump(yaml_content, default_flow_style=None, sort_keys=False)

        with open(os.path.join(self.config.root_dir, 'dataset.yaml'), 'w') as file:
            file.write(yaml_file)
        logger.info("File dataset.yaml created!")

    def sequence_transformation(self):
        """Run the whole stage: optional augmentation, export, split, dataset.yaml."""
        if self.config.apply_aug:
            self.data_augmentation()
        self.data_to_YOLO_formating()
        self.train_validation_separation()
        self.dataset_yaml_creation()

In [28]:
# Run the full data-transformation stage. Errors are left to propagate so the
# notebook shows the original traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.sequence_transformation()

[2024-10-12 22:35:07,494: INFO: main_utils: created directory at: artifacts]
[2024-10-12 22:35:07,495: INFO: main_utils: created directory at: artifacts/data_transformation]
[2024-10-12 22:35:07,496: INFO: main_utils: created directory at: artifacts/data_transformation/train]
[2024-10-12 22:35:07,498: INFO: main_utils: created directory at: artifacts/data_transformation/validation]
[2024-10-12 22:35:07,499: INFO: 4139087493: Data augmentation started!]


100%|██████████| 670/670 [01:48<00:00,  6.17it/s]

[2024-10-12 22:36:56,126: INFO: 4139087493: Data augmentation finished!]
[2024-10-12 22:36:56,128: INFO: 4139087493: YOLO formating started!]



100%|██████████| 4020/4020 [01:27<00:00, 45.74it/s]

[2024-10-12 22:38:24,026: INFO: 4139087493: YOLO formating finished!]
[2024-10-12 22:38:24,027: INFO: 4139087493: Train/validation split started!]
[2024-10-12 22:38:24,092: INFO: 4139087493: Train/validation split finished!]
[2024-10-12 22:38:24,094: INFO: 4139087493: File dataset.yaml created!]



