In [2]:
%pwd

'd:\\Documentation_sample\\research'

In [3]:
import os

# The notebook is launched from <project>/research, but config and artifact
# paths are relative to the project root. Only move up while still inside
# "research" so that re-running this cell does not climb further up the tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\Documentation_sample'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessConfig:
    """Immutable bundle of settings consumed by the data-preprocessing stage.

    The field order below defines the positional constructor signature, so it
    must stay stable. Instances are frozen: assigning to a field after
    construction raises ``dataclasses.FrozenInstanceError``.
    """

    # Filesystem locations
    root_dir: Path            # directory created for this stage's outputs
    data_dir: Path            # passed as the dataset ``root`` when loading MNIST
    train_loader_dir: Path    # file the training DataLoader is saved to
    valid_loader_dir: Path    # file the validation DataLoader is saved to
    test_loader_dir: Path     # file the test DataLoader is saved to

    # Tunable parameters
    params_batch_size: int    # batch size used for every DataLoader
    params_valid_size: float  # fraction of the training set held out for validation
    params_image_dim: int     # images are resized to (image_dim, image_dim)


In [6]:
from sample_project.constants import *
from sample_project.utils.common import read_yaml, create_directories
from pathlib import Path

In [7]:
class ConfigurationManager:
    """Load the project's YAML config and params files and build stage configs.

    Constructing the manager reads both YAML files and makes sure the
    top-level artifacts directory exists.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read the config and params files and create the artifacts root.

        Args:
            config_filepath: Location of the main YAML config file.
            params_filepath: Location of the YAML parameters file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_preprocess_config(self) -> DataPreprocessConfig:
        """Assemble the configuration for the data-preprocessing stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            DataPreprocessConfig: locations taken from the
            ``data_preprocessing`` section of the config file, plus
            ``IMAGE_DIM``, ``BATCH_SIZE`` and ``VALID_SIZE`` from the params
            file.
        """
        config = self.config.data_preprocessing
        params = self.params

        create_directories([config.root_dir])

        # DataPreprocessConfig declares its location fields as Path, and
        # dataclasses do not validate or convert types, so the raw YAML values
        # are converted here to honour that contract.
        data_preprocess_config = DataPreprocessConfig(
            root_dir=Path(config.root_dir),
            data_dir=Path(config.data_dir),
            train_loader_dir=Path(config.train_loader_dir),
            valid_loader_dir=Path(config.valid_loader_dir),
            test_loader_dir=Path(config.test_loader_dir),
            params_image_dim=params.IMAGE_DIM,
            params_batch_size=params.BATCH_SIZE,
            params_valid_size=params.VALID_SIZE,
        )

        return data_preprocess_config

In [8]:
import os
import urllib.request as request
import zipfile
from sample_project import logger
from sample_project.utils.common import get_size
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import torch.utils.data as data
import numpy as np
import torch

In [9]:
class DataPreprocess:
    """Load MNIST, hold out a validation split and persist the DataLoaders.

    Intended call order: ``transform_data`` -> ``split_valid_set`` ->
    ``data_loaders``. Each step stores its result on the instance
    (``train_data``, ``valid_data``, ``test_data``) for the next step.
    """

    # Mean and std handed to transforms.Normalize for the single image channel.
    _NORM_MEAN = (0.1307,)
    _NORM_STD = (0.3081,)

    def __init__(self, config: DataPreprocessConfig):
        """Keep a reference to the stage configuration."""
        self.config = config

    def _build_transform(self, image_dim):
        """Return the tensor-conversion / resize / normalise pipeline."""
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((image_dim, image_dim)),
            transforms.Normalize(self._NORM_MEAN, self._NORM_STD),
        ])

    def transform_data(self):
        """Create the MNIST train and test datasets with their transforms.

        Sets ``self.train_data`` and ``self.test_data``. No ``download``
        argument is passed to ``datasets.MNIST``, so the dataset files are
        expected to be present under ``config.data_dir`` already.
        """
        image_dim = self.config.params_image_dim
        data_dir = self.config.data_dir

        # Train and test currently share the same pipeline, but each gets its
        # own Compose instance, as before.
        train_transform = self._build_transform(image_dim)
        test_transform = self._build_transform(image_dim)

        self.train_data = datasets.MNIST(
            root=data_dir,
            train=True,
            transform=train_transform,
        )
        self.test_data = datasets.MNIST(
            root=data_dir,
            train=False,
            transform=test_transform,
        )

    def split_valid_set(self, seed=None):
        """Hold out a random validation subset from the training data.

        After this call ``self.train_data`` is the reduced training subset and
        ``self.valid_data`` the held-out subset; the two are disjoint. Calling
        the method again splits the already reduced training subset.

        Args:
            seed: Optional integer. When given, the split is drawn from a
                dedicated ``torch.Generator`` seeded with it, so the split is
                reproducible. When ``None`` (default) the global RNG is used.

        Raises:
            ValueError: If ``config.params_valid_size`` is outside ``[0, 1)``.
        """
        valid_size = self.config.params_valid_size
        if not 0 <= valid_size < 1:
            raise ValueError(
                f"params_valid_size must be in [0, 1), got {valid_size!r}"
            )

        full_train = self.train_data
        test_data = self.test_data

        valid_set_length = int(valid_size * len(full_train))
        train_set_length = len(full_train) - valid_set_length

        split_kwargs = {}
        if seed is not None:
            split_kwargs["generator"] = torch.Generator().manual_seed(seed)

        train_subset, valid_subset = data.random_split(
            full_train, [train_set_length, valid_set_length], **split_kwargs
        )

        # Store BOTH halves. Previously only valid_data was stored, so the
        # train loader was later built over the full training set and
        # therefore still contained every validation sample.
        self.train_data = train_subset
        self.valid_data = valid_subset

        print("Train_data length:",len(train_subset),"Valid_data length:",len(valid_subset),"Test_data length:",len(test_data))

    def data_loaders(self):
        """Wrap the three datasets in DataLoaders and save each one to disk.

        Only the training loader shuffles. Target files come from
        ``config.train_loader_dir``, ``config.valid_loader_dir`` and
        ``config.test_loader_dir``.
        """
        batch_size = self.config.params_batch_size

        train_iterator = data.DataLoader(
            self.train_data,
            shuffle=True,
            batch_size=batch_size,
        )
        valid_iterator = data.DataLoader(self.valid_data, batch_size=batch_size)
        test_iterator = data.DataLoader(self.test_data, batch_size=batch_size)

        self.save_loader(self.config.train_loader_dir, train_iterator)
        self.save_loader(self.config.valid_loader_dir, valid_iterator)
        self.save_loader(self.config.test_loader_dir, test_iterator)

    @staticmethod
    def save_loader(path:Path,loader: data.DataLoader):
        """Serialise ``loader`` to ``path`` with ``torch.save`` and report it."""
        # NOTE(review): torch.save pickles the whole DataLoader object, so the
        # file should only be loaded in an environment with compatible
        # torch/torchvision versions and from a trusted source.
        torch.save(loader, path)
        print("Saved at ", path)
                




In [10]:
# Run the preprocessing stage end to end: build the stage config, load the
# datasets, hold out the validation split, then build and save the loaders.
# Any exception simply propagates; the previous `except Exception as e: raise e`
# wrapper handled nothing and only added a re-raise frame to the traceback.
config = ConfigurationManager()
data_preprocess__config = config.get_data_preprocess_config()
data_preprocess = DataPreprocess(config=data_preprocess__config)
data_preprocess.transform_data()
data_preprocess.split_valid_set()
data_preprocess.data_loaders()

[2024-07-28 01:56:41,341: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-28 01:56:41,344: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-28 01:56:41,345: INFO: common: created directory at: artifacts]
[2024-07-28 01:56:41,346: INFO: common: created directory at: artifacts/data_preprocessing]
Train_data length: 54000 Valid_data length: 6000 Test_data length: 10000
Saved at  artifacts/data_preprocessing/train_loader.ld
Saved at  artifacts/data_preprocessing/valid_loader.ld
Saved at  artifacts/data_preprocessing/test_loader.ld
