In [1]:
import os


In [3]:
%pwd

'c:\\Users\\paago\\Documents\\Chicken_Disease_Classification\\research'

In [5]:
os.chdir("../")

In [6]:
%pwd

'c:\\Users\\paago\\Documents\\Chicken_Disease_Classification'

In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source: str
    source_csv: str
    local_data_file: Path
    unzip_dir: Path
    train_size: float = None
    test_size: float = None
    validation_size: float = None
    random_state: int = None
    max_samples: int = None
    min_samples: int = None
    img_size: list = None
    working_dir: str = None
    batch_size: int = None  


In [8]:
import os
import pandas as pd
from pathlib import Path
from dataclasses import dataclass
import yaml
import zipfile
from sklearn.model_selection import train_test_split
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from cnnClassifier.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from cnnClassifier.utils.common import read_yaml, create_directories, get_size
from cnnClassifier.entity.config_entity import DataIngestionConfig
from cnnClassifier import logger




In [10]:
class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        print("Config: ", self.config)
        self.params = read_yaml(params_filepath)  
        print("Params: ", self.params)
        create_directories([self.config.data_ingestion.root_dir])

    def copy_zip_data(self):
        source_path = self.config.data_ingestion.source
        destination_path = self.config.data_ingestion.local_data_file
        if not os.path.exists(destination_path):
            shutil.copy2(source_path, destination_path)
            logger.info(f"Zip file copied from {source_path} to {destination_path}.")
        else:
            logger.info(f"File {destination_path} already exists.")

    def unzip_data(self):
        try:
            with zipfile.ZipFile(self.config.data_ingestion.local_data_file, 'r') as zip_ref:
                zip_ref.extractall(self.config.data_ingestion.unzip_dir)
            logger.info("Zip file extracted successfully.")
        except FileNotFoundError:
            logger.error(f"Zip file not found at path: {self.config.data_ingestion.local_data_file}")
            raise
        except Exception as e:
            logger.error(f"An error occurred while extracting zip file: {e}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        params = self.params.data_ingestion

        create_directories([config.root_dir, params.working_dir])
        self.copy_zip_data()
        self.unzip_data()

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source=config.source,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir,
            source_csv=config.source_csv,
            train_size=params.train_size,
            test_size=params.test_size,
            validation_size=params.validation_size,
            random_state=params.random_state,
            max_samples=params.max_samples,
            min_samples=params.min_samples,
            img_size=params.img_size,
            working_dir=params.working_dir
        )

        return data_ingestion_config



In [11]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config
        
    
    def load_csv(self):
        logger.info("Loading CSV file.")
        df = pd.read_csv(self.config.source_csv)
        df.columns=['filepaths', 'labels']
        df['filepaths'] = df['filepaths'].apply(lambda x: os.path.join(self.config.root_dir, x))
        logger.info(f"CSV file loaded successfully with {df.shape[0]} rows.")
        return df

    
    def split_data(self, df):
        logger.info("Splitting data into training, testing, and validation sets.")
        trsplit = self.config.train_size
        vsplit = self.config.validation_size
        dsplit = vsplit / (1 - trsplit)
        strat = df['labels']
        train_df, dummy_df = train_test_split(df, train_size=trsplit, shuffle=True, random_state=self.config.random_state, stratify=strat)
        strat = dummy_df['labels']
        test_df, valid_df = train_test_split(dummy_df, train_size=dsplit, shuffle=True, random_state=self.config.random_state, stratify=strat)
        logger.info("Data split successfully.")
        logger.info(f"Train set size: {train_df.shape[0]}")
        logger.info(f"Test set size: {test_df.shape[0]}")
        logger.info(f"Validation set size: {valid_df.shape[0]}")

        return train_df, test_df, valid_df



    def class_distribution(self, train_df):
        logger.info("Getting class distribution.")
        groups = train_df.groupby('labels')  
        print('{0:^30s} {1:^13s}'.format('CLASS', 'IMAGE COUNT'))
        for label in train_df['labels'].unique():
            print('{0:^30s} {1:^13d}'.format(label, len(groups.get_group(label))))
        print('\n')


    def trim(self, train_df):
        logger.info("Trimming classes.")
        max_size = self.config.max_samples
        min_size = self.config.min_samples
        column = 'labels'
        train_df = train_df.copy()
        original_class_count= len(list(train_df[column].unique()))
        logger.info('Original Number of classes in dataframe: %s', original_class_count)
        sample_list=[] 
        groups=train_df.groupby(column)
        for label in train_df[column].unique():        
            group=groups.get_group(label)
            sample_count=len(group)         
            if sample_count> max_size :
                strat=group[column]
                samples,_=train_test_split(group, train_size=max_size, shuffle=True, random_state=self.config.random_state, stratify=strat)            
                sample_list.append(samples)
            elif sample_count>= min_size:
                sample_list.append(group)
        train_df=pd.concat(sample_list, axis=0).reset_index(drop=True)
        final_class_count= len(list(train_df[column].unique())) 
        if final_class_count != original_class_count:
            logger.warning('*** WARNING***  dataframe has a reduced number of classes' )
        balance=list(train_df[column].value_counts())
        logger.info('Class balance: %s', balance)
        return train_df
    
    def prepare_working_dir(self):
        logger.info("Preparing working directory.")
        os.makedirs(self.config.working_dir, exist_ok=True)
        logger.info("Working directory prepared.")

    
    def execute(self):
        logger.info("Starting data ingestion.")
        df = self.load_csv()
        train_df, test_df, valid_df = self.split_data(df)
        self.class_distribution(train_df)
        train_df = self.trim(train_df)
        self.prepare_working_dir()

        # save the dataframes to CSV files
        print("About to write train.csv")
        train_df.to_csv(os.path.join(self.config.working_dir, 'train.csv'), index=False)
        print("Finished writing train.csv")

        test_df.to_csv(os.path.join(self.config.working_dir, 'test.csv'), index=False)
        valid_df.to_csv(os.path.join(self.config.working_dir, 'valid.csv'), index=False)


        logger.info("Data ingestion completed successfully.")
        return train_df, test_df, valid_df


In [14]:
# Create a ConfigurationManager instance and fetch data ingestion config
cm = ConfigurationManager()
config = cm.get_data_ingestion_config()

[2023-07-31 04:48:10,636: INFO: common: yaml file: config\config.yaml loaded successfully]
Config:  {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source': 'C:\\\\Users\\\\paago\\\\Downloads\\\\Chicken_Fecal.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion', 'source_csv': 'artifacts\\\\data_ingestion\\\\Fecal_data.csv'}}
[2023-07-31 04:48:10,641: INFO: common: yaml file: params.yaml loaded successfully]
Params:  {'base_model': {'image_size': [224, 224], 'model_name': 'EfficientNetB5', 'weights': 'imagenet', 'include_top': False, 'pooling': 'max'}, 'full_model': {'dropout_rate1': 0.3, 'dropout_rate2': 0.45, 'dense_1024_regularizer_l2': 0.016, 'dense_1024_regularizer_l1': 0.006, 'dense_128_regularizer_l2': 0.016, 'dense_128_regularizer_l1': 0.006}, 'learning_rate': 0.001, 'classes': 10, 'epochs': 50, 'ask_epoch': 10, 'data_ingestion': {'train_size': 0.9, 'test_size': 0.05, 'validation_size': 

In [15]:
data_ingestion = DataIngestion(config)
train_df, test_df, valid_df = data_ingestion.execute()

print("Train DataFrame:")
print(train_df.head())
print("\nTest DataFrame:")
print(test_df.head())
print("\nValidation DataFrame:")
print(valid_df.head())

[2023-07-31 04:48:35,646: INFO: 3530614321: Starting data ingestion.]
[2023-07-31 04:48:35,647: INFO: 3530614321: Loading CSV file.]
[2023-07-31 04:48:35,683: INFO: 3530614321: CSV file loaded successfully with 8067 rows.]
[2023-07-31 04:48:35,683: INFO: 3530614321: Splitting data into training, testing, and validation sets.]
[2023-07-31 04:48:35,692: INFO: 3530614321: Data split successfully.]
[2023-07-31 04:48:35,693: INFO: 3530614321: Train set size: 7260]
[2023-07-31 04:48:35,693: INFO: 3530614321: Test set size: 403]
[2023-07-31 04:48:35,694: INFO: 3530614321: Validation set size: 404]
[2023-07-31 04:48:35,695: INFO: 3530614321: Getting class distribution.]
            CLASS               IMAGE COUNT 
         Coccidiosis               2228     
          Salmonella               2362     
           Healthy                 2164     
      New Castle Disease            506     


[2023-07-31 04:48:35,699: INFO: 3530614321: Trimming classes.]
[2023-07-31 04:48:35,702: INFO: 3530614