In [1]:
import os

In [2]:
%pwd

'd:\\end-to-end-ml-project-with-mlflow\\research'

In [3]:
# Move from the notebook folder (research/) up to the project root so that the
# relative paths used below resolve from there.
# Guarded so that re-running this cell does not climb one level higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\end-to-end-ml-project-with-mlflow'

### Step 4 of the README.md: updating the entity starts here. The entity is simply the return type of the configuration function, i.e. whenever we define a configuration function, its return type is defined by the entity.

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of paths used by the data-ingestion stage.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """

    # Directory belonging to the data-ingestion stage.
    root_dir: Path
    # Location of the data file on local disk.
    local_data_file: Path
    # Destination path used when downloading the data.
    download_dir: Path

### Step 5 of the README.md: updating the configuration manager starts here. It manages the configuration (i.e. the structure) of the current stage (here, data_ingestion) and is responsible for creating the folders and directories where the data will be ingested and saved.

In [6]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage configuration entities."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The top-level artifacts directory is created up front.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion entity from the ``data_ingestion`` config section.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            A ``DataIngestionConfig`` filled with the section's ``root_dir``,
            ``local_data_file`` and ``download_dir`` values.
        """
        ingestion_section = self.config.data_ingestion
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            local_data_file=ingestion_section.local_data_file,
            download_dir=ingestion_section.download_dir,
        )

### Step 6 of the README.md: updating (or creating) the data_ingestion component starts here. We fetch the data from an S3 bucket, so this step reads the data or components (i.e. the CSV file) and then downloads it into the directory specified just above in the configuration manager.

In [8]:
import os
from mlProject import logger
from mlProject.utils.common import get_size
import boto3

In [9]:
class DataIngestion:
    """Fetches the raw data file from an S3 bucket for the data-ingestion stage."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the stage configuration for later use.

        Args:
            config: Paths used by this stage (``local_data_file``, ``download_dir``).
        """
        self.config = config

    def get_file(self, bucket_name='vedanshaws', object_key='winequality-red.csv'):
        """Download ``object_key`` from ``bucket_name`` unless the local file exists.

        If ``config.local_data_file`` already exists, nothing is downloaded and
        its size is logged instead.

        Args:
            bucket_name: Name of the S3 bucket to read from.
            object_key: Key of the object to download.
        """
        if os.path.exists(self.config.local_data_file):
            logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")
            return

        # No access keys are passed on purpose: boto3 then resolves credentials
        # through its default chain (environment variables, shared credentials
        # file, instance role, ...). Hard-coded keys, even empty placeholders,
        # would override that chain and must never be committed to a notebook.
        s3 = boto3.resource(service_name='s3', region_name='us-east-1')

        # NOTE(review): the existence check above looks at `local_data_file`,
        # while the download is written to `download_dir`. Confirm that both
        # settings point at the same file; otherwise the download will be
        # repeated on every run.
        s3.Bucket(bucket_name).download_file(
            Key=object_key,
            Filename=os.fspath(self.config.download_dir),  # accept str or Path
        )

        # Report the key that was actually downloaded (previously this logged
        # whichever key happened to be listed last in the bucket).
        logger.info(f"{object_key} downloaded from S3 bucket!")
            

### Step 7 of the README.md: updating the data_ingestion pipeline starts here. This pipeline is solely responsible for executing all of the steps defined above.

In [10]:
# Run the data-ingestion stage end to end: load the configuration, build the
# stage entity, then fetch the data file.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.get_file()
except Exception:
    # Record the failure (with traceback) in the project log, then re-raise the
    # same exception unchanged so the notebook still stops at the failing cell.
    logger.exception("Data ingestion stage failed")
    raise

[2023-11-20 21:42:24,829: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-11-20 21:42:24,829: INFO: common: yaml file: params.yaml loaded successfully]
[2023-11-20 21:42:24,838: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-11-20 21:42:24,839: INFO: common: created directory at: artifacts]
[2023-11-20 21:42:24,839: INFO: common: created directory at: artifacts/data_ingestion]
[2023-11-20 21:42:24,839: INFO: 323823785: File already exists of size: ~ 99 KB]
