In [1]:
# Step 12 : updating the config.yaml file.
# Whenever we need to work with data then it is usually present outside of our system and we can download it for our use.
# When we download the data then it needs to be kept somewhere. So to do so, we create a folder called artifacts then create another
# folder called data_ingestion and then download our zip file. 
# We then need to unzip this file. For this we will create an unzip function that will unzip the data and then we will store it in
# this data_ingestion folder. 

# This is what we are doing in the config.yaml file. We are creating a folder called artifacts. We have assigned a key name to it
# called artifacts_root, which means that whenever we read artifacts_root it will return "artifacts". This is because a yaml file
# returns key-value pairs. 

# Then we created another key called data_ingestion, which contains all the data-ingestion-related configuration. 
# First it will create a folder called data_ingestion under artifacts. Then we give the source_URL.
# Then we also define the local data file name using the local_data_file key. With this we are simply renaming whatever 
# zip file is downloaded to "data".
# We also provide the unzip folder location.
# Checkout config.yaml file for all this implementation. 
# Step 12 completed.

In [2]:
# Step 13: Schema.yaml file.
# Next step in our workflows is to update the schema.yaml file. 
# We need to define our schema here. We can get the schema reference from df.info function in pandas. 
# We need to divide this file into two parts = Columns and Target Column
# So structure of this schema file:
# Columns
# column name: data type
# Target column
# name: column name
# We will then use this yaml file for validation purpose to confirm that we have all the columns that are required in our dataset.
# Go to schema.yaml file to understand how this is prepared. 
# Step 13 Completed

In [3]:
# Step 14: Updating the params.yaml file. 
# Initially we will not have anything here. But we cannot keep this file empty as it will throw an error when we will try to execute.
# To prevent this we will just write below:
# key: val 
# as dummy place holders.
# Step 14: Completed

In [4]:
# Step 15: Updating Entity
# In this notebook find the present working directory using the below command
# %pwd
# It will give path till /research as the notebook is created in /research directory.
# If we need to move one position up then we can use the following command
# os.chdir("../")
# Now if you do %pwd then it will give path till /ETE.
# Entity is nothing but return type of a function. 

In [5]:
%pwd

'c:\\Users\\RadhikaMaheshwari\\Desktop\\Test\\DeepLearning\\ETE\\research'

In [6]:
import os

# Move from the notebook's folder up to the project root, so that relative
# paths such as config/config.yaml resolve from there.
# The guard keeps this cell safe to re-run: an unconditional chdir would climb
# one more directory level on every execution. It assumes the notebook lives
# in a folder named "research", as the %pwd output above shows.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [7]:
%pwd

'c:\\Users\\RadhikaMaheshwari\\Desktop\\Test\\DeepLearning\\ETE'

In [8]:
from dataclasses import dataclass
from pathlib import Path

# In the code below, we are defining the datatype of each key-value pair in the data_ingestion section of the config.yaml file. 
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    The fields mirror the keys of the ``data_ingestion`` section of
    config.yaml. Values read from YAML arrive as plain strings, so the three
    path fields are converted to ``pathlib.Path`` on construction to match
    their declared types. Values that are already ``Path`` objects are kept.

    Attributes:
        root_dir: Folder that holds the data-ingestion artifacts.
        source_URL: URL the zipped dataset is downloaded from.
        local_data_file: Local file the downloaded zip is saved as.
        unzip_dir: Folder the zip file is extracted into.
    """

    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

    def __post_init__(self):
        # The dataclass is frozen, so ordinary attribute assignment raises
        # FrozenInstanceError; object.__setattr__ bypasses that check and is
        # the standard way to normalise fields of a frozen dataclass.
        for field_name in ("root_dir", "local_data_file", "unzip_dir"):
            object.__setattr__(self, field_name, Path(getattr(self, field_name)))

In [9]:
# Step 15 Completed.

In [10]:
# Step 16: Update configuration manager in src config. 
# The configuration manager will return the file paths etc. from one place.

# For doing this, we will first need to get the file path. To get the file path, we will first go to __init__.py file in constants
# and then we will define the paths there so that our configuration manager can then fetch it. 
# Go to constants and checkout code. 

In [11]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [12]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage configuration objects."""

    def __init__(
        self,
        # The three default paths are names provided by the constants package
        # imported earlier, which is why they can be referenced directly here.
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH
    ):
        """Load the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML file.
            params_filepath: Path of the parameters YAML file.
            schema_filepath: Path of the schema YAML file.
        """
        logger.info(f"Config file path is {config_filepath}")

        # Each file is parsed with read_yaml (from utils.common) and stored on
        # the instance under the attribute name on the left, in this order.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        # config.yaml names the top-level artifacts folder under the
        # artifacts_root key. create_directories expects a list of paths.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the loaded config.yaml.

        Creates the data-ingestion root directory as a side effect.

        Returns:
            A DataIngestionConfig filled from the ``data_ingestion`` section.
        """
        # self.config was populated in the constructor; only its
        # data_ingestion section is needed here.
        ingestion_section = self.config.data_ingestion

        # The ingestion folder has to exist before anything is downloaded into it.
        create_directories([ingestion_section.root_dir])

        # Only the fields declared on DataIngestionConfig are passed on.
        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )


In [13]:
# step 16 complete

In [14]:
# Step 17. We need to update components. 
# now we need to create a data ingestion component. For this, we will again create a class

import os
# Using urllib.request we will be able to download the data from the url itself. 
import urllib.request as request
# with the help of zipfile package, we will be able to unzip the file. 
import zipfile
from mlProject import logger
# We are using this function to check the size of the downloaded file.
from mlProject.utils.common import get_size

In [19]:
# defining data ingestion class
class DataIngestion:
    """Downloads the zipped dataset and unpacks it, driven by a DataIngestionConfig."""

    def __init__(self, config:DataIngestionConfig):
        """Keep the ingestion settings produced by the configuration manager."""
        self.config = config

    def download_file(self):
        """Fetch the zip file from source_URL unless it is already on disk.

        When local_data_file already exists, nothing is downloaded and only
        the size of the existing file is logged.
        """
        target_file = self.config.local_data_file

        # Guard clause: skip the download when the file is already present.
        if os.path.exists(target_file):
            existing_size = get_size(Path(target_file))
            logger.info(f"File already exists of size: {existing_size}")
            return

        # urlretrieve returns the local file name and the response headers.
        filename, headers = request.urlretrieve(
            url=self.config.source_URL,
            filename=target_file,
        )
        logger.info(f"{filename} download! with the info \n {headers}")

    def extract_zip_file(self):
        """Unpack the downloaded zip file into unzip_dir.

        The target directory is created first if it does not exist yet.
        Returns None.
        """
        destination = self.config.unzip_dir
        os.makedirs(destination, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file, mode='r') as archive:
            archive.extractall(path=destination)

In [20]:
# Step 17 Complete.

In [21]:
# Step 18: Update Pipeline
# A pipeline is just a step that calls the methods in order, determining which method is called first and which afterwards.
# Like in above case, we first need to download the data and then we need to unzip it. 
# In pipeline, we mention the flow of function calls. 

In [22]:
# Run the data-ingestion stage end to end: build the configuration first, then
# download the zip file, then extract it. The order matters because each step
# relies on the result of the one before it.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception:
    # Record the failure together with its traceback, then re-raise the same
    # exception. A bare `raise` avoids the extra traceback frame that
    # `raise e` would add.
    logger.exception("Data ingestion stage failed")
    raise

[2024-05-21 21:19:34,493: INFO: 3393835054: Config file path is config\config.yaml]
[2024-05-21 21:19:34,496: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-05-21 21:19:34,498: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-21 21:19:34,503: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-05-21 21:19:34,505: INFO: common: Created Directory at: artifacts]
[2024-05-21 21:19:34,507: INFO: common: Created Directory at: artifacts/data_ingestion]
[2024-05-21 21:19:36,037: INFO: 530598795: artifacts/data_ingestion/data.zip download! with the info 
 Connection: close
Content-Length: 26148
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "2651a00c9b029e20e8a4b41aa35eb1eebca07987c66560680e0679f37f97af83"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Requ

In [23]:
# step 18 completed. Now you can see a folder called artifacts got created. In that you can see a folder called data ingestion. 
# data.zip has been downloaded. And our data is also unzipped. 

In [None]:
# Now we need to convert all of this into modular coding as currently we have done everything in jupyter notebook.
# We are going to just copy paste the info. 