# Trial for data ingestion

In [1]:
import os

In [2]:
# Show the kernel's current working directory before it is changed.
print(os.getcwd())

c:\Users\SoogeunPark\Desktop\December_2023\text_summarizer_cicd\Text-Summarizer\research


In [3]:
# Move from the `research` folder up to its parent directory so the relative
# paths used later (e.g. config/config.yaml) resolve. The guard keeps this cell
# idempotent: re-running it does not climb another directory level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
print(os.getcwd())

c:\Users\SoogeunPark\Desktop\December_2023\text_summarizer_cicd\Text-Summarizer


Defining the entity:

This is an entity that grabs the information in ```config.yaml```:

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    The field names mirror the keys of the ``data_ingestion`` section of
    the config.yaml in the config directory.

    Attributes:
        root_dir: directory created for this stage's files.
        source_URL: address the archive is downloaded from.
        local_data_file: location the downloaded archive is written to.
        unzip_dir: directory the archive is extracted into.
    """

    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [7]:
from TextSummarizer.constants import *
# the asterisk imports every public name of the constants module, including CONFIG_FILE_PATH and PARAMS_FILE_PATH used below

In [8]:
from TextSummarizer.utils.common import read_yaml, create_directories

Now I create a class called ```ConfigurationManager```:

In [9]:
class ConfigurationManager:
    """
    Loads the project's YAML files and builds the per-stage configuration objects.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """
        Read both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: YAML file whose ``artifacts_roots`` and
                ``data_ingestion`` entries are read by this class.
            params_filepath: YAML file loaded into ``self.params``
                (not used further inside this class).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # `artifacts_roots` is a key of config.yaml. Dot access works because
        # read_yaml is expected to return a ConfigBox.
        create_directories([self.config.artifacts_roots])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data-ingestion settings from the ``data_ingestion`` section.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            DataIngestionConfig whose path fields are ``Path`` objects, as the
            dataclass declares them.
        """
        section = self.config.data_ingestion

        # make sure the stage directory given in config.yaml exists
        create_directories([section.root_dir])

        # The raw config values are wrapped in Path so the instance matches
        # the field types declared on DataIngestionConfig.
        return DataIngestionConfig(
            root_dir = Path(section.root_dir),
            source_URL = section.source_URL,
            local_data_file = Path(section.local_data_file),
            unzip_dir = Path(section.unzip_dir)
        )



Now that I have the ```ConfigurationManager```, any change to the paths or the source URL can just be made in ```config/config.yaml```, instead of editing the code here.

Now let's create the ```components```.

In [10]:
import os
import urllib.request as request
import zipfile
from TextSummarizer.logging import logger
from TextSummarizer.utils.common import get_size

Creating a class called ```DataIngestion```, that takes the configuration made above:

In [18]:
class DataIngestion:
    """
    Downloads the data archive and extracts it, driven by a DataIngestionConfig.
    """

    def __init__(self, config: DataIngestionConfig):
        """
        Store the configuration used by the download and extraction steps.

        Args:
            config: supplies source_URL, local_data_file and unzip_dir.
        """
        self.config = config

    def download_file(self):
        """
        Download ``source_URL`` to ``local_data_file`` unless it is already there.

        The download is skipped when the file exists and get_size does not
        report it as '~ 0 KB'. Data is first written to a sibling ``.part``
        file and only moved into place once the transfer has finished, so an
        interrupted download is not mistaken for a complete file on the next
        run.

        Raises:
            Any exception raised by urlretrieve or os.replace; a leftover
            ``.part`` file is removed before the exception propagates.
        """
        target = self.config.local_data_file

        # nothing to do when a non-empty file is already present
        if os.path.exists(target) and get_size(Path(target)) != '~ 0 KB':
            logger.info(f"File already present at {target}. File size = {get_size(Path(target))}")
            return

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = partial
            )
            # atomic within one directory; overwrites an existing empty file
            os.replace(partial, target)
        finally:
            # after a successful replace the .part file no longer exists
            if os.path.exists(partial):
                os.remove(partial)

        logger.info(f"Downloaded file from {self.config.source_URL} \n to {target}. \nInfo: \n{headers}")

    def extract_zip_file(self):
        """
        Extract ``local_data_file`` into ``unzip_dir``.

        The target directory (defined in config.yaml) is created if it does
        not exist yet.
        """
        unzip_path = self.config.unzip_dir

        os.makedirs(unzip_path, exist_ok = True)

        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)
        

### Now let's create the pipeline

In [21]:
# Run the data-ingestion stage end to end: load the configuration, make sure
# the archive is downloaded, then extract it. Exceptions are left to propagate
# unchanged so the notebook shows the original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config = data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()

[2023-12-29 16:29:18,786: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-29 16:29:18,789: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-29 16:29:18,791: INFO: common: created directory at: artifacts]
[2023-12-29 16:29:18,794: INFO: common: created directory at: artifacts/data_ingestion]
[2023-12-29 16:29:18,796: INFO: 2891326780: File already present at artifacts/data_ingestion/data.zip. File size = ~ 3754 KB]


Now let us convert this notebook into modular programming!