In [9]:
import os
import re
import requests
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from box import ConfigBox
from pydantic import BaseModel
import matplotlib.pyplot as plt
from box.exceptions import BoxValueError

sns.set_style('darkgrid')
%matplotlib inline

In [2]:
# URL for downloading the data
# NOTE(review): the same URL and directory are also written to config.yaml; none of
# the visible cells read these two variables -- confirm whether they are still needed.
url = "https://archive.ics.uci.edu/static/public/2/adult.zip"

# Directory to store downloaded data
data_dir = "data"

In [3]:
# Name of the YAML config file that stores the download URL and the file paths
config_file = "config.yaml"

In [4]:
%%writefile {config_file}
data_ingestion:
    # Directory that holds all data
    data_dir: data
    # Address the raw zip archive is downloaded from
    url: https://archive.ics.uci.edu/static/public/2/adult.zip
    # Location where the downloaded zip archive is saved
    raw_zip_file: data/adult.zip
    # Directory the files are stored in after unzipping
    target_dir: data

Overwriting config.yaml


In [5]:
# File name of the helper module that the %%writefile {utilities} cell writes
utilities = "utils.py"

In [6]:
%%writefile {utilities}
# Read the yaml file to get paths
import os
import yaml
from pathlib import Path
from box import ConfigBox
from box.exceptions import BoxValueError

def read_yaml(path_to_yaml_file: Path)-> ConfigBox:
    """Load a YAML file and wrap its contents in a ConfigBox.

    Args:
        path_to_yaml_file (Path): path-like location of the YAML file

    Raises:
        ValueError: if the yaml file is empty
        OSError: if the file cannot be opened
        yaml.YAMLError: if the file is not valid YAML
        BoxValueError: if the parsed content cannot be turned into a ConfigBox

    Returns:
        ConfigBox: the parsed content with attribute-style access
    """
    # YAML is UTF-8 by convention; do not depend on the platform's default encoding.
    with open(path_to_yaml_file, 'r', encoding="utf-8") as file:
        data = yaml.safe_load(file)

    # safe_load returns None for an empty document. Check it here: the previous
    # `except BoxValueError` could never fire because ConfigBox() was built
    # outside of the try block.
    if data is None:
        raise ValueError("Yaml File is empty")

    config = ConfigBox(data)
    # Report success only once the content is known to be usable.
    print(f"yaml file: {path_to_yaml_file} loaded successfully")
    return config

# Create necessary directory(ies)
def create_directories(directory_names: list, verbose = True ):
    """Create every directory in ``directory_names``, including missing parents.

    Directories that already exist are left untouched.

    Args:
        directory_names (list): paths of the directories to create. A single
            str or path-like value is treated as one path instead of being
            iterated character by character.
        verbose (bool, optional): print one line per processed path.
            Defaults to True.
    """
    # A bare path is iterable too (a str yields its characters), which would
    # silently create one directory per character.
    if isinstance(directory_names, (str, os.PathLike)):
        directory_names = [directory_names]

    for path in directory_names:
        os.makedirs(path, exist_ok=True)
        if verbose:
            print(f"created directory at {path}")

Overwriting utils.py


In [7]:
from utils import *


class DataIngestionConfigValidator(BaseModel):
    """Typed view of the ``data_ingestion`` section of the YAML config.

    ``DataIngestion.validate_data`` builds an instance from that section's
    key/value pairs.
    """
    # Directory that holds the ingestion data; download_file creates it.
    data_dir: Path
    # Address the raw zip archive is downloaded from.
    url: str
    # Location where the downloaded zip archive is saved.
    raw_zip_file: Path
    # Directory the archive is extracted into by unzip_file.
    target_dir: Path

class DataIngestion:
    """Download and extract the dataset described by a YAML config file.

    Typical use: build the settings with ``validate_data`` and pass them to
    ``download_file`` and then to ``unzip_file``.
    """

    def __init__(self, config_path):
        """Store the location of the YAML config; nothing is read yet.

        Args:
            config_path: path of the YAML file with a ``data_ingestion`` section
        """
        self.config_path = config_path

    def validate_data(self):
        """Load the YAML config and validate its ``data_ingestion`` section.

        Returns:
            DataIngestionConfigValidator: the validated ingestion settings
        """
        configbox = read_yaml(Path(self.config_path))
        config = configbox.data_ingestion
        config = DataIngestionConfigValidator(**config)
        return config

    def download_file(self, config, timeout=30):
        """Download ``config.url`` to ``config.raw_zip_file`` unless it already exists.

        Args:
            config: object exposing ``data_dir``, ``url`` and ``raw_zip_file``
                (for example the result of ``validate_data``)
            timeout (float, optional): seconds to wait for the server before
                giving up. Defaults to 30.

        Raises:
            requests.HTTPError: if the server answers with an error status.
                No file is left on disk in that case.

        Returns None
        """
        create_directories([config.data_dir])

        zip_path = Path(config.raw_zip_file)
        if zip_path.exists():
            print(f"File already exists in {config.raw_zip_file}")
            return

        # raw_zip_file is configured independently of data_dir, so make sure
        # its own parent directory is there as well.
        zip_path.parent.mkdir(parents=True, exist_ok=True)

        response = requests.get(config.url, timeout=timeout)
        # Without this check an HTTP error page would be saved as the archive
        # and the "already exists" guard above would prevent any retry.
        response.raise_for_status()

        # Write next to the target and rename afterwards, so an interrupted
        # write never leaves a truncated archive under the final name.
        partial_path = zip_path.with_name(zip_path.name + ".part")
        try:
            with open(partial_path, 'wb') as file:
                file.write(response.content)
            os.replace(partial_path, zip_path)
        finally:
            if partial_path.exists():
                partial_path.unlink()

    def unzip_file(self, config):
        """Extract ``config.raw_zip_file`` into ``config.target_dir``.

        Args:
            config: object exposing ``raw_zip_file`` and ``target_dir``

        Returns None
        """
        with zipfile.ZipFile(config.raw_zip_file, 'r') as files:
            files.extractall(config.target_dir)

In [10]:
# Run the ingestion steps in order: load and validate the config,
# download the zip archive (skipped when it is already on disk), then extract it.
data = DataIngestion("config.yaml")
config = data.validate_data() 
data.download_file(config)
data.unzip_file(config)

yaml file: config.yaml loaded successfully
created directory at data
File already exists in data\adult.zip


In [4]:
# Module file that the %%writefile {ingestion_utils} cell writes the ingestion classes to
ingestion_utils = "data_ingestion/ingestion.py"

# %%writefile opens the target path directly and does not create missing parent
# folders, so make sure the directory exists before that cell runs.
os.makedirs(os.path.dirname(ingestion_utils), exist_ok=True)

In [6]:
%%writefile {ingestion_utils}
import os
import zipfile
from pathlib import Path

import requests
from pydantic import BaseModel

from utils import *
class DataIngestionConfigValidator(BaseModel):
    """Typed view of the ``data_ingestion`` section of the YAML config.

    ``DataIngestion.validate_data`` builds an instance from that section's
    key/value pairs.
    """
    # Directory that holds the ingestion data; download_file creates it.
    data_dir: Path
    # Address the raw zip archive is downloaded from.
    url: str
    # Location where the downloaded zip archive is saved.
    raw_zip_file: Path
    # Directory the archive is extracted into by unzip_file.
    target_dir: Path

class DataIngestion:
    """Download and extract the dataset described by a YAML config file.

    Typical use: build the settings with ``validate_data`` and pass them to
    ``download_file`` and then to ``unzip_file``.
    """

    def __init__(self, config_path):
        """Store the location of the YAML config; nothing is read yet.

        Args:
            config_path: path of the YAML file with a ``data_ingestion`` section
        """
        self.config_path = config_path

    def validate_data(self):
        """Load the YAML config and validate its ``data_ingestion`` section.

        Returns:
            DataIngestionConfigValidator: the validated ingestion settings
        """
        configbox = read_yaml(Path(self.config_path))
        config = configbox.data_ingestion
        config = DataIngestionConfigValidator(**config)
        return config

    def download_file(self, config, timeout=30):
        """Download ``config.url`` to ``config.raw_zip_file`` unless it already exists.

        Args:
            config: object exposing ``data_dir``, ``url`` and ``raw_zip_file``
                (for example the result of ``validate_data``)
            timeout (float, optional): seconds to wait for the server before
                giving up. Defaults to 30.

        Raises:
            requests.HTTPError: if the server answers with an error status.
                No file is left on disk in that case.

        Returns None
        """
        create_directories([config.data_dir])

        zip_path = Path(config.raw_zip_file)
        if zip_path.exists():
            print(f"File already exists in {config.raw_zip_file}")
            return

        # raw_zip_file is configured independently of data_dir, so make sure
        # its own parent directory is there as well.
        zip_path.parent.mkdir(parents=True, exist_ok=True)

        response = requests.get(config.url, timeout=timeout)
        # Without this check an HTTP error page would be saved as the archive
        # and the "already exists" guard above would prevent any retry.
        response.raise_for_status()

        # Write next to the target and rename afterwards, so an interrupted
        # write never leaves a truncated archive under the final name.
        partial_path = zip_path.with_name(zip_path.name + ".part")
        try:
            with open(partial_path, 'wb') as file:
                file.write(response.content)
            os.replace(partial_path, zip_path)
        finally:
            if partial_path.exists():
                partial_path.unlink()

    def unzip_file(self, config):
        """Extract ``config.raw_zip_file`` into ``config.target_dir``.

        Args:
            config: object exposing ``raw_zip_file`` and ``target_dir``

        Returns None
        """
        with zipfile.ZipFile(config.raw_zip_file, 'r') as files:
            files.extractall(config.target_dir)

Overwriting data_ingestion/ingestion.py
