In [1]:
import os

In [2]:
%pwd

'c:\\Users\\15512\\end-to-end-ml-project\\research'

In [3]:
# Step up from research/ to the project root so that relative paths such as
# config/config.yaml resolve. Not idempotent: every re-run of this cell moves
# the working directory up one more level, so run it once per kernel session.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\15512\\end-to-end-ml-project'

In [5]:
from dataclasses import dataclass
from pathlib import Path


# These fields mirror the keys listed under `data_ingestion` in config.yaml.
# The dataclass below declares the expected type of each of them.
@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only bundle of settings for the data-ingestion stage.

    Because the dataclass is frozen, attributes cannot be assigned after the
    instance has been constructed; attempting to do so raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Working directory of the stage (artifacts/data_ingestion in config.yaml).
    root_dir: Path
    # Address the archive is downloaded from.
    source_URL: str
    # Location the downloaded archive is written to and later read from.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

In [15]:
from src.mlProject.constants import *
from src.mlProject.utils.common import *


In [19]:
CONFIG_FILE_PATH = Path('config/config.yaml')


class ConfigurationManager:
    """Load the project's YAML settings and build per-component config objects.

    This class is used to configure the values for data_ingestion,
    data_validation and the other pipeline components. Constructing it reads
    the config, params and schema YAML files and creates the directory named
    by ``artifacts_root`` in the config file.
    """

    def __init__(self, config_filepath=None, params_filepath=None, schema_filepath=None):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML. Defaults to
                ``CONFIG_FILE_PATH``.
            params_filepath: Path of the params YAML. Defaults to
                ``PARAMS_FILE_PATH``.
            schema_filepath: Path of the schema YAML. Defaults to
                ``SCHEMA_FILE_PATH``.
        """
        # Defaults are looked up here rather than in the signature so that the
        # module-level constants are read at call time, exactly as before.
        if config_filepath is None:
            config_filepath = CONFIG_FILE_PATH
        if params_filepath is None:
            params_filepath = PARAMS_FILE_PATH
        if schema_filepath is None:
            schema_filepath = SCHEMA_FILE_PATH

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose path fields are ``Path`` objects,
            matching the types the dataclass declares.
        """
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        # Wrap the path-like entries in Path so the values agree with the
        # field annotations on DataIngestionConfig.
        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=config.source_URL,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )


In [20]:
import os
import urllib.request as request
import zipfile
from mlProject import logger
from mlProject.utils.common import *


In [21]:
class DataIngestion:
    """Download the source archive and unpack it, driven by a DataIngestionConfig."""

    # The annotation is quoted so the class can be defined even when
    # DataIngestionConfig is not yet in scope.
    def __init__(self, config: "DataIngestionConfig"):
        """Store the ingestion settings.

        Args:
            config: Object exposing ``source_URL``, ``local_data_file`` and
                ``unzip_dir``.
        """
        self.config = config

    def download_file(self):
        """Fetch ``config.source_URL`` into ``config.local_data_file``.

        The download is skipped when the target file already exists; in that
        case only its size is logged.
        """
        if not os.path.exists(self.config.local_data_file):
            filename, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=self.config.local_data_file,
            )
            logger.info(f"{filename} downloaded with following info:\n{headers}")
        else:
            # The path lives on the config object; the instance itself has no
            # local_data_file attribute.
            logger.info(f"file already exists of size:{get_size(Path(self.config.local_data_file))}")

    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created first if it does not exist.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)






In [22]:
# Run the ingestion stage end to end: load the configuration, download the
# archive, then unpack it.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    # Record the failure with its traceback in the project log, then re-raise
    # the same exception untouched so the cell still fails visibly.
    logger.exception(e)
    raise


[2024-09-11 21:26:31,906: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-11 21:26:31,910: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-11 21:26:31,914: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-09-11 21:26:31,918: INFO: common: created directory at: artifacts]
artifacts
[2024-09-11 21:26:31,921: INFO: common: created directory at: artifacts/data_ingestion]
[2024-09-11 21:26:33,592: INFO: 1224032372: artifacts/data_ingestion/data.zip downloaded with following info:
Connection: close
Content-Length: 25174
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "c6efd35d99da4ee25f6ac533e273be03f3a23d1ef6c4708f1dbc19b71cc8ba53"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 421C:2222EF:5EF97E:68DAA3:66E24349
Accept-Ranges: bytes
Dat