In [1]:
!ls

data-ingestion-notebook-research.ipynb


In [2]:
import os

In [3]:
%pwd

'/Users/priyank/Desktop/Job-Resume/Interview-Preparation/MLOPs/DataScienceFullStack/research'

In [4]:
os.chdir("../")
%pwd

'/Users/priyank/Desktop/Job-Resume/Interview-Preparation/MLOPs/DataScienceFullStack'

In [8]:
# Input to our data ingestion pipeline
# Update in config_entity.py file
from dataclasses import dataclass
from pathlib import Path

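"""A dataclass suits classes that only hold data and define no methods of their own:
fields are declared as annotated attributes and __init__ is generated automatically,
so no manual `self.x = x` assignments are needed."""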

@dataclass
class DataIngestionConfig:
    """Settings consumed by the data ingestion stage.

    Plain value container: the generated ``__init__`` takes the four fields
    below in declaration order.
    """

    # Directory that holds the artifacts of the ingestion stage.
    root_dir: Path
    # Location the archive is downloaded from.
    source_URL: str
    # Path the downloaded archive is written to.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

In [12]:
# Loads the config, params and schema YAML files and builds the per-stage config objects from them.
# Keep this class in sync with configuration.py.

from src.datascienceFullStackProject.constants import * 
from src.datascienceFullStackProject.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects.

    The three YAML files are parsed once in ``__init__`` and kept on the
    instance as ``config``, ``params`` and ``schema``.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filpath=PARAMS_FILE_PATH,  # (sic) spelling kept: renaming would break keyword callers
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filpath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # NOTE(review): read_yaml is assumed to return an object with
        # attribute access to the YAML keys (it is used that way below).
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filpath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the DataIngestionConfig from the ``data_ingestion`` section.

        Also creates the stage's ``root_dir`` through ``create_directories``.

        Returns:
            DataIngestionConfig whose path fields are ``pathlib.Path`` objects,
            matching the types the dataclass declares.
        """
        config = self.config.data_ingestion
        create_directories([config.root_dir])

        # The YAML values are wrapped in Path so the instance matches the
        # field annotations; source_URL stays a plain string.
        data_ingestion_config = DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=config.source_URL,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )

        return data_ingestion_config

In [13]:
import os
import urllib.request as request
from src.datascienceFullStackProject import logger
import zipfile

## Data Ingestion Component
class DataIngestion:
    """Data ingestion component: fetches the source archive and unpacks it.

    All locations come from the config object passed to the constructor
    (``source_URL``, ``local_data_file`` and ``unzip_dir`` are read).
    """

    # The annotation is a forward reference so that defining this class does
    # not depend on the dataclass already existing in the kernel.
    def __init__(self, config: "DataIngestionConfig"):
        self.config = config

    # Download the zip file
    def download_file(self):
        """Download ``config.source_URL`` to ``config.local_data_file``.

        Nothing is downloaded when the target file already exists. Otherwise
        the data is first written to ``<local_data_file>.part`` and renamed
        only after the transfer finished, so an interrupted download can
        never be mistaken for a complete file on the next run.

        Returns:
            None

        Raises:
            Whatever ``urllib.request.urlretrieve`` or the file operations
            raise; the temporary ``.part`` file is removed in that case.
        """
        target_path = self.config.local_data_file
        if os.path.exists(target_path):
            logger.info("File already exists")
            return

        partial_path = f"{target_path}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=partial_path,
            )
            os.replace(partial_path, target_path)
        finally:
            # After a successful os.replace the .part file is gone; it only
            # remains (and must be cleaned up) when something failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        logger.info(f"{target_path} download! with following info: \n{headers}")

    # Extract the zip file
    def extract_zip_file(self):
        """Extract ``config.local_data_file`` into ``config.unzip_dir``.

        The target directory is created when it does not exist yet.

        Returns:
            None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)




In [14]:
# Data Ingestion Pipeline Run: 
# Errors are left to propagate as-is so the notebook shows the original
# traceback; nothing is handled at this level.
config=ConfigurationManager()
data_ingestion_config=config.get_data_ingestion_config()
data_ingestion=DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()

[2024-10-21 12:53:01,624: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-10-21 12:53:01,626: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-21 12:53:01,628: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-10-21 12:53:01,630: INFO: common: created directory at: artifacts]
[2024-10-21 12:53:01,632: INFO: common: created directory at: artifacts/data_ingestion]
[2024-10-21 12:53:02,209: INFO: 1733211362: artifacts/data_ingestion/data.zip download! with following info: 
Connection: close
Content-Length: 23329
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "c69888a4ae59bc5a893392785a938ccd4937981c06ba8a9d6a21aa52b4ab5b6e"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: EBE3:3C56CB:B30EB:C1F1C:671686ED
Accept-Ranges: bytes
Date: Mon, 21 O