In [1]:
from collections import namedtuple
import os
os.chdir("../")

In [2]:
DataIngestionConfig = namedtuple("DataIngestionConfig",[
    "root_dir",
    "source_URL",
    "local_data_file",
    "unzip_dir"
])

In [3]:
from src.forest.constant import CONFIG_FILE_PATH
from src.forest.utils.main_utils import MainUtils
from src.forest.logger import logging
import shutil,os
import pandas as pd
from zipfile import ZipFile

In [4]:
class ConfigurationManager:
    def __init__(self,config_filepath = CONFIG_FILE_PATH):
        self.config = MainUtils.read_yaml_file(MainUtils, filename=config_filepath)

    def get_data_ingestion_config(self) -> DataIngestionConfig:

        config = self.config['data_ingestion']

        MainUtils.create_directories([config["root_dir"]])

        data_ingestion_config = DataIngestionConfig(
            root_dir = config["root_dir"],
            source_URL = config["source_URL"],
            local_data_file = config["local_data_file"],
            unzip_dir = config["unzip_dir"]
        )

        return data_ingestion_config

In [5]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self):
        print(self.config)
        if not os.path.exists(self.config.local_data_file):
            logging.info("Download started...")
            shutil.copy(self.config.source_URL, self.config.root_dir)
        else:
            logging.info(f"File already exists")
    
    def extract_zipfile(self):
        file = ZipFile(self.config.local_data_file)
        if not os.path.exists(self.config.unzip_dir):
            file.extractall(path=self.config.unzip_dir)
        else:
            logging.info(f"File already exists of size")

    def get_dataframe(self):
        target_filepath = os.path.join(self.config.unzip_dir+'/covtype.csv')
        df = pd.read_csv(target_filepath)
        return df

    


In [6]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion( config= data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zipfile()
    data_ingestion.get_dataframe()
except Exception as e:
    raise e

DataIngestionConfig(root_dir='artifacts/data_ingestion', source_URL='covtype.zip', local_data_file='artifacts/data_ingestion/covtype.zip', unzip_dir='artifacts/data_ingestion')
artifacts/data_ingestion/covtype.csv
(581012, 55)
