In [1]:
%pwd

'e:\\Additional Projects\\ML Projects for Resume\\text-summarization-english-end-to-end-project\\research'

In [2]:
import os

# Step up from research/ to the project root so the relative paths used by the
# later cells (the YAML config files, artifacts/) resolve.
# The guard keeps this cell idempotent: without it, every re-run climbs one
# more directory. Assumes the notebook lives in a folder named "research",
# as the %pwd output shows; adjust the name if the notebook is moved.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'e:\\Additional Projects\\ML Projects for Resume\\text-summarization-english-end-to-end-project'

In [4]:
from dataclasses import dataclass
from pathlib import Path

In [5]:
#entity
@dataclass
class DataIngestionConfig:
    """Settings for the data-ingestion stage.

    Attributes:
        root_dir: Working directory of the stage.
        source_url: URL the archive is downloaded from.
        local_data_file: Path the downloaded archive is saved to.
        unzip_dir: Directory the archive is extracted into.
    """

    root_dir: Path
    source_url: str
    local_data_file: Path
    unzip_dir: Path

In [16]:
from textSummarizer.constant import *
from textSummarizer.utils.common import *


#ConfigurationManager
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects.

    Args:
        config_file_path: Location of the main configuration YAML
            (defaults to ``CONFIG_FILE_PATH``).
        params_file_path: Location of the parameters YAML
            (defaults to ``PARAMS_FILE_PATH``).
    """

    def __init__(self,
                 config_file_path = CONFIG_FILE_PATH,
                 params_file_path = PARAMS_FILE_PATH
                 ):
        # read_yaml_file is expected to return an object whose keys are
        # reachable as attributes (see the attribute access below).
        self.config = read_yaml_file(config_file_path)
        self.params = read_yaml_file(params_file_path)

        # Create the artifacts root up front.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the stage's root directory as a side effect.

        Returns:
            A ``DataIngestionConfig`` whose path fields are ``pathlib.Path``
            objects, matching the types declared on the dataclass.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        # The YAML values arrive as plain strings; wrap the filesystem ones so
        # the entity really carries the Path type it declares.
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_url=section.source_url,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )
        

In [17]:
# Build the manager with the default config/params paths; this loads both YAML
# files and creates the artifacts root directory.
cg = ConfigurationManager()

[2024-04-27 11:00:54,690 INFO root common 17 - yaml file- config\config.yaml is loaded successfully]
[2024-04-27 11:00:54,699 INFO root common 17 - yaml file- params.yaml is loaded successfully]


Directory created: artifacts


In [22]:
# Resolve the data-ingestion settings; this also creates the stage's root directory.
cfg = cg.get_data_ingestion_config()

Directory created: artifacts/data_ingestion


In [19]:
#Component
import urllib.request as request
import zipfile

class DataIngestion:
    """Downloads the dataset archive and unpacks it on local disk.

    Args:
        config: Object exposing ``source_url``, ``local_data_file`` and
            ``unzip_dir`` (normally a ``DataIngestionConfig``).
    """

    def __init__(self, config: "DataIngestionConfig") -> None:
        # `config` is a required argument. It used to default to the
        # DataIngestionConfig class itself, which let DataIngestion() succeed
        # and then fail later with an AttributeError on first use.
        self.config = config

    def download_file(self):
        """Download ``source_url`` to ``local_data_file`` unless it already exists.

        The payload is written to a sibling ``.part`` file and moved into place
        only after the transfer completes, so an interrupted download is never
        mistaken for a finished one on the next run.

        Raises:
            urllib.error.URLError: If the source cannot be retrieved.
            OSError: If the target location cannot be written.
        """
        target = self.config.local_data_file
        if os.path.exists(target):
            logging.info("File already exist")
            return

        # Make sure the destination folder is there; skip when the target has
        # no directory component (os.makedirs("") would raise).
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        partial = f"{target}.part"
        try:
            _, headers = request.urlretrieve(
                url=self.config.source_url,
                filename=partial,
            )
            os.replace(partial, target)
        finally:
            # After a successful move the partial file is gone; after a failure
            # this removes whatever was written so far.
            if os.path.exists(partial):
                os.remove(partial)
        logging.info(f"{target} downloaded with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        The target directory is created when missing.

        Raises:
            FileNotFoundError: If the archive has not been downloaded.
            zipfile.BadZipFile: If the file is not a valid zip archive.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_file:
            zip_file.extractall(unzip_path)

In [23]:
# Fetch the archive described by cfg; the download is skipped when the target
# file already exists.
data_ingestion = DataIngestion(cfg)
data_ingestion.download_file()

[2024-04-27 11:23:35,573 INFO root 1140836670 15 - artifacts/data_ingestion/data.zip downloaded with following info: 
Connection: close
Content-Length: 7903594
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "dbc016a060da18070593b83afff580c9b300f0b6ea4147a7988433e04df246ca"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 3498:20ACEC:167AFD:1E0BA9:662C886B
Accept-Ranges: bytes
Date: Sat, 27 Apr 2024 05:23:21 GMT
Via: 1.1 varnish
X-Served-By: cache-hyd1100033-HYD
X-Cache: HIT
X-Cache-Hits: 0
X-Timer: S1714195401.946589,VS0,VE259
Vary: Authorization,Accept-Encoding,Origin
Access-Control-Allow-Origin: *
Cross-Origin-Resource-Policy: cross-origin
X-Fastly-Request-ID: 96980a85207a37da641a48208fe9df1ecdd6f88a
Expires: Sat, 27 Apr 2024 05:28:21 GMT
Source-Age: 0

]


In [24]:
# Unpack the downloaded archive into the configured unzip directory.
data_ingestion.extract_zip_file()

# Pipeline

In [28]:
class DataIngestionPipeline:
    """Runs the data-ingestion stage end to end: configure, download, extract."""

    def main(self):
        """Execute the stage.

        Builds the configuration, downloads the archive if it is not already
        present, and extracts it. Any exception raised by a step propagates to
        the caller unchanged; there is no handler here because re-raising via
        ``raise e`` would add nothing except an extra traceback frame.
        """
        config = ConfigurationManager()
        data_ingestion_config = config.get_data_ingestion_config()
        data_ingestion = DataIngestion(data_ingestion_config)
        data_ingestion.download_file()
        data_ingestion.extract_zip_file()

In [29]:
# Run the whole ingestion stage in one call.
# NOTE(review): this rebinds `data_ingestion`, which an earlier cell bound to a
# DataIngestion component, to a pipeline object — consider a distinct name.
data_ingestion = DataIngestionPipeline()
data_ingestion.main()

[2024-04-27 16:31:29,783 INFO root common 17 - yaml file- config\config.yaml is loaded successfully]
[2024-04-27 16:31:29,783 INFO root common 17 - yaml file- params.yaml is loaded successfully]


Directory created: artifacts
Directory created: artifacts/data_ingestion


[2024-04-27 16:31:46,839 INFO root 1140836670 15 - artifacts/data_ingestion/data.zip downloaded with following info: 
Connection: close
Content-Length: 7903594
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: application/zip
ETag: "dbc016a060da18070593b83afff580c9b300f0b6ea4147a7988433e04df246ca"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Options: deny
X-XSS-Protection: 1; mode=block
X-GitHub-Request-Id: 56E6:215821:193AA3:21C435:662CD403
Accept-Ranges: bytes
Date: Sat, 27 Apr 2024 10:31:32 GMT
Via: 1.1 varnish
X-Served-By: cache-hyd1100025-HYD
X-Cache: MISS
X-Cache-Hits: 0
X-Timer: S1714213892.202262,VS0,VE714
Vary: Authorization,Accept-Encoding,Origin
Access-Control-Allow-Origin: *
Cross-Origin-Resource-Policy: cross-origin
X-Fastly-Request-ID: 5c69c5acb8062d6dae4652c055785d996fea2b54
Expires: Sat, 27 Apr 2024 10:36:32 GMT
Source-Age: 0

]
