In [1]:
import torch
# Use the first CUDA GPU when PyTorch reports one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Bare last expression so the notebook displays which device was selected.
device

device(type='cuda', index=0)

In [2]:
import os
%pwd

'e:\\projects_final file\\Text  Summarization\\Text-Summarization-Project\\research'

In [3]:
# NOTE(review): machine-specific absolute path. The notebook only runs as-is on a
# machine that has exactly this folder layout; adjust it before running elsewhere.
research_directory = r"E:\projects_final file\Text  Summarization\Text-Summarization-Project\research"

In [4]:
# Make the research folder the kernel's working directory.
os.chdir(research_directory)

In [5]:
%pwd


'E:\\projects_final file\\Text  Summarization\\Text-Summarization-Project\\research'

Move the working directory up one level, to the project's main directory, so the rest of the notebook runs from there.

In [6]:
# Step up one level from the current working directory.
# NOTE(review): the path is relative, so re-running this cell climbs one more level each time.
os.chdir('../')

In [7]:
%pwd

'E:\\projects_final file\\Text  Summarization\\Text-Summarization-Project'

## 1. Update the config.yaml file


```yaml
artifacts_root: artifacts

# Data downloading and the paths used for saving it
data_ingestion:
  root_dir: artifacts/data_ingestion
  source_URL: https://github.com/ShalinVachheta017/Text-Summarization-Project/raw/main/summarizer-data.zip
  local_data_file: artifacts/data_ingestion/data.zip
  unzip_dir: artifacts/data_ingestion/
```


## 2. Update the params.yaml file (skipped for now)

## 3. Update entity:

In [9]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data-ingestion step.

    The field names match the keys of the ``data_ingestion`` section of the
    config file; instances are frozen, so fields cannot be reassigned after
    construction.
    """
    # Folder that holds the data-ingestion artifacts.
    root_dir: Path
    # URL the dataset archive is downloaded from.
    source_URL: str
    # Local path the downloaded archive is written to.
    local_data_file: Path
    # Folder the archive is extracted into.
    unzip_dir: Path

## 4. Update the configuration manager in src config

In [10]:
from textSummarizer.constants import * #to import necessary constants
from textSummarizer.utils.common import read_yaml, create_directories #to read yaml file and create directories

The ConfigurationManager class is responsible for reading configuration and parameter files, creating necessary directories, and providing configuration details for data ingestion. 

It initializes with file paths for configuration and parameters, reads these files, and sets up directories as specified in the configuration.

In [11]:
class ConfigurationManager:
    """Load the project's YAML settings and build typed config objects from them.

    On construction the config and params files are read with ``read_yaml`` and
    the ``artifacts_root`` folder named in the config is created through
    ``create_directories``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root folder exists.

        Args:
            config_filepath: Location of the main config file
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Location of the params file
                (defaults to ``PARAMS_FILE_PATH``).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` config section.

        The section's ``root_dir`` is created as a side effect.

        Returns:
            DataIngestionConfig: Settings for the download/extract step, with
            every filesystem location converted to ``Path``.
        """
        ingestion = self.config.data_ingestion

        create_directories([ingestion.root_dir])

        # The YAML values arrive as plain strings, while DataIngestionConfig
        # declares these three fields as Path (dataclasses do not enforce or
        # convert annotations), so convert them here to honour the declaration.
        return DataIngestionConfig(
            root_dir=Path(ingestion.root_dir),
            source_URL=ingestion.source_URL,
            local_data_file=Path(ingestion.local_data_file),
            unzip_dir=Path(ingestion.unzip_dir),
        )

## 5. Update the components

In [12]:
import os
import urllib.request as request
import zipfile
from textSummarizer.logging import logger
from textSummarizer.utils.common import get_size

In [13]:
class DataIngestion:
    """Download the dataset archive and unpack it, as described by a DataIngestionConfig."""

    def __init__(self, config: DataIngestionConfig):
        """Store the ingestion settings.

        Args:
            config: Source URL, local archive path and extraction folder.
        """
        self.config = config

    def download_file(self):
        """Download the archive from ``source_URL`` to ``local_data_file``.

        Nothing is downloaded when the local file already exists; its size is
        logged instead. If the download fails or is interrupted, the partially
        written file is removed before the error propagates, so a later run
        does not mistake it for a finished download.

        Returns:
            None

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises (for example
            ``URLError`` or ``ContentTooShortError``).
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(
                f"File already exists of size: {get_size(Path(target))}")
            return

        # urlretrieve opens the destination directly and cannot create missing
        # parent folders, so make sure the folder is there first.
        os.makedirs(os.path.dirname(target) or ".", exist_ok=True)

        try:
            filename, headers = request.urlretrieve(
                url=self.config.source_URL,
                filename=target
            )
        except BaseException:
            # BaseException so that a kernel interrupt is covered as well; the
            # error is always re-raised after removing the partial file.
            if os.path.exists(target):
                os.remove(target)
            raise

        logger.info(
            f"{filename} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract ``local_data_file`` into ``unzip_dir``.

        The extraction folder is created when it does not exist yet.

        Returns:
            None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [14]:
# Run the data-ingestion stage end to end: load the configuration, download the
# archive (skipped when it is already present) and extract it.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
    data_ingestion.extract_zip_file()
except Exception as e:
    # Record the failure with its traceback in the project log, then re-raise
    # the same exception unchanged so the cell still fails visibly.
    logger.exception(e)
    raise

[2024-07-13 19:20:01,635: INFO: common: YAML file: config\config.yaml loaded successfully]
[2024-07-13 19:20:01,639: INFO: common: YAML file: params.yaml loaded successfully]
[2024-07-13 19:20:01,641: INFO: common: Created directory at: artifacts]
[2024-07-13 19:20:01,643: INFO: common: Created directory at: artifacts/data_ingestion]
[2024-07-13 19:20:01,644: INFO: 2243453559: File already exists of size: ~ 7718 KB]
