In [1]:
import os


In [2]:
from datasets import load_dataset

%pwd

  from .autonotebook import tqdm as notebook_tqdm


'c:\\Edu\\Python\\MLOPS_projects\\mlops_nlp_summarizer\\research'

In [2]:
dataset = load_dataset('samsum')

In [9]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})


In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Edu\\Python\\MLOPS_projects\\mlops_nlp_summarizer'

In [5]:
#entity is returned type of the function
# we can create custom return type:


from dataclasses import dataclass
from pathlib import Path

# below looks similatly to the config file data ingestion
@dataclass(frozen=True)
class DataIngestionConfigUnzipLink:
    root_dir: Path
    source_URL_zipped: str
    local_data_file: Path
    unzip_dir: Path


@dataclass(frozen=True)
class DataIngestionConfigLink:
    root_dir: Path
    source_URL: str
    local_data_file: Path


@dataclass(frozen=True)
class DataIngestionConfigLibrary:
    root_dir: Path
    local_data_dir: Path
    library_dataset_name: str
    

In [6]:

#Now we need to read yaml file and we mentioned it in constants folder in constructor file 
    # we do it only once in the beginning of the project

from mlops_NLP_Text_Summarization.constants import *
from mlops_NLP_Text_Summarization.utils.common import read_yaml, create_directories


In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)



        create_directories([self.config.artifacts_root])

    def choose_type_of_data_ingestion(self):


        try:

            #for rulesingestion in self.config.rulesingestion:
            #    if rulesingestion['type'] == "link_zipped" and rulesingestion['source_URL_zipped'] is not None:
            #        return self.get_data_ingestion_config_unzip_link()
            #    if rulesingestion['type'] == "link" and rulesingestion['source_URL'] is not None:
            #        return self.get_data_ingestion_config_link()
            #    if rulesingestion['type'] == "library_huggingface_dataset" and rulesingestion['library_dataset_name'] is not None:
            #        return self.get_data_ingestion_config_library()


            if self.config.rulesingestion.data_ingestion_link_zipped.source_URL_zipped is not False:
                return self.get_data_ingestion_config_unzip_link()
            if self.config.rulesingestion.data_ingestion_link.source_URL is not False:
                return self.get_data_ingestion_config_link()
            if self.config.rulesingestion.data_ingestion_library_hugging_face_dataset.library_dataset_name is not False:
                return self.get_data_ingestion_config_library()
            else:
                raise ValueError("data ingestion type not supported")
        except Exception as e:
            print(e)
            raise Exception(e)
            



    def get_data_ingestion_config_unzip_link(self) -> DataIngestionConfigUnzipLink:
        config = self.config.rulesingestion.data_ingestion_link_zipped

        create_directories([config.root_dir])

        data_ingestion_unzip_link_config = DataIngestionConfigUnzipLink(
            root_dir=config.root_dir,
            source_URL_zipped=config.source_URL_zipped,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir 
        )

        return data_ingestion_unzip_link_config
    
    def get_data_ingestion_config_link(self) -> DataIngestionConfigLink:
        config = self.config.rulesingestion.data_ingestion_link

        create_directories([config.root_dir])

        data_ingestion_link_config = DataIngestionConfigLink(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            
        )

        return data_ingestion_link_config
    
    def get_data_ingestion_config_library(self) -> DataIngestionConfigLibrary:
        config = self.config.rulesingestion.data_ingestion_library_hugging_face_dataset

        create_directories([config.root_dir])

        data_ingestion_library_config = DataIngestionConfigLibrary(
            root_dir=config.root_dir,
            local_data_dir=config.local_data_dir,
            library_dataset_name=config.library_dataset_name,

            
        )

        return data_ingestion_library_config

In [8]:
import os
import urllib.request as request
import zipfile
import pandas as pd
import datasets
from mlops_NLP_Text_Summarization.logging import logger
from mlops_NLP_Text_Summarization.utils.common import get_size

In [11]:
class DataIngestionUnzippedLink:
    def __init__(self, config: DataIngestionConfigUnzipLink):
        self.config = config


    
    def download_file(self):
        if not os.path.exists(self.config.local_data_file):
            filename, headers = request.urlretrieve(
                url = self.config.source_URL_zipped,
                filename = self.config.local_data_file
            )
            logger.info(f"{filename} download! with following info: \n{headers}")
        else:
            logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")  


    
    def extract_zip_file(self):
        """
        zip_file_path: str
        Extracts the zip file into the data directory
        Function returns None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

class DataIngestionLink:
    def __init__(self, config: DataIngestionConfigLink):
        self.config = config


    
    def download_file(self):
        if not os.path.exists(self.config.local_data_file):
            filename, headers = request.urlretrieve(
                url = self.config.source_URL,
                filename = self.config.local_data_file
            )
            logger.info(f"{filename} download! with following info: \n{headers}")
        else:
            logger.info(f"File already exists of size: {get_size(Path(self.config.local_data_file))}")  


class DataIngestionLibrary:
    def __init__(self, config: DataIngestionConfigLibrary):
        self.config = config


    
    def get_data_from_library(self):
        if not os.path.exists(self.config.local_data_dir):
            directory =  os.makedirs(self.config.local_data_dir, exist_ok=True)
            directory_train = os.makedirs(self.config.local_data_dir + "/train", exist_ok=True)
            directory_test = os.makedirs(self.config.local_data_dir + "/test", exist_ok=True)
            directory_validation = os.makedirs(self.config.local_data_dir + "/validation", exist_ok=True)
            load_train = load_dataset("samsum", split ="train")
            df_load_train = pd.DataFrame(load_train)
            df_load_train.to_csv(self.config.local_data_dir +  "/train/train.csv")
            load_test = load_dataset("samsum", split ="test")
            df_load_test = pd.DataFrame(load_test)
            df_load_test.to_csv(self.config.local_data_dir + "/test/test.csv")
            load_validation = load_dataset("samsum", split ="validation")
            df_load_validation = pd.DataFrame(load_validation)
            df_load_validation.to_csv(self.config.local_data_dir + "/validation/validation.csv")
            
            #dataset_test = load_dataset("samsum", split ="test").to_csv(self.config.local_data_dir/"test.csv"),
            #dataset_validation = load_dataset("samsum", split ="validation").to_csv(self.config.local_data_dir/"validation.csv")
        
            logger.info(f" Directory in path {self.config.local_data_dir} was created from library! with following files: \n train.csv \n  test.csv \n validation.csv ")
        else:
            logger.info(f"Folder already exists of size: {get_size(Path(self.config.local_data_dir))}")  



In [12]:
try:
    config = ConfigurationManager()
    logger.info(f"ConfigManager initialized: {config}")
    data_ingestion_selection = config.choose_type_of_data_ingestion()
    print(data_ingestion_selection)
    if data_ingestion_selection == config.get_data_ingestion_config_library():
        print("Method returned")
        logger.info(f"Data ingestion type chosen: {config.get_data_ingestion_config_library()}")
        data_ingestion_config = config.get_data_ingestion_config_library()
        logger.info(f"Data ingestion type confirmed ")
        data_ingestion = DataIngestionLibrary(config=data_ingestion_config)
        logger.info(f"Data ingestion type configured in class DataIngestionLibrary:")
        data_ingestion.get_data_from_library()
        logger.info(f"Data ingestion performed")
    else:

        print("Method did not return")
    
    #data_ingestion = DataIngestion(config=data_ingestion_config)
    #data_ingestion.download_file()
    #data_ingestion.extract_zip_file()
except Exception as e:
    raise e

[2023-08-06 15:29:02,224: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-08-06 15:29:02,226: INFO: common: yaml file: params.yaml loaded successfully]
[2023-08-06 15:29:02,228: INFO: common: created directory at: artifacts]
[2023-08-06 15:29:02,229: INFO: 1532702591: ConfigManager initialized: <__main__.ConfigurationManager object at 0x000001D747EEB820>]
[2023-08-06 15:29:02,230: INFO: common: created directory at: artifacts/data_ingestion]
DataIngestionConfigLibrary(root_dir='artifacts/data_ingestion', local_data_dir='artifacts/data_ingestion/samsum', library_dataset_name='samsum')
[2023-08-06 15:29:02,231: INFO: common: created directory at: artifacts/data_ingestion]
Method returned
[2023-08-06 15:29:02,232: INFO: common: created directory at: artifacts/data_ingestion]
[2023-08-06 15:29:02,233: INFO: 1532702591: Data ingestion type chosen: DataIngestionConfigLibrary(root_dir='artifacts/data_ingestion', local_data_dir='artifacts/data_ingestion/samsum', library_

Using the latest cached version of the module from C:\Users\Admin\.cache\huggingface\modules\datasets_modules\datasets\samsum\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e (last modified on Fri Aug  4 23:41:02 2023) since it couldn't be found locally at samsum., or remotely on the Hugging Face Hub.


[2023-08-06 15:29:08,749: INFO: 1730958069:  Directory in path artifacts/data_ingestion/samsum was created from library! with following files: 
 train.csv 
  test.csv 
 validation.csv ]
[2023-08-06 15:29:08,754: INFO: 1532702591: Data ingestion performed]
