In [1]:
import os

In [2]:
%pwd

'c:\\Users\\etrou\\OneDrive\\Desktop\\SE489GroupProjectGit\\group_project_se489\\notebooks'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\etrou\\OneDrive\\Desktop\\SE489GroupProjectGit\\group_project_se489'

In [5]:
import cProfile
import pstats
import logging.handlers
import queue
import threading
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio
import asyncio
import os
import zipfile
import gdown
from se489_group_project import logger
from se489_group_project.utility.common import get_size

In [6]:
# # Create a thread-safe queue for log messages
# log_queue = queue.Queue(-1)  # No size limit

# # Create a QueueHandler to send log messages to the queue
# queue_handler = logging.handlers.QueueHandler(log_queue)

# # Set up the root logger to use the QueueHandler
# logger = logging.getLogger()
# logger.setLevel(logging.DEBUG)
# logger.addHandler(queue_handler)

# # Create a handler for console output (or file output)
# console_handler = logging.StreamHandler()
# console_handler.setLevel(logging.DEBUG)
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# console_handler.setFormatter(formatter)

# # Create a QueueListener to process log messages from the queue
# listener = logging.handlers.QueueListener(log_queue, console_handler)

# # Start the listener thread
# listener.start()
# #made slower

In [7]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class GettingDataConfig:
    root_dir: Path
    source_URL: str
    local_data_file: Path
    unzip_dir: Path

In [8]:
from se489_group_project.constants import *
from se489_group_project.utility.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.data_storage])

    def get_data_ingestion_config(self) -> GettingDataConfig:
        config = self.config.data_ingestion
        self.executor = ThreadPoolExecutor(max_workers=4)  # Adjust the number of workers as needed


        create_directories([config.root_dir])

        data_ingestion_config = GettingDataConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir 
        )

        return data_ingestion_config

In [10]:
class DataIngestion:
    def __init__(self, config: GettingDataConfig):
        self.config = config

    
    async def download_file(self)-> str:
        '''
        Fetch data from the url
        '''

        try: 
            dataset_url = self.config.source_URL
            zip_download_dir = self.config.local_data_file
            os.makedirs("data/raw", exist_ok=True)
            logger.info(f"Downloading data from {dataset_url} into file {zip_download_dir}")

            file_id = dataset_url.split("/")[-2]
            prefix = 'https://drive.google.com/uc?/export=download&id='
            gdown.download(prefix+file_id,zip_download_dir)

            logger.info(f"Downloaded data from {dataset_url} into file {zip_download_dir}")

        except Exception as e:
            raise e
        
    

    async def extract_zip_file(self):
        """
        zip_file_path: str
        Extracts the zip file into the data directory
        Function returns None
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

In [11]:
async def analyze(file):
    p = pstats.Stats(file)
    # Print top 10 functions sorted by cumulative time
    print("\nTop 10 functions sorted by cumulative time:")
    p.sort_stats('cumtime').print_stats(10)
    
    # Print top 10 functions sorted by total time
    print("\nTop 10 functions sorted by total time:")
    p.sort_stats('tottime').print_stats(10)
    

In [12]:
import subprocess

#added async to main function
async def main():
    
    try:
        log_dir = os.path.join(os.getcwd(), "se489_group_project", "visualizations")
        file = os.path.join(log_dir, 'cprofile_stats_data_ingestion.prof')
        config = ConfigurationManager()
        data_ingestion_config = config.get_data_ingestion_config()
        data_ingestion = DataIngestion(config=data_ingestion_config)
        # await data_ingestion.download_file()
        # await data_ingestion.extract_zip_file()
        profiler = cProfile.Profile()
        profiler.enable()
            
        #Profile the download_file function
        logger.info("Profiling download_file()")
        await data_ingestion.download_file()

        # Profile the extract_zip_file function
        logger.info("Profiling extract_zip_file()")
        await data_ingestion.extract_zip_file()
        profiler.disable()
        profiler.dump_stats(file)

        profile_file_full_path = os.path.abspath(file)
        await analyze(profile_file_full_path)
        #Automatically open snakeviz to visualize the profiling results
        try:
            subprocess.Popen(["snakeviz", profile_file_full_path])
        except FileNotFoundError:
            print("snakeviz is not installed or not found in the system path.")
    except Exception as e:
        logger.error(f"Error: {e}") # Log the error
        raise e
    # finally:
    #     # Stop the listener thread
    #     listener.stop()
if __name__ == "__main__":
    nest_asyncio.apply()
    asyncio.run(main())
    #main()

[2024-06-12 06:54:39,940: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-12 06:54:39,943: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-12 06:54:39,944: INFO: common: created directory at: data]
[2024-06-12 06:54:39,947: INFO: common: created directory at: data/raw]
[2024-06-12 06:54:39,947: INFO: 4008795092: Profiling download_file()]
[2024-06-12 06:54:39,949: INFO: 2982213194: Downloading data from https://drive.google.com/file/d/1gesLApompvvnzz-AWyWM4ikmk7BOGWAp/view?usp=sharing into file data/raw/data.zip]


Downloading...
From (original): https://drive.google.com/uc?/export=download&id=1gesLApompvvnzz-AWyWM4ikmk7BOGWAp
From (redirected): https://drive.google.com/uc?%2Fexport=download&id=1gesLApompvvnzz-AWyWM4ikmk7BOGWAp&confirm=t&uuid=7ecd47d8-6e93-4e32-abcb-439e1fc8fe3b
To: c:\Users\etrou\OneDrive\Desktop\SE489GroupProjectGit\group_project_se489\data\raw\data.zip
100%|██████████| 57.7M/57.7M [00:06<00:00, 8.44MB/s]

[2024-06-12 06:54:47,605: INFO: 2982213194: Downloaded data from https://drive.google.com/file/d/1gesLApompvvnzz-AWyWM4ikmk7BOGWAp/view?usp=sharing into file data/raw/data.zip]
[2024-06-12 06:54:47,607: INFO: 4008795092: Profiling extract_zip_file()]






Top 10 functions sorted by cumulative time:
Wed Jun 12 06:54:48 2024    c:\Users\etrou\OneDrive\Desktop\SE489GroupProjectGit\group_project_se489\se489_group_project\visualizations\cprofile_stats_data_ingestion.prof

         163978 function calls (163771 primitive calls) in 8.163 seconds

   Ordered by: cumulative time
   List reduced from 1101 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    7.659    7.659 C:\Users\etrou\AppData\Local\Temp\ipykernel_13188\2982213194.py:6(download_file)
        1    0.004    0.004    7.656    7.656 c:\Users\etrou\anaconda3\envs\tmp_env_WO\lib\site-packages\gdown\download.py:113(download)
     4477    0.013    0.000    7.319    0.002 c:\Users\etrou\anaconda3\envs\tmp_env_WO\lib\socket.py:655(readinto)
     4477    0.013    0.000    7.297    0.002 c:\Users\etrou\anaconda3\envs\tmp_env_WO\lib\ssl.py:1263(recv_into)
     4477    0.007    0.000    7.282    0.002 c:\Users\e