# Large-Scale Data Engineering for AI
## The Data Engineering Pipeline
### The Landing Zone

In [8]:
# Imports and requirements
import os
import datetime
import kaggle
import shutil
import pandas as pd

In [5]:
# Root data folder; the Landing Zone is created directly beneath it
BASE_DIR = "./data"
LANDING_ZONE_DIR = os.path.join(BASE_DIR, "landing_zone")
os.makedirs(LANDING_ZONE_DIR, exist_ok=True)

# Authenticate the Kaggle API client before any download is attempted
kaggle.api.authenticate()

# Kaggle datasets to ingest. For each entry:
#   kaggle_id    - "<owner>/<slug>" identifier passed to the Kaggle API
#   dataset_name - base name of the Parquet file written to the Landing Zone
#   update       - whether the dataset is re-downloaded when the pipeline runs
#                  in update mode (entries set to False are skipped in that mode)
datasets = [
    dict(
        kaggle_id="asaniczka/top-spotify-songs-in-73-countries-daily-updated",
        dataset_name="top-spotify-songs-by-country",
        update=True,
    ),
    dict(
        kaggle_id="maharshipandya/-spotify-tracks-dataset",
        dataset_name="spotify-tracks-dataset",
        update=False,
    ),
    dict(
        kaggle_id="terminate9298/songs-lyrics",
        dataset_name="songs-lyrics",
        update=False,
    ),
]



In [9]:
# Minimal logger: prefixes every message with the current local time
def log(message):
    """
    Prints a message preceded by a "[YYYY-MM-DD HH:MM:SS]" timestamp.

    Parameters:
    message: The text (or any printable object) to write to standard output.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {message}")

def data_collector_kaggle(kaggle_dataset: dict) -> None:
    """
    Downloads a dataset from Kaggle and saves it to the landing zone.

    The dataset is downloaded and unzipped into a temporary folder inside the
    landing zone, every CSV file in it (except the explicitly skipped ones) is
    converted to Parquet at "<landing zone>/<dataset_name>.parquet", and the
    temporary folder is removed afterwards, whether or not the ingestion
    succeeded.

    Errors raised while downloading or converting are logged and swallowed:
    the function returns normally in that case.

    Parameters:
    kaggle_dataset (dict): A dictionary containing the Kaggle dataset information.
        Must provide the keys "kaggle_id" and "dataset_name".

    Raises:
    KeyError: If "kaggle_id" or "dataset_name" is missing from kaggle_dataset.
    """

    # Extract dataset information
    kaggle_id = kaggle_dataset["kaggle_id"]
    dataset_name = kaggle_dataset["dataset_name"]

    # CSV files that are downloaded with a dataset but must not be ingested
    skipped_files = {"songs_details.csv", "album_details.csv"}

    # Create a temporary directory for the dataset using the actual dataset name
    dataset_folder = os.path.join(LANDING_ZONE_DIR, f"temp_{dataset_name}")
    os.makedirs(dataset_folder, exist_ok=True)

    try:
        log(f"Downloading dataset: {kaggle_id}")

        kaggle.api.dataset_download_files(
            kaggle_id,
            path=dataset_folder,
            unzip=True
        )

        # Sorted so that the processing order (and therefore which file is
        # written last) does not depend on the file system's listing order.
        csv_files = sorted(
            filename
            for filename in os.listdir(dataset_folder)
            if filename.endswith(".csv") and filename not in skipped_files
        )

        if not csv_files:
            log("No CSV file found in the downloaded dataset. Check the contents of the download.")
        elif len(csv_files) > 1:
            # All CSVs share one output path, so make the overwrite visible.
            log(
                f"Multiple CSV files found ({', '.join(csv_files)}); they are all written to "
                f"the same Parquet file, so only the last one is kept."
            )

        final_path = os.path.join(LANDING_ZONE_DIR, f"{dataset_name}.parquet")
        for filename in csv_files:
            # Read CSV with Pandas and write it as a single Parquet file
            df = pd.read_csv(os.path.join(dataset_folder, filename))
            df.to_parquet(final_path, index=False)

            log(f"CSV '{filename}' converted to single Parquet file and saved as '{final_path}'.")

    except Exception as e:
        # Log the error; failures are reported through the log, not re-raised
        log(f"Error downloading dataset '{kaggle_id}': {e}")
        return

    finally:
        # Always remove the temporary folder (success, failure or interruption)
        # so a later run never ingests stale files left in it. A cleanup failure
        # is ignored so it cannot mask the outcome of the ingestion itself.
        shutil.rmtree(dataset_folder, ignore_errors=True)

    # Log the successful download
    log(f"Dataset '{dataset_name}' downloaded successfully.")

def download_and_store_datasets(update: bool = False) -> None:
    """
    Downloads and stores datasets from Kaggle into the landing zone.

    Iterates over the module-level `datasets` list and ingests each entry with
    data_collector_kaggle. A failure on one dataset is logged and does not stop
    the remaining datasets from being processed.

    Parameters:
    update (bool): When True, only the datasets whose "update" flag is truthy
        are downloaded; the others are skipped. When False (the default), every
        dataset is downloaded.
    """
    log("Starting the creation of the Landing Zone using Kaggle API")

    for kaggle_dataset in datasets:
        # Resolve the name before the try block so that the error message below
        # always refers to the current dataset, even when the entry is malformed.
        dataset_name = kaggle_dataset.get("dataset_name", "<unknown>")

        # An entry without an "update" flag is treated as not updatable.
        if update and not kaggle_dataset.get("update", False):
            log(f"Skipping dataset '{dataset_name}' as update is set to False.")
            continue

        try:
            data_collector_kaggle(kaggle_dataset)
        except Exception as e:
            log(f"Error processing dataset '{dataset_name}': {e}")
        else:
            # NOTE: data_collector_kaggle logs download/conversion errors itself
            # and returns normally, so this only means it did not raise.
            log(f"Dataset '{dataset_name}' processed successfully.")

    log("All datasets have been processed.")
    log("Landing Zone creation completed.")

# Full ingestion: with update=False no dataset is skipped, so every entry is downloaded
download_and_store_datasets(update=False)

[2025-04-11 18:49:26] Starting the creation of the Landing Zone using Kaggle API
[2025-04-11 18:49:26] Downloading dataset: asaniczka/top-spotify-songs-in-73-countries-daily-updated
Dataset URL: https://www.kaggle.com/datasets/asaniczka/top-spotify-songs-in-73-countries-daily-updated
[2025-04-11 18:49:45] CSV 'universal_top_spotify_songs.csv' converted to single Parquet file and saved as './data/landing_zone/top-spotify-songs-by-country.parquet'.
[2025-04-11 18:49:46] Dataset 'top-spotify-songs-by-country' downloaded successfully.
[2025-04-11 18:49:46] Dataset 'top-spotify-songs-by-country' processed successfully.
[2025-04-11 18:49:46] Downloading dataset: maharshipandya/-spotify-tracks-dataset
Dataset URL: https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset
[2025-04-11 18:49:48] CSV 'dataset.csv' converted to single Parquet file and saved as './data/landing_zone/spotify-tracks-dataset.parquet'.
[2025-04-11 18:49:48] Dataset 'spotify-tracks-dataset' downloaded successfully.

### The Trusted Zone

### The Exploitation Zone

## The Data Analysis Pipelines