# Setup

In [5]:
import os
import logging
import json
from dataclasses import dataclass
from dotenv import load_dotenv
import requests
from typing import Dict, Optional, Tuple, List, Union
import polars as pl
import pandas as pd
from pathlib import Path
from collections import defaultdict
from datetime import datetime, timedelta, timezone

In [6]:
# Create the main data folder.
os.makedirs('./data', exist_ok=True)

# Create the sub-folders below the SAME root. The previous version joined them
# onto '../data', i.e. the parent directory, while every other path used in
# this file ('./data/raw_data', './data/latest', ...) is relative to './data'.
for sub_dir in ['raw_data', 'transformed_data', 'gaps_data', 'logs']:
    os.makedirs(os.path.join('./data', sub_dir), exist_ok=True)

In [7]:
def setup_logging():
    """Configure logging to a file and the console and return the 'pipeline' logger.

    The directory of the log file is created first: ``logging.FileHandler``
    opens its file on construction and raises ``FileNotFoundError`` when the
    directory is missing.

    Returns:
        The logger named ``'pipeline'``.
    """
    log_file = "./logs/pipeline.log"
    os.makedirs(os.path.dirname(log_file), exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger('pipeline')

# Classes

In [8]:
@dataclass
class Config:
    """Settings for one pipeline run: API endpoint, storage folders and tuning values."""

    # Root URL of the API
    base_url: str = "https://api.hoppe-sts.com/"

    # Folder for the raw payloads
    raw_path: str = "./data/raw_data"

    # Folder for the transformed data
    transformed_path: str = "./data/transformed_data"

    # Folder for the detected null-value gaps
    gaps_path: str = "./data/gaps_data"

    batch_size: int = 1000

    # Raised worker count for better parallelisation
    max_workers: int = 8

    # Data is kept for 90 days
    days_to_keep: int = 90

    # Load the last 5 days as history
    history_days: int = 5

In [9]:
class API_Client:
    """Small HTTP client that sends GET requests with an ``ApiKey`` authorization header."""

    def __init__(self, base_url: str, api_key: str, timeout: float = 30.0):
        """Store the connection settings.

        Args:
            base_url: Prefix that every relative URL is appended to.
            api_key: Key sent in the ``Authorization: ApiKey <key>`` header.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` can block indefinitely.
        """
        self.base_url = base_url
        self.api_key = api_key
        self.timeout = timeout
        self.logger = logging.getLogger('API Client')

    def get_data(self, relative_url):
        """GET ``base_url + relative_url`` and decode the JSON body.

        Non-2xx responses are NOT raised: their JSON body is still returned so
        callers can inspect error payloads.

        Returns:
            ``(response, payload)``. ``payload`` is ``None`` when the request
            failed or the body is not valid JSON; ``response`` is ``None`` when
            no response was received at all.
        """
        request_url = f"{self.base_url}{relative_url}"
        headers = {"Authorization": f"ApiKey {self.api_key}"}

        try:
            response = requests.get(request_url, headers=headers, timeout=self.timeout)
        except requests.exceptions.SSLError as e:
            self.logger.error(f"SSL-Zertifikatsfehler: {str(e)}")
            return None, None
        except requests.exceptions.Timeout as e:
            self.logger.error(f"Timeout bei API-Anfrage: {str(e)}")
            return None, None
        except requests.exceptions.ConnectionError as e:
            self.logger.error(f"Verbindungsfehler: {str(e)}")
            return None, None
        except requests.exceptions.RequestException as e:
            self.logger.error(f"API request failed: {str(e)}")
            # RequestException always has a ``response`` attribute (possibly None).
            return getattr(e, 'response', None), None

        # Only report success for 2xx/3xx answers; other statuses are logged as warnings.
        if response.ok:
            self.logger.info(f"Request for {relative_url} successful")
        else:
            self.logger.warning(f"Request for {relative_url} returned HTTP {response.status_code}")

        try:
            payload = response.json()
        except ValueError as e:
            # Covers json.JSONDecodeError and requests' own JSONDecodeError.
            self.logger.error(f"Response for {relative_url} is not valid JSON: {str(e)}")
            return response, None

        return response, payload

In [10]:
class Data_Storage:
    """Reads and writes JSON/parquet files and locates stored timeseries files on disk."""

    def __init__(self, config):
        self.config = config
        self.logger = logging.getLogger('Data Storage')

    def write_file(self, data: Union[List, Dict, pl.DataFrame], filename: str, path: str, postfix: str) -> None:
        """Write ``data`` to ``<path>/<filename>.<postfix>``.

        Args:
            data: JSON-serialisable object for ``'json'``; a polars DataFrame,
                list or dict for ``'parquet'`` (lists/dicts are converted).
            filename: File name without extension.
            path: Target directory; created if missing.
            postfix: ``'json'`` or ``'parquet'``.

        Raises:
            ValueError: Unsupported ``postfix`` or unsupported data type for parquet.
            Exception: Any I/O or conversion error is logged and re-raised.
        """
        os.makedirs(path, exist_ok=True)
        full_path = f"{path}/{filename}.{postfix}"

        try:
            if postfix == 'json':
                with open(full_path, 'w', encoding='utf-8') as f:
                    json.dump(data, f)
                self.logger.info(f"Writting to {filename}.json file successfully")

            elif postfix == 'parquet':
                # Accept plain Python containers and convert them first.
                if not isinstance(data, pl.DataFrame):
                    if isinstance(data, (list, dict)):
                        data = pl.DataFrame(data)
                    else:
                        raise ValueError("Data must be DataFrame, List, or Dict for parquet format")

                data.write_parquet(full_path, compression="snappy")
                self.logger.info(f"Writting to {filename}.parquet file successfully")

            else:
                raise ValueError(f"Unsupported format: {postfix}")

            self.logger.info(f"Data saved to {full_path}")
        except Exception as e:
            self.logger.error(f"Failed to write file {full_path}: {str(e)}")
            raise

    def read_file(self, filename: str, path: str, postfix: str) -> pl.DataFrame:
        """Read ``<path>/<filename>.<postfix>`` into a polars DataFrame.

        Returns:
            The file content, or an empty DataFrame when the file does not exist.

        Raises:
            ValueError: Unsupported ``postfix``.
            Exception: Any read/parse error is logged and re-raised.
        """
        full_path = f"{path}/{filename}.{postfix}"

        try:
            if not os.path.exists(full_path):
                self.logger.info(f"File {full_path} does not exist")
                return pl.DataFrame()  # missing file -> empty DataFrame

            if postfix == 'json':
                with open(full_path, 'r', encoding='utf-8') as f:
                    data = pl.DataFrame(json.load(f))
                self.logger.info(f"Reading from {filename}.json file successfully")
                return data

            if postfix == 'parquet':
                data = pl.read_parquet(full_path)
                self.logger.info(f"Reading from {filename}.parquet file successfully")
                return data

            raise ValueError(f"Unsupported format: {postfix}")

        except Exception as e:
            self.logger.error(f"Failed to read file {full_path}: {str(e)}")
            raise

    @staticmethod
    def _subdirs_newest_first(directory: Path) -> List[Path]:
        """Return the sub-directories of ``directory`` sorted by name, descending."""
        return sorted(
            (entry for entry in directory.iterdir() if entry.is_dir()),
            key=lambda entry: entry.name,
            reverse=True,
        )

    def _iter_day_dirs(self, base_dir: Path):
        """Yield ``(year_dir, month_dir, day_dir)`` for the year/month/day tree, newest first."""
        for year_dir in self._subdirs_newest_first(base_dir):
            for month_dir in self._subdirs_newest_first(year_dir):
                for day_dir in self._subdirs_newest_first(month_dir):
                    yield year_dir, month_dir, day_dir

    def find_timeseries_files(self, base_path: str, max_days: int = None, pattern:str = "Timeseries_*.parquet") -> defaultdict:
        """Collect timeseries files below ``base_path/<year>/<month>/<day>/...`` grouped by IMO.

        Day folders are visited newest first (by folder name). ``max_days``
        limits the TOTAL number of day folders scanned; the counter is not
        reset per month, so the limit also holds across month boundaries.

        Args:
            base_path: Root directory containing the year folders.
            max_days: Maximum number of day folders to scan; ``None`` scans all.
            pattern: Glob pattern searched recursively inside every day folder.
                The IMO is taken from the second ``_``-separated part of the
                file stem, e.g. ``Timeseries_<imo>.parquet``.

        Returns:
            ``defaultdict(list)`` mapping IMO to the matching file paths; empty
            when ``base_path`` is not a directory.

        Raises:
            Exception: Errors while scanning are logged and re-raised.
        """
        base_dir = Path(base_path)
        if not base_dir.is_dir():
            self.logger.error(f"Directory {base_path} does not exist or is no directory")
            return defaultdict(list)

        files_by_imo = defaultdict(list)
        files_found = 0
        days_found = 0
        months_seen = set()
        years_seen = set()

        try:
            for year_dir, month_dir, day_dir in self._iter_day_dirs(base_dir):
                # Stop as soon as the global day limit is reached.
                if max_days is not None and days_found >= max_days:
                    break

                for file in day_dir.rglob(pattern):
                    imo = file.stem.split("_")[1]
                    files_by_imo[imo].append(file)
                    files_found += 1

                days_found += 1
                months_seen.add(month_dir)
                years_seen.add(year_dir)

        except Exception as e:
            self.logger.error(f"Failed to get historical Data: {str(e)}")
            raise

        imos_found = len(files_by_imo)
        months_found = len(months_seen)
        years_found = len(years_seen)
        self.logger.info(f"{files_found} files found: {imos_found} different ships, {days_found} days, {months_found} months, {years_found} years")
        return files_by_imo

    def find_timeseries_summaries(self, base_path: str,  pattern:str = "*.parquet") -> list:
        """Return all files below ``base_path`` (recursively) that match ``pattern``.

        The pattern can restrict the date, e.g. ``2025*.parquet`` matches all
        files whose name starts with 2025.

        Returns:
            List of matching paths; an empty list when ``base_path`` is not a
            directory, so the return type is always a list.

        Raises:
            Exception: Errors while scanning are logged and re-raised.
        """
        base_dir = Path(base_path)
        if not base_dir.is_dir():
            self.logger.error(f"Directory {base_path} does not exist or is no directory")
            return []

        try:
            files = list(base_dir.rglob(pattern))
        except Exception as e:
            self.logger.error(f"Failed to get historical Data: {str(e)}")
            raise

        files_found = len(files)
        self.logger.info(f"{files_found} summary files found")

        return files
        


In [11]:
class Data_Processor:
    """Stateless polars transformations for the API payloads.

    All methods are static; none of them logs or touches the filesystem.
    """

    @staticmethod
    def get_imo_numbers(data: List[dict]) -> List[str]:
        """Return the ``'imo'`` of every ship whose ``'active'`` flag is truthy or absent."""
        return [ship['imo'] for ship in data if ship.get('active', True)]

    @staticmethod
    def transform_shipdata(shipdata: pl.DataFrame, run_timestamp: str) -> Tuple[pl.DataFrame, Dict[str, pl.DataFrame]]:
        """Split ship data into a flat table and one table per list-typed column.

        The ``data`` struct column is unnested first. Every list column is
        exploded (and unnested when it holds structs) into its own table that
        carries ``imo`` and ``loaddate``.

        Returns:
            ``(flat_shipdata, tables)`` where ``tables`` maps the original
            column name to its extracted table.
        """
        shipdata = shipdata.unnest("data")
        schema = shipdata.collect_schema()

        # Extract the nested tables.
        tables = {}
        for column, dtype in schema.items():
            if dtype == pl.List(pl.Struct):
                tables[column] = (
                    shipdata.select("imo", column)
                    .explode(column)
                    .unnest(column)
                    .with_columns(pl.lit(run_timestamp).alias("loaddate"))
                )
            elif dtype == pl.List:
                tables[column] = (
                    shipdata.select("imo", column)
                    .explode(column)
                    .with_columns(pl.lit(run_timestamp).alias("loaddate"))
                )

        # Keep only the non-nested ship columns.
        list_columns = [col for col, dtype in schema.items() if dtype == pl.List]
        shipdata = shipdata.select(pl.exclude(list_columns)).with_columns(
            pl.lit(run_timestamp).alias("loaddate")
        )

        return shipdata, tables

    @staticmethod
    def transform_signals(signals: pl.DataFrame, run_timestamp: str) -> pl.DataFrame:
        """Reshape the signals payload into one row per (imo, signal).

        An empty input is returned unchanged. Remaining struct columns are
        unnested, all-null columns are cast to String, and ``loaddate`` is added.
        """
        if len(signals) == 0:
            return signals

        # Initial transformation.
        signals = (
            signals.unnest("signals")
            .unpivot(index="imo", variable_name="signal")
            .unnest("value")
        )

        # Flatten remaining struct columns.
        for column, dtype in signals.collect_schema().items():
            if dtype == pl.Struct:
                signals = signals.unnest(column)

        # Give all-null columns a concrete dtype.
        for column, dtype in signals.collect_schema().items():
            if dtype == pl.Null:
                signals = signals.with_columns(pl.col(column).cast(pl.String))

        # Add the load date.
        return signals.with_columns(pl.lit(run_timestamp).alias("loaddate"))

    @staticmethod
    def transform_timeseries(timeseries: pl.DataFrame, imo: str, run_timestamp: str) -> Tuple[pl.DataFrame, pl.DataFrame]:
        """Reshape a timeseries payload into long format and split off null values.

        Returns:
            ``(data, gaps)``: ``data`` holds the rows with a non-null
            ``signal_value``; ``gaps`` holds the null rows with columns
            ``imo, signal, signal_timestamp, loaddate, gap_start``. For an
            empty input the input itself and an empty DataFrame are returned.
        """
        if len(timeseries) == 0:
            return timeseries, pl.DataFrame()

        # Initial transformation: one row per (signal, signal_timestamp).
        # NOTE(review): after ``unnest("value")`` the column names presumably are
        # the timestamps, so ``signal_timestamp`` holds strings - confirm.
        transformed = (
            timeseries.drop("timestamp")
            .unpivot(variable_name="signal")
            .unnest("value")
            .unpivot(
                index="signal",
                variable_name="signal_timestamp",
                value_name="signal_value",
            )
            .with_columns(
                pl.lit(imo).alias("imo"),
                pl.lit(run_timestamp).alias("loaddate")
            )
        )

        # Identify the gaps (null values).
        gaps = (
            transformed
            .filter(pl.col("signal_value").is_null())
            .select(["imo", "signal", "signal_timestamp", "loaddate"])
            .with_columns(pl.col("signal_timestamp").alias("gap_start"))
        )

        # Remove the null values from the main data set.
        data = transformed.filter(pl.col("signal_value").is_not_null())

        return data, gaps

    @staticmethod
    def process_gaps(gaps_df: pl.DataFrame, max_gap_seconds: int = 5 * 60) -> pl.DataFrame:
        """Merge consecutive gap rows per (imo, signal) into gap intervals.

        Rows whose ``gap_start`` values are at most ``max_gap_seconds`` apart
        are merged into one interval ``[gap_start, gap_end]``.

        Rows are iterated with ``iter_rows(named=True)``; iterating a polars
        DataFrame directly yields its columns, not its rows.

        NOTE(review): the subtraction below needs ``gap_start`` to hold
        datetime values. ``transform_timeseries`` derives it from column
        names, so it may need to be parsed before calling this - confirm.

        Returns:
            DataFrame with columns ``imo, signal, gap_start, gap_end,
            loaddate``; an empty DataFrame for an empty input.
        """
        if len(gaps_df) == 0:
            return pl.DataFrame()

        result = []

        for (imo, signal), group in gaps_df.group_by(["imo", "signal"]):
            rows = group.sort("gap_start").iter_rows(named=True)

            first = next(rows)
            current_start = first["gap_start"]
            prev_time = current_start
            last_loaddate = first["loaddate"]

            for row in rows:  # starts at the second entry
                curr_time = row["gap_start"]

                # A distance above the threshold closes the current interval.
                if (curr_time - prev_time).total_seconds() > max_gap_seconds:
                    result.append({
                        "imo": imo,
                        "signal": signal,
                        "gap_start": current_start,
                        "gap_end": prev_time,
                        "loaddate": row["loaddate"]
                    })
                    current_start = curr_time  # start a new interval

                prev_time = curr_time
                last_loaddate = row["loaddate"]

            # Add the last (still open) interval.
            result.append({
                "imo": imo,
                "signal": signal,
                "gap_start": current_start,
                "gap_end": prev_time,
                "loaddate": last_loaddate
            })

        return pl.DataFrame(result)

    @staticmethod
    def enrich_timeseries_with_friendly_names(timeseries_df: pl.DataFrame, signals_df: pl.DataFrame) -> pl.DataFrame:
        """Left-join ``friendly_name`` onto the timeseries rows via ``signal``.

        When either input is empty, ``timeseries_df`` is returned unchanged
        (in that case no ``friendly_name`` column is added).
        """
        if len(timeseries_df) == 0 or len(signals_df) == 0:
            return timeseries_df

        # Build the mapping signal -> friendly_name.
        signal_mapping = (
            signals_df
            .filter(pl.col("friendly_name").is_not_null())
            .select(["signal", "friendly_name"])
            .unique()
        )

        return timeseries_df.join(signal_mapping, on="signal", how="left")

    @staticmethod
    def update_daily_timeseries_summary(hist_df: pl.DataFrame, daily_df: pl.DataFrame, current_df: pl.DataFrame) -> pl.DataFrame:
        """Build the daily summary from historical, daily and freshly loaded rows.

        The frames are stacked in the order hist, daily, current. Per
        measurement only the first occurrence is kept, then only rows tagged
        ``"new"`` or ``"today"`` remain. Rows already present in ``hist_df``
        are therefore dropped.

        Changes to the previous version:
        * The tag filter used ``== "new" | ... == "today"``; ``|`` binds
          tighter than ``==``, so this was not a valid filter. ``is_in`` is
          used instead.
        * Deduplication referenced a ``timestamp`` column, while the frames
          produced in this file carry ``signal_timestamp``. ``signal`` is part
          of the key so signals without a friendly name are not collapsed.
        * ``diagonal_relaxed`` concatenation matches columns by name and
          coerces dtypes, which is needed for the untyped empty placeholder
          frames.
        """
        combined_df = pl.concat([hist_df, daily_df, current_df], how="diagonal_relaxed")

        summary_df = (
            combined_df
            .unique(
                subset=["imo", "signal", "signal_timestamp", "friendly_name"],
                keep="first",
                maintain_order=True,
            )
            .filter(pl.col("tag").is_in(["new", "today"]))
        )

        return summary_df


Alle neuen Einträge des Tages werden in einer Datei im Verzeichnis data/daily_summary/<JahrMonatTag>.parquet gespeichert.

Abläufe Pipeline:
IMO Liste und Signals Liste anlegen?
ShipData
    - max. einmal täglich
    1. API
    2. JSON speichern
    3. IMO Nummern extrahieren und unter data/latest/imos speichern
    4. Daten extrahieren
    5. PARQUET speichern

Signals 
    - max. einmal täglich
    1. API ShipData
    2. IMO Nummern laden
    3. Pro Schiff (parallelisiert)
        1. API Signals
        2. JSON speichern
        3. Daten extrahieren
        4. PARQUET speichern
        5. update der signal_mapping datei in data/latest (enthält imo, signal_id & friendly_name)


Timeseries 
    - mehrfach täglich (ca. 1x/h)
    1. API ShipData
    2. IMO Nummern laden
    3. Signal_mapping laden
    4. aktuelle Daily Summary in daily_df laden & mit dem tag today versehen
    5. Wenn noch keine Daily Summary Datei vorhanden, also daily_df leer ist, dann ref_data Datei aktualisieren, sodass nur die letzten x Tage darin vorkommen (ältesten Tag raus & letzten Tag rein)
    6. Reference Data in hist_df laden & mit dem tag old versehen 
    7. current_df anlegen
    8. Pro Schiff (parallelisiert)
        1. API Timeseries (to_date = run_timestamp einfügen damit keine Abweichungen durch Abfragezeit)
        2. JSON speichern
        3. Daten extrahieren
        4. Friendly_name anfügen (aus signal_mapping)
        5. PARQUET speichern
        6. einem gesamt Dataframe current_df hinzufügen
    9. Daily Summary erstellen / ergänzen
        - hist_df & daily_df & current_df zusammenfügen
        - nur behalten, was unique ist, zuerst im df steht & den Tag new oder today hat
        - im summary_df speichern
    10. Daily summary (summary_df) als PARQUET speichern
        - wenn noch keine datei zum aktuellen Tag, dann neu anlegen, sonst nur überschreiben


In [12]:
class Pipeline:
    """Downloads ship data, signals and timeseries, transforms them and stores the results."""

    # Columns of the combined timeseries frames (without the bookkeeping "tag" column).
    _TIMESERIES_COLUMNS = ("signal", "signal_timestamp", "signal_value", "imo", "loaddate", "friendly_name")
    _LATEST_PATH = "./data/latest"
    _DAILY_SUMMARY_PATH = "./data/daily_summary"
    # Format of run_timestamp; it doubles as folder path and as "loaddate" value.
    _TIMESTAMP_FORMAT = '%Y/%m/%d/%H/%M'

    def __init__(self, config: Config, api_key: str, verify_ssl: bool = True):
        self.config = config
        self.api_client = API_Client(config.base_url, api_key)
        self.processor = Data_Processor()
        self.storage = Data_Storage(config)
        self.logger = logging.getLogger('Pipeline')
        # NOTE(review): verify_ssl is only stored; nothing in this class passes it on.
        self.verify_ssl = verify_ssl

    # ------------------------------------------------------------------ helpers

    @classmethod
    def _empty_timeseries_frame(cls, with_tag: bool = False) -> pl.DataFrame:
        """Return an empty frame with the timeseries columns (plus ``tag`` if requested)."""
        columns = list(cls._TIMESERIES_COLUMNS)
        if with_tag:
            columns.append("tag")
        return pl.DataFrame({name: [] for name in columns})

    @staticmethod
    def _is_error_payload(payload) -> bool:
        """Return True for a payload that consists only of a ``detail`` key."""
        return isinstance(payload, dict) and list(payload) == ['detail']

    def _load_imo_numbers(self) -> List[str]:
        """Read the stored IMO list; returns an empty list when none is stored yet."""
        imo_df = self.storage.read_file('imos', self._LATEST_PATH, 'parquet')
        if imo_df.is_empty():
            self.logger.warning("No IMO numbers stored yet")
            return []
        return imo_df.to_series(0).to_list()

    def _load_reference_frames(self, run_start, summary_filename: str):
        """Load historical and daily rows and tag them ``"old"`` / ``"today"``.

        On the first run of a day (no daily summary yet) yesterday's summary
        is appended to the reference data, rows older than
        ``config.history_days`` are dropped and the reference file is rewritten.

        Returns:
            ``(hist_df, daily_df)``.
        """
        daily_df = self.storage.read_file(summary_filename, self._DAILY_SUMMARY_PATH, "parquet")
        hist_df = self.storage.read_file("ref_data", self._LATEST_PATH, "parquet")

        # is_empty is a METHOD; without the call it is always truthy, which
        # previously discarded all loaded history and daily data.
        if hist_df.is_empty():
            hist_df = self._empty_timeseries_frame(with_tag=True)
        hist_df = hist_df.with_columns(pl.lit("old").alias("tag"))

        if daily_df.is_empty():  # means it is a new day
            daily_df = self._empty_timeseries_frame(with_tag=True)

            last_day = (run_start - timedelta(days=1)).strftime('%Y%m%d')
            last_day_df = self.storage.read_file(last_day, self._DAILY_SUMMARY_PATH, "parquet")

            if not last_day_df.is_empty():
                last_day_df = last_day_df.with_columns(pl.lit("old").alias("tag"))
                # diagonal_relaxed matches columns by name and coerces the
                # untyped columns of an empty placeholder frame.
                hist_df = pl.concat([hist_df, last_day_df], how="diagonal_relaxed").unique(maintain_order=True)

            if not hist_df.is_empty():
                # loaddate is a string in _TIMESTAMP_FORMAT (zero-padded), so a
                # string comparison against a cutoff in the same format orders
                # chronologically. Comparing it with a datetime does not work.
                cutoff = (run_start - timedelta(days=self.config.history_days)).strftime(self._TIMESTAMP_FORMAT)
                hist_df = hist_df.filter(pl.col("loaddate") > cutoff)

            self.storage.write_file(hist_df, "ref_data", self._LATEST_PATH, "parquet")
        else:
            daily_df = daily_df.with_columns(pl.lit("today").alias("tag"))

        return hist_df, daily_df

    # ---------------------------------------------------------------- ship data

    def process_shipdata(self, run_timestamp: str):
        """Fetch the fleet list, store it raw and transformed, and update the IMO list.

        Errors are logged and not propagated. The payload is validated BEFORE
        the IMO numbers are extracted, so an empty or error response no longer
        fails inside ``get_imo_numbers``.
        """
        try:
            self.logger.info("Processing ship data")

            response, shipdata = self.api_client.get_data("fleet")

            if not shipdata:
                self.logger.error(f"No ship data received - {response}")
                return

            # Store raw ship data
            self.storage.write_file(
                shipdata,
                'ShipData',
                f"{self.config.raw_path}/{run_timestamp}",
                'json'
            )

            if self._is_error_payload(shipdata):
                self.logger.info("Request Error: see json file for details")
                return

            # Get & store IMO numbers
            imo_numbers = self.processor.get_imo_numbers(shipdata)
            self.logger.info(f"Found {len(imo_numbers)} active ships")
            if imo_numbers:
                self.storage.write_file(imo_numbers, 'imos', self._LATEST_PATH, 'parquet')
            else:
                # Keep the previously stored list instead of overwriting it with nothing.
                self.logger.warning("No active ships in response; stored IMO list left unchanged")

            # Transform and store ship data
            ships_df = pl.DataFrame(shipdata)
            ships_transformed, tables = self.processor.transform_shipdata(ships_df, run_timestamp)

            self.storage.write_file(
                ships_transformed,
                'ShipData',
                f"{self.config.transformed_path}/{run_timestamp}",
                'parquet'
            )

            # Store the nested tables
            for name, table in tables.items():
                self.storage.write_file(
                    table,
                    f"ShipData_{name}",
                    f"{self.config.transformed_path}/{run_timestamp}",
                    'parquet'
                )

        except Exception as e:
            self.logger.error(f"Failed to process ship data: {str(e)}")

    # ------------------------------------------------------------------ signals

    def _process_signals_for_ship(self, run_timestamp: str, imo, current_signals_df):
        """Fetch, store and transform the signals of one ship.

        Returns:
            The signal mapping including this ship's signals, or
            ``current_signals_df`` unchanged when no usable data was received.
        """
        self.logger.info(f"Processing signals data for {imo}")

        response, signals = self.api_client.get_data(f"fleet/{imo}/signals")

        if not signals:
            # Previously a ValueError was constructed here but never raised.
            self.logger.error(f"No signals data received - {response}")
            return current_signals_df

        # Store raw signals data
        self.storage.write_file(
            signals,
            f'Signals_{imo}',
            f"{self.config.raw_path}/{run_timestamp}",
            'json'
        )

        if self._is_error_payload(signals):
            self.logger.info("Request Error: see json file for details")
            return current_signals_df

        # Transform and store signals data
        signals_transformed = self.processor.transform_signals(pl.DataFrame(signals), run_timestamp)
        self.storage.write_file(
            signals_transformed,
            f'Signals_{imo}',
            f"{self.config.transformed_path}/{run_timestamp}",
            'parquet'
        )
        self.logger.info(f"Signals data processed for {imo}")

        # Update the signal mapping
        new_signal_mapping = signals_transformed.select(["imo", "signal", "friendly_name"]).unique()

        # read_file returns an EMPTY frame (never None) for a missing file.
        if current_signals_df is None or current_signals_df.is_empty():
            updated_signals = new_signal_mapping
            message = "Signal Mapping updated for the first time"
        else:
            updated_signals = pl.concat(
                [current_signals_df, new_signal_mapping], how="vertical_relaxed"
            ).unique(subset=["imo", "signal", "friendly_name"], keep="first")
            message = "Signal Mapping updated"

        self.storage.write_file(updated_signals, 'signal_mapping', self._LATEST_PATH, 'parquet')
        self.logger.info(message)
        return updated_signals

    def process_signals(self, run_timestamp: str, imo_numbers:List[str], current_signals_df: pl.DataFrame):
        """Process the signals of every ship and keep the signal mapping up to date.

        The mapping is carried from ship to ship, so the stored file ends up
        with the signals of ALL ships (previously each ship overwrote it with
        "initial mapping + own signals"). A failure for one ship is logged and
        the remaining ships are still processed.
        """
        for imo in imo_numbers:
            try:
                current_signals_df = self._process_signals_for_ship(run_timestamp, imo, current_signals_df)
            except Exception as e:
                self.logger.error(f"Failed to process signals data: {str(e)}")

    # --------------------------------------------------------------- timeseries

    def _process_timeseries_for_ship(self, run_timestamp: str, imo, signal_mapping: pl.DataFrame):
        """Fetch, store and transform the timeseries of one ship.

        Returns:
            The transformed (and enriched) frame, or ``None`` when the API
            answered with an error payload.
        """
        self.logger.info(f"Processing timeseries data for {imo}")

        response, timeseries = self.api_client.get_data(f"fleet/{imo}/timeseries")

        # Evaluate the flag for EVERY ship; it used to be set only in the
        # failure case and was therefore unbound or stale.
        no_data = not timeseries
        if no_data:
            self.logger.error(f"No timeseries data received - {response}")

        # Store raw timeseries data
        self.storage.write_file(
            timeseries,
            f'Timeseries_{imo}',
            f"{self.config.raw_path}/{run_timestamp}",
            'json'
        )

        if self._is_error_payload(timeseries):
            self.logger.info("Request Error: see json file for details")
            return None

        # Transform and store timeseries data
        if no_data:
            timeseries_transformed = self._empty_timeseries_frame()
            gaps = pl.DataFrame()
        else:
            timeseries_transformed, gaps = self.processor.transform_timeseries(
                pl.DataFrame(timeseries), imo, run_timestamp
            )

        # Enrich with friendly names
        timeseries_transformed = self.processor.enrich_timeseries_with_friendly_names(
            timeseries_transformed, signal_mapping
        )
        self.storage.write_file(
            timeseries_transformed,
            f"Timeseries_{imo}",
            f"{self.config.transformed_path}/{run_timestamp}",
            'parquet'
        )

        # Process gaps
        gaps_df = self.processor.process_gaps(gaps)
        self.storage.write_file(
            gaps_df,
            f"Gaps_{imo}",
            f"{self.config.gaps_path}/{run_timestamp}",
            'parquet'
        )

        return timeseries_transformed

    def process_timeseries(self, run_timestamp: str, imo_numbers:List[str], signal_mapping:pl.DataFrame, current_df:pl.DataFrame, run_start)-> pl.DataFrame:
        """Process the timeseries of every ship and append the rows to ``current_df``.

        ``run_start`` is accepted for interface compatibility but not used.
        A failure for one ship is logged and the remaining ships are still
        processed.

        Returns:
            ``current_df`` extended by the rows of all successfully processed ships.
        """
        for imo in imo_numbers:
            try:
                ship_df = self._process_timeseries_for_ship(run_timestamp, imo, signal_mapping)
                if ship_df is not None and not ship_df.is_empty():
                    # Columns are matched by name and dtypes coerced: the initial
                    # frame is untyped and friendly_name may be missing.
                    current_df = pl.concat([current_df, ship_df], how="diagonal_relaxed")
            except Exception as e:
                self.logger.error(f"Failed to process timeseries data: {str(e)}")

        return current_df

    # ---------------------------------------------------------------------- run

    def run(self, mode: str = "all"):
        """Execute one pipeline run.

        Args:
            mode: ``"fleet"`` (ship data and signals), ``"timeseries"`` or
                ``"all"`` (both). Any other value only logs a warning.

        Returns:
            ``(run_start, run_end)`` as timezone-aware UTC datetimes.

        Raises:
            Exception: Unexpected errors are logged and re-raised.
        """
        run_start = datetime.now(timezone.utc)

        try:
            run_timestamp = run_start.strftime(self._TIMESTAMP_FORMAT)
            summary_filename = f"{run_start.strftime('%Y%m%d')}"
            self.logger.info(f"Starting pipeline run at {run_start}")

            if mode not in ("all", "fleet", "timeseries"):
                self.logger.warning(f"Unknown mode '{mode}': nothing will be processed")

            # Initialize directories
            os.makedirs(f"{self.config.raw_path}/{run_timestamp}", exist_ok=True)
            os.makedirs(f"{self.config.transformed_path}/{run_timestamp}", exist_ok=True)
            os.makedirs(f"{self.config.gaps_path}/{run_timestamp}", exist_ok=True)
            os.makedirs(self._LATEST_PATH, exist_ok=True)
            os.makedirs(self._DAILY_SUMMARY_PATH, exist_ok=True)

            current_signals_df = self.storage.read_file("signal_mapping", self._LATEST_PATH, "parquet")

            # Read daily and historical data
            hist_df, daily_df = self._load_reference_frames(run_start, summary_filename)
            current_df = self._empty_timeseries_frame()

            self.logger.info(f"Loaded {hist_df.shape[0]} historical records and {daily_df.shape[0]} daily records")
            self.logger.info(f"Processing data for mode: {mode}")

            if mode in ["all", "fleet"]:
                # Process ship data, then the signals of every stored IMO
                self.process_shipdata(run_timestamp)
                imo_numbers = self._load_imo_numbers()

                self.logger.info(f"Processing signals for {len(imo_numbers)} ships")
                self.process_signals(run_timestamp, imo_numbers, current_signals_df)
                self.logger.info("All Signals processed")

            if mode in ["all", "timeseries"]:
                self.logger.info("Start to Process Timeseries Data")
                imo_numbers = self._load_imo_numbers()

                # Get the signal mapping
                signal_mapping = self.storage.read_file('signal_mapping', self._LATEST_PATH, 'parquet')

                # Process timeseries
                current_df = self.process_timeseries(run_timestamp, imo_numbers, signal_mapping, current_df, run_start)
                current_df = current_df.with_columns(pl.lit("new").alias("tag"))

                # Save delta
                summary_df = self.processor.update_daily_timeseries_summary(hist_df, daily_df, current_df)
                self.storage.write_file(
                    summary_df,
                    summary_filename,
                    self._DAILY_SUMMARY_PATH,
                    "parquet"
                )

            run_end = datetime.now(timezone.utc)

        except Exception as e:
            # run_end is not assigned when the run fails, so take the time here.
            failed_at = datetime.now(timezone.utc)
            self.logger.error(f"Pipeline run failed at {failed_at}: {str(e)}")
            raise

        return run_start, run_end

        



In [13]:
logger = setup_logging()

# Load environment variables
load_dotenv()

# Stop here when the API key is missing: every request would be sent with an
# invalid "ApiKey None" header otherwise.
api_key = os.getenv('HOPPE_API_KEY')
if not api_key:
    logger.error("HOPPE_API_KEY environment variable not set")
    raise RuntimeError("HOPPE_API_KEY environment variable not set")

# Configure pipeline. The Config defaults are the single source for fallback
# values; the previous hard-coded MAX_WORKERS fallback ("4") contradicted the
# Config default (8).
_defaults = Config()
config = Config(
    base_url=os.getenv('HOPPE_BASE_URL', _defaults.base_url),
    raw_path=os.getenv('RAW_PATH', _defaults.raw_path),
    transformed_path=os.getenv('TRANSFORMED_PATH', _defaults.transformed_path),
    gaps_path=os.getenv('GAPS_PATH', _defaults.gaps_path),
    batch_size=int(os.getenv('BATCH_SIZE', _defaults.batch_size)),
    max_workers=int(os.getenv('MAX_WORKERS', _defaults.max_workers)),
    days_to_keep=int(os.getenv('DAYS_TO_KEEP', _defaults.days_to_keep)),
    history_days=int(os.getenv('HISTORY_DAYS', _defaults.history_days))
)

mode = "timeseries" # "all" or "timeseries" or "fleet"

In [None]:
# Build the pipeline and execute one run; failures are logged, not propagated.
try:
    pipeline = Pipeline(config, api_key)
    run_start, run_end = pipeline.run(mode)
except Exception as e:
    logger.error(f"Pipeline run failed: {str(e)}")
else:
    logger.info(f"Pipeline run completed at {run_end}: total runtime {run_end - run_start}")

2025-03-18 18:32:13,756 - Pipeline - INFO - Starting pipeline run at 2025-03-18 17:32:13.756138+00:00
2025-03-18 18:32:13,783 - Data Storage - INFO - Reading from signal_mapping.parquet file successfully
2025-03-18 18:32:13,786 - Data Storage - INFO - File ./data/daily_summary/20250318.parquet does not exist
2025-03-18 18:32:13,792 - Data Storage - INFO - Reading from ref_data.parquet file successfully


2025-03-18 18:32:13,796 - Data Storage - INFO - Reading from 20250317.parquet file successfully
2025-03-18 18:32:13,803 - Data Storage - INFO - Writting to ref_data.parquet file successfully
2025-03-18 18:32:13,805 - Data Storage - INFO - Data saved to ./data/latest/ref_data.parquet
2025-03-18 18:32:13,807 - Pipeline - INFO - Loaded 0 historical records and 0 daily records
2025-03-18 18:32:13,808 - Pipeline - INFO - Processing data for mode: timeseries
2025-03-18 18:32:13,809 - Pipeline - INFO - Start to Process Timeseries Data
2025-03-18 18:32:13,813 - Data Storage - INFO - Reading from imos.parquet file successfully
2025-03-18 18:32:13,817 - Data Storage - INFO - Reading from signal_mapping.parquet file successfully
2025-03-18 18:32:13,818 - Pipeline - INFO - Processing timeseries data for 9400071
2025-03-18 18:32:14,017 - API Client - INFO - Request for fleet/9400071/timeseries successful
2025-03-18 18:32:14,017 - Pipeline - ERROR - No timeseries data received - <Response [200]>
202