In [None]:
%load_ext autoreload
%autoreload 2

## Make training and test data parquet files

The `InputData` ultimately has to be of the form `dict[str, pd.DataFrame]`.

In [None]:
from os import PathLike
from pathlib import Path
import pandas as pd

# Root of the local project checkout. `pathlib` does not expand "~" on its own,
# so expand it here to get a real path for every consumer of BASE_DIR.
BASE_DIR = Path("~/projects/darrow/darrow-poc/").expanduser()


type_dict = {
    'altenburg1': 'discharge', 
    'eschweiler': 'discharge',
    'herzogenrath1': 'discharge',
    'juelich': 'discharge',
    'stah': 'discharge',
    'middenroer': 'precipitation',
    'urft': 'precipitation',
    'evap': 'evaporation',
}
    

def read_csv(file_path: PathLike) -> pd.DataFrame:
    df = pd.read_csv(file_path)
    df = df.iloc[0:100, :].loc[:, 
        [
            'TIME', 
            'discharge_altenburg1', 
            'discharge_eschweiler',
            'discharge_herzogenrath1',
            'discharge_juelich',
            'discharge_stah',
            'precip_middenroer',
            'precip_urft',
            'evap',
         ]
    ]
    df.rename(columns={
        'discharge_altenburg1': 'altenburg1', 
        'discharge_eschweiler': 'eschweiler',
        'discharge_herzogenrath1': 'herzogenrath1',
        'discharge_juelich': 'juelich',
        'discharge_stah': 'stah',
        'precip_middenroer': 'middenroer',
        'precip_urft': 'urft',
    }, inplace=True)
    df = pd.melt(df, id_vars=['TIME'], value_vars=[c for c in df.columns if c != "TIME"])
    df["TIME"] = pd.to_datetime(df["TIME"])
    df.columns = ["TIME", "ID", "VALUE"]
    df['TYPE'] = df['ID'].apply(lambda x: type_dict[x])
    return df


# Convert both CSV fixtures to long format, then store each split as parquet
# in the same testing-data directory.
train_csv = BASE_DIR / "tests/testing_data/train.csv"
test_csv = BASE_DIR / "tests/testing_data/test.csv"
train, test = read_csv(train_csv), read_csv(test_csv)

train.to_parquet(BASE_DIR / "tests/testing_data/train.parquet")
test.to_parquet(BASE_DIR / "tests/testing_data/test.parquet")

In [None]:
# Read the parquet file written for the test split back in to inspect the result.
pd.read_parquet(BASE_DIR / "tests/testing_data/test.parquet")

In [None]:
from twinn_ml_interface.input_data.input_data import InputData

In [None]:
InputData.from_long_df(train)

## We need to define the following:

1. A model class that follows the Protocol `ModelInterfaceV4`
2. As an input to the `initialize` method, this class requires an object based on the `Configuration` Protocol

In [None]:
from darrow_poc.models.poc import POCAnomaly

In [None]:
from twinn_ml_interface.objectmodels import ModelCategory, MetaDataLogger, Configuration

# Instantiate the POC anomaly model directly for the "stah:discharge" target.
sm = POCAnomaly(target="stah:discharge")

# Attribute overrides, currently disabled:
# sm.performance_value = 999
# sm.model_category = ModelCategory.ANOMALY
# sm.model_type_name = "stah"
# sm.base_features = None

In [None]:
class ConfigurationMock:
    """Minimal stand-in passed to `POCAnomaly.initialize` as its configuration argument."""

    # Target identifier exposed as a class attribute.
    target_name = "stah:discharge"

    # NOTE(review): there is no explicit `self`; when called on an instance,
    # the instance itself is absorbed as the first element of `*args`.
    def get_units(*args, **kwargs):
        """Accept any arguments and always return None."""
        return None
    
# Build the model through its `initialize` entry point using the mock configuration.
model = POCAnomaly.initialize(ConfigurationMock(), MetaDataLogger())

In [None]:
model.preprocess

In [None]:
from twinn_ml_interface.interface import ModelInterfaceV4

assert isinstance(sm, ModelInterfaceV4)

In [None]:
from azure.data.tables import TableServiceClient

from sam_infra_helper.api import MLAPI
from sam_infra_helper.azure import WorkspaceService
from sam_infra_helper.data import DataService, LabelService
from sam_infra_helper.executors.shared_code import CheckpointLogger, SafeLogger
from sam_infra_helper.hierarchy import Hierarchy
from sam_infra_helper.model import ModelService
from sam_infra_helper.objectmodels import SemanticVersion

In [None]:
from sam_infra_helper.database.connectors.sqlalchemy_connector import SQLalchemyConnector

# Wire up the ML API: connection string -> SQLAlchemy connector -> MLAPI.
# The connection string is still a placeholder.
connection_str_ml = "..."  # Which connection string do I need here? Which DB to use?
connector_ml = SQLalchemyConnector(connection_str_ml)
ml_api = MLAPI(connector_ml)

In [None]:
from twinn_ml_example._version import __version__

semantic_version = SemanticVersion.from_string(__version__)
semantic_version

In [None]:
# Wire up the label service: connection string -> connector -> DQLS API -> LabelService.
# The connection string is still a placeholder.
connection_str_dqls = "..."
connector_dqls = SQLalchemyConnector(connection_str_dqls)

# NOTE(review): `DQLSAPI` is not imported in any of the visible cells - add the
# import before running this cell.
label_api = DQLSAPI(connector_dqls)
label_service = LabelService(label_api)

In [None]:
from azure.storage.blob import BlobServiceClient, ContainerClient
from sam_infra_helper.data import AvailabilityService

# Build the data service: blob service client -> availability service -> data service.
# The account URL and credential are placeholders; replace them before running.
blob_service_client = BlobServiceClient(
    account_url="https://<my_account_name>.blob.core.windows.net",
    credential=None,
)
availability_service = AvailabilityService(blob_service_client)

data_service = DataService(availability_service)

In [None]:
# Build the model service from one container client for models and one for logs.
# Account URL, container names and credentials are placeholders; replace them
# before running.
model_container_client = ContainerClient(
    account_url="https://<my_account_name>.blob.core.windows.net",
    container_name="<my_model_container_name>",
    credential=None,
)
log_container_client = ContainerClient(
    account_url="https://<my_account_name>.blob.core.windows.net",
    container_name="<my_log_container_name>",
    credential=None,
)
credentials = None
model_service = ModelService(model_container_client, log_container_client, credentials)

In [None]:
safe_logger = SafeLogger()

In [None]:
checkpoint_logger = CheckpointLogger()

In [None]:
from azure.data.tables import TableServiceClient

# Table service client built from a storage connection string.
# The account name and key in the string are placeholders.
connection_string = (
    "DefaultEndpointsProtocol=https;AccountName=<my_account_name>;AccountKey=<my_account_key>;EndpointSuffix=core.windows.net"
)
mlsync_service = TableServiceClient.from_connection_string(conn_str=connection_string)

In [None]:
from azure.storage.blob import BlobClient

# Blob client for the blob the hierarchies are loaded from.
# Account URL, container, blob name and credential are placeholders; replace
# them before running.
blob_client = BlobClient(
    account_url="https://<my_account_name>.blob.core.windows.net",
    container_name="<my_container_name>",
    blob_name="<my_blob_name>",
    snapshot=None,
    credential=None,
)
# NOTE(review): `HierarchyFromBlobTree` is not imported in any of the visible
# cells (only `Hierarchy` is) - add the import before running this cell.
hierarchies = HierarchyFromBlobTree(blob_client).get_hierarchies()

In [None]:
# Workspace service; every argument is a placeholder, replace before running.
workspace_service = WorkspaceService(
    workspace_name="<my_workspace_name>",
    subscription_id="<my_subscription_id>",
    resource_group="<my_resource_group>",
    credentials=None,
)

In [None]:
# Assemble the train executor from the services created in the cells above.
# NOTE(review): `TrainExecutor` is not imported in any of the visible cells -
# add the import before running this cell.
train_executor = TrainExecutor(
    ml_api=ml_api,
    product_version=semantic_version,
    label_service=label_service,
    data_service=data_service,
    model_service=model_service,
    safe_logger=safe_logger,
    checkpoint_logger=checkpoint_logger,
    mlsync_service=mlsync_service,
    hierarchies=hierarchies,
    workspace_service=workspace_service,
    model_dir=None,
)

In [None]:
from typing import Protocol


type(Protocol)

In [None]:
type(type(Protocol))

In [None]:
type(Protocol)

In [None]:
from typing import _ProtocolMeta

In [None]:
_ProtocolMeta??

In [None]:
type(Protocol) is _ProtocolMeta