In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Make training and test data parquet files

The `InputData` object must ultimately have the form `dict[str, pd.DataFrame]`.

In [15]:
from os import PathLike
from pathlib import Path
import pandas as pd

# Root of the local darrow-poc checkout.
# pathlib never expands "~" implicitly, so resolve it once here; every path derived from
# BASE_DIR is then usable by any consumer, not only by ones that expand "~" themselves.
BASE_DIR = Path("~/projects/darrow/darrow-poc/").expanduser()


# Measurement TYPE for every series ID that `read_csv` emits.
type_dict = {
    'altenburg1': 'discharge',
    'eschweiler': 'discharge',
    'herzogenrath1': 'discharge',
    'juelich': 'discharge',
    'stah': 'discharge',
    'middenroer': 'precip',
    'urft': 'precip',
    'evap': 'evap',
}

# Source CSV column -> series ID. 'evap' is also read but keeps its name.
_COLUMN_RENAMES = {
    'discharge_altenburg1': 'altenburg1',
    'discharge_eschweiler': 'eschweiler',
    'discharge_herzogenrath1': 'herzogenrath1',
    'discharge_juelich': 'juelich',
    'discharge_stah': 'stah',
    'precip_middenroer': 'middenroer',
    'precip_urft': 'urft',
}


def read_csv(file_path: PathLike, n_rows: "int | None" = 100) -> pd.DataFrame:
    """Load a wide CSV of time series and return it in long format.

    Parameters
    ----------
    file_path
        Location of a CSV file containing a 'TIME' column, the columns listed in
        `_COLUMN_RENAMES` and an 'evap' column. Any other column is ignored.
    n_rows
        Number of leading data rows to keep. Defaults to 100; pass ``None`` to
        keep the whole file.

    Returns
    -------
    pd.DataFrame
        One row per (timestamp, series) with the columns 'TIME' (parsed with
        `pd.to_datetime`), 'ID' (renamed series name), 'VALUE' and 'TYPE'
        (looked up in `type_dict`). Series appear in the fixed column order
        used below, regardless of the column order in the file.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the file.
    """
    # Let the parser stop after `n_rows` rows instead of reading everything and slicing.
    wide = pd.read_csv(file_path, nrows=n_rows)

    # Select in an explicit order so that the melted rows do not depend on the file layout.
    wanted_columns = ['TIME', *_COLUMN_RENAMES, 'evap']
    wide = wide.loc[:, wanted_columns].rename(columns=_COLUMN_RENAMES)

    # Every non-TIME column becomes one (ID, VALUE) pair per timestamp.
    long_df = wide.melt(id_vars=['TIME'], var_name='ID', value_name='VALUE')
    long_df['TIME'] = pd.to_datetime(long_df['TIME'])
    long_df['TYPE'] = long_df['ID'].map(type_dict)
    return long_df


# Load both CSV splits in long format first ...
train, test = (
    read_csv(BASE_DIR / "tests/testing_data/train.csv"),
    read_csv(BASE_DIR / "tests/testing_data/test.csv"),
)

# ... then store each one as a parquet file in the same directory.
for long_frame, parquet_name in (
    (train, "tests/testing_data/train.parquet"),
    (test, "tests/testing_data/test.parquet"),
):
    long_frame.to_parquet(BASE_DIR / parquet_name)

In [16]:
# Reload the test parquet file written by the previous cell and display it.
pd.read_parquet(BASE_DIR / "tests/testing_data/test.parquet")

Unnamed: 0,TIME,ID,VALUE,TYPE
0,2018-01-01 00:00:00+00:00,altenburg1,33.22525,discharge
1,2018-01-01 01:00:00+00:00,altenburg1,33.55875,discharge
2,2018-01-01 02:00:00+00:00,altenburg1,33.59025,discharge
3,2018-01-01 03:00:00+00:00,altenburg1,34.38775,discharge
4,2018-01-01 04:00:00+00:00,altenburg1,35.03700,discharge
...,...,...,...,...
795,2018-01-04 23:00:00+00:00,evap,0.01000,evap
796,2018-01-05 00:00:00+00:00,evap,0.01000,evap
797,2018-01-05 01:00:00+00:00,evap,0.01000,evap
798,2018-01-05 02:00:00+00:00,evap,0.01000,evap


In [114]:
from twinn_ml_interface.objectmodels.input_data import InputData

In [115]:
# Build an InputData object from the long-format training frame and display it.
# NOTE(review): the saved output below uses keys such as 'discharge_altenburg1:disc', whereas
# `read_csv` in this notebook emits IDs like 'altenburg1' with TYPE 'discharge' — the output
# is presumably stale; re-run on a fresh kernel to confirm.
InputData.from_long_df(train)

{'discharge_altenburg1:disc':                            discharge_altenburg1:disc
 TIME                                                
 2010-01-01 00:00:00+00:00                   13.32225
 2010-01-01 01:00:00+00:00                   13.30750
 2010-01-01 02:00:00+00:00                   13.20100
 2010-01-01 03:00:00+00:00                   13.29650
 2010-01-01 04:00:00+00:00                   13.19000
 ...                                              ...
 2010-01-04 23:00:00+00:00                   12.64500
 2010-01-05 00:00:00+00:00                   12.64500
 2010-01-05 01:00:00+00:00                   12.64500
 2010-01-05 02:00:00+00:00                   12.55600
 2010-01-05 03:00:00+00:00                   12.50300
 
 [100 rows x 1 columns],
 'discharge_eschweiler:disc':                            discharge_eschweiler:disc
 TIME                                                
 2010-01-01 00:00:00+00:00                    8.66275
 2010-01-01 01:00:00+00:00                    8.444

## We need to define the following:

1. A model class that follows the Protocol `ModelInterfaceV4`
2. As an input to the `initialize` method, this class requires an object based on the `Configuration` Protocol

In [94]:
from darrow_poc.models.modelinterface import POCAnomaly

In [104]:
from twinn_ml_interface.objectmodels import ModelCategory, MetaDataLogger, Configuration

# Build the proof-of-concept anomaly model directly through its constructor.
sm = POCAnomaly(target="discharge:stah")

# Disabled attribute overrides, kept for reference.
# NOTE(review): a later cell asserts isinstance(sm, ModelInterfaceV4) and fails; these may be
# attributes that check expects — confirm before deleting them.
# sm.performance_value = 999
# sm.model_category = ModelCategory.ANOMALY
# sm.model_type_name = "stah"
# sm.base_features = None

In [105]:
class ConfigurationMock:
    """Minimal stand-in for a configuration object, passed to `POCAnomaly.initialize`."""

    # Fixed dummy target identifier.
    target_name = "test:test"

    @staticmethod
    def get_units(*args, **kwargs):
        """Accept any arguments and return None.

        Declared as a static method because it never touches instance state;
        previously the implicit `self` was only absorbed by `*args`.
        """
        return None
    
# Create the model through its `initialize` entry point, using the mock configuration
# and a fresh MetaDataLogger.
mock_configuration = ConfigurationMock()
model = POCAnomaly.initialize(mock_configuration, MetaDataLogger())

In [107]:
# Display the bound `preprocess` method of the instance returned by `initialize`.
model.preprocess

<bound method POCAnomaly.preprocess of <darrow_poc.models.modelinterface.POCAnomaly object at 0x7f30d38b3d90>>

In [9]:
from twinn_ml_interface.interface import ModelInterfaceV4

def _missing_protocol_members(obj, protocol) -> list:
    """Return the public names declared directly on `protocol` that `obj` does not expose.

    Only members declared in the protocol class body (annotations and attributes) are
    inspected; members inherited from parent protocols are not listed.
    """
    declared = set(getattr(protocol, "__annotations__", {}))
    declared.update(name for name in vars(protocol) if not name.startswith("_"))
    return sorted(name for name in declared if not hasattr(obj, name))


# The message expression is evaluated only when the check fails, so a passing check costs nothing.
assert isinstance(sm, ModelInterfaceV4), (
    "sm does not satisfy ModelInterfaceV4; members declared on the protocol class "
    f"but missing on sm: {_missing_protocol_members(sm, ModelInterfaceV4)}"
)

AssertionError: 

In [5]:
from azure.data.tables import TableServiceClient

from sam_infra_helper.api import MLAPI
from sam_infra_helper.azure import WorkspaceService
from sam_infra_helper.data import DataService, LabelService
from sam_infra_helper.executors.shared_code import CheckpointLogger, SafeLogger
from sam_infra_helper.hierarchy import Hierarchy
from sam_infra_helper.model import ModelService
from sam_infra_helper.objectmodels import SemanticVersion

In [36]:
from sam_infra_helper.database.connectors.sqlalchemy_connector import SQLalchemyConnector

# Placeholder: "..." is not a valid SQLAlchemy URL, so this cell currently fails with the
# ArgumentError shown in its output until a real URL is supplied.
# TODO: load the URL from an environment variable or secret store instead of hardcoding it.
connection_str_ml = f"..."  # Which connection string do I need here? Which DB to use?

# Wrap the URL in the project's SQLAlchemy connector.
connector_ml = SQLalchemyConnector(connection_str_ml)

# API object later passed to TrainExecutor as `ml_api`.
ml_api = MLAPI(connector_ml)

ArgumentError: Could not parse SQLAlchemy URL from string '...'

In [38]:
from twinn_ml_example._version import __version__

# Parse the package version string into a SemanticVersion; it is later passed to
# TrainExecutor as `product_version`.
# NOTE(review): __version__ is imported from twinn_ml_example, while the model used in this
# notebook comes from darrow_poc — confirm this is the intended package.
semantic_version = SemanticVersion.from_string(__version__)
semantic_version

SemanticVersion(major=0, minor=1, patch=0, dev_version=None)

In [None]:
# Placeholder connection string; replace "..." with a real SQLAlchemy URL before running.
connection_str_dqls = f"..."
# SQLalchemyConnector is imported in the earlier cell that builds `connector_ml`.
connector_dqls = SQLalchemyConnector(connection_str_dqls)

# NOTE(review): DQLSAPI is not imported in any visible cell, so this line raises NameError
# on a fresh kernel unless the import is added — confirm the module it lives in.
label_api = DQLSAPI(connector_dqls)

# Service later passed to TrainExecutor as `label_service`.
label_service = LabelService(label_api)

In [None]:
from azure.storage.blob import BlobServiceClient, ContainerClient
from sam_infra_helper.data import AvailabilityService

# Placeholder values — replace them before running. Never commit real credentials.
blob_service_client = BlobServiceClient(
    account_url="https://<my_account_name>.blob.core.windows.net",
    credential=None,  # account key, SAS token or TokenCredential
)
availability_service = AvailabilityService(blob_service_client)

# Service later passed to TrainExecutor as `data_service`.
data_service = DataService(availability_service)

In [None]:
# Placeholder values — replace them before running. Never commit real credentials.
model_container_client = ContainerClient(
    account_url="https://<my_account_name>.blob.core.windows.net",
    container_name="<my_model_container_name>",
    credential=None,  # account key, SAS token or TokenCredential
)
log_container_client = ContainerClient(
    account_url="https://<my_account_name>.blob.core.windows.net",
    container_name="<my_log_container_name>",
    credential=None,  # account key, SAS token or TokenCredential
)
credentials = None
# Service later passed to TrainExecutor as `model_service`.
model_service = ModelService(model_container_client, log_container_client, credentials)

In [None]:
# Logger instance later passed to TrainExecutor as `safe_logger`.
safe_logger = SafeLogger()

In [None]:
# Logger instance later passed to TrainExecutor as `checkpoint_logger`.
checkpoint_logger = CheckpointLogger()

In [None]:
from azure.data.tables import TableServiceClient

# Template connection string: substitute the <my_account_name> / <my_account_key> placeholders
# (ideally from an environment variable or secret store) before running.
connection_string = (
    "DefaultEndpointsProtocol=https;AccountName=<my_account_name>;AccountKey=<my_account_key>;EndpointSuffix=core.windows.net"
)
# Table service client, later passed to TrainExecutor as `mlsync_service`.
mlsync_service = TableServiceClient.from_connection_string(conn_str=connection_string)

In [None]:
from azure.storage.blob import BlobClient

# Placeholder values — replace them before running. Never commit real credentials.
blob_client = BlobClient(
    account_url="https://<my_account_name>.blob.core.windows.net",
    container_name="<my_container_name>",
    blob_name="<my_blob_name>",
    credential=None,  # account key, SAS token or TokenCredential
)
# NOTE(review): HierarchyFromBlobTree is not imported in any visible cell (only `Hierarchy`
# is) — add the import before running; confirm which module provides it.
hierarchies = HierarchyFromBlobTree(blob_client).get_hierarchies()

In [None]:
# Placeholder values — replace them before running.
workspace_service = WorkspaceService(
    workspace_name="<my_workspace_name>",
    subscription_id="<my_subscription_id>",
    resource_group="<my_resource_group>",
    credentials=None,
)

In [None]:
# Wire every service built in the cells above into the training executor.
# NOTE(review): TrainExecutor is not imported in any visible cell — confirm where it comes from.
train_executor = TrainExecutor(
    ml_api=ml_api,
    product_version=semantic_version,
    label_service=label_service,
    data_service=data_service,
    model_service=model_service,
    safe_logger=safe_logger,
    checkpoint_logger=checkpoint_logger,
    mlsync_service=mlsync_service,
    hierarchies=hierarchies,
    workspace_service=workspace_service,
    model_dir=None,
)

In [11]:
from typing import Protocol


type(Protocol)

typing._ProtocolMeta

In [12]:
type(type(Protocol))

type

In [13]:
type(Protocol)

typing._ProtocolMeta

In [15]:
from typing import _ProtocolMeta

In [17]:
_ProtocolMeta??

In [19]:
# Check that the metaclass of `Protocol` is the private `typing._ProtocolMeta` class.
type(Protocol) is _ProtocolMeta

True