In [2]:
# Make the project's local source directory importable (presumably where the
# `ml_pipeline` package imported further down lives -- confirm).
import sys
import warnings
from pathlib import Path

# Resolve once against the kernel's current working directory so the entry on
# sys.path stays valid even if the working directory changes later on.
SRC_DIR = Path('../src').resolve()

if not SRC_DIR.is_dir():
    # Without this hint a wrong working directory only surfaces later as a
    # bare ModuleNotFoundError on the project imports.
    warnings.warn(
        f"Source directory {SRC_DIR} does not exist; project imports will fail. "
        "Start the kernel from the notebook's own directory or adjust SRC_DIR."
    )

# Keep the directory at the front of the search path, but never more than once:
# re-running this cell must not pile up duplicate entries.
_src_entry = str(SRC_DIR)
sys.path[:] = [entry for entry in sys.path if entry != _src_entry]
sys.path.insert(0, _src_entry)

In [6]:
# %% [markdown]
# ## Step 1: Imports and Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.classification.kernel_based import RocketClassifier
from ml_pipeline.featureset import  make_summary_stats_input, make_series_input
from ml_pipeline.model import processing_pipeline, train_traditional_model, test_model, plot_model_result

# Read the train and test CSVs; 'Time' is parsed as datetimes and becomes the index
train, test = (
    pd.read_csv(csv_path, parse_dates=['Time']).set_index('Time')
    for csv_path in ('../input/train/data.csv', '../input/test/data.csv')
)

# One (initially empty) slot per data configuration; filled in by the steps below
data_dict = {config: {} for config in ('stat', '1col', 'ts')}

# %% [markdown]
# ## Step 2: Data Preprocessing and Feature Engineering

# All feature sets share the same windowing settings and the same row ranges;
# naming them once keeps the train/test calls for every configuration in sync.
WINDOW_KWARGS = {'window_length': 30, 'step_length': 1, 'fh': 0}
TRAIN_ROWS = slice(-3100, -2900)  # positional row range taken from `train`
TEST_ROWS = slice(4200, 4400)     # positional row range taken from `test`

# One entry per key of `data_dict`:
#   columns         -- column subset handed to the pipeline (None keeps every column)
#   feature_func    -- feature builder passed to `processing_pipeline`
#   pipeline_kwargs -- extra keyword arguments forwarded to `processing_pipeline` as-is
FEATURE_CONFIGS = {
    # Summary statistics for traditional ML
    'stat': {
        'columns': None,
        'feature_func': make_summary_stats_input,
        'pipeline_kwargs': {},
    },
    # Single-column time series
    '1col': {
        'columns': ['Temperature', 'Status'],
        'feature_func': make_series_input,
        'pipeline_kwargs': {'add_first_diffs': False},
    },
    # Multi-column time series
    'ts': {
        'columns': None,
        'feature_func': make_series_input,
        'pipeline_kwargs': {'add_first_diffs': True},
    },
}


def build_feature_set(frame, rows, columns=None, *, feature_func, **pipeline_kwargs):
    """Run `processing_pipeline` on one positional row range of `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source data (`train` or `test`).
    rows : slice
        Positional row range, applied with `.iloc`.
    columns : list of str, optional
        Column subset to keep; None keeps all columns.
    feature_func : callable
        Forwarded to `processing_pipeline` as `feature_func`.
    **pipeline_kwargs
        Forwarded to `processing_pipeline` unchanged.

    Returns
    -------
    Whatever `processing_pipeline` returns; the caller below unpacks it as an
    (X, y) pair.
    """
    subset = frame.iloc[rows]
    if columns is not None:
        subset = subset[columns]
    return processing_pipeline(
        subset,
        # Hand over a copy so the shared settings cannot be modified by the callee.
        window_kwargs=dict(WINDOW_KWARGS),
        feature_func=feature_func,
        **pipeline_kwargs,
    )


# Fill data_dict[<config>]['X_train' | 'y_train' | 'X_test' | 'y_test'].
for config_name, config in FEATURE_CONFIGS.items():
    for split_name, frame, rows in (('train', train, TRAIN_ROWS), ('test', test, TEST_ROWS)):
        X_split, y_split = build_feature_set(
            frame,
            rows,
            config['columns'],
            feature_func=config['feature_func'],
            **config['pipeline_kwargs'],
        )
        data_dict[config_name][f'X_{split_name}'] = X_split
        data_dict[config_name][f'y_{split_name}'] = y_split

# %% [markdown]
# ## Step 3: Define Models and Parameter Grids for Model Selection

# Fixed seed so repeated runs of the search produce the same fitted models.
RANDOM_STATE = 42

# Candidate estimators, keyed by a display name that must also appear in `param_grids`.
models = {
    "TimeSeriesForest": TimeSeriesForestClassifier(random_state=RANDOM_STATE),
    "RocketClassifier": RocketClassifier(random_state=RANDOM_STATE),
}

# Hyper-parameter grids searched for each candidate. Keys must be parameters the
# estimator itself exposes: RocketClassifier takes `num_kernels` directly, so a
# pipeline-style "rocket__" prefix would be rejected as an invalid parameter.
param_grids = {
    "TimeSeriesForest": {"n_estimators": [50, 100]},
    "RocketClassifier": {"num_kernels": [500, 1000]},
}

# %% [markdown]
# ## Step 4: Model Selection and Training

def model_selection(X_train, y_train, candidates=None, grids=None,
                    cv=5, scoring="accuracy", n_jobs=-1):
    """Grid-search every candidate model and return the best-scoring one.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to ``GridSearchCV.fit``.
    candidates : dict, optional
        Mapping of model name -> estimator. Defaults to the notebook-level
        ``models`` dictionary.
    grids : dict, optional
        Mapping of model name -> parameter grid. Defaults to the notebook-level
        ``param_grids`` dictionary.
    cv, scoring, n_jobs
        Forwarded to ``GridSearchCV``; the defaults match the previously
        hard-coded values.

    Returns
    -------
    tuple
        ``(best_model, model_results)`` where ``best_model`` is the refit best
        estimator of the highest-scoring candidate and ``model_results`` maps
        each model name to its ``best_model``, ``best_score`` and ``best_params``.

    Raises
    ------
    ValueError
        If there are no candidate models.
    KeyError
        If a candidate has no parameter grid.
    """
    # Fall back to the notebook-level registries so two-argument calls keep working.
    candidates = models if candidates is None else candidates
    grids = param_grids if grids is None else grids

    # Validate up front: failing here is far cheaper than failing after
    # several grid searches have already been fitted.
    if not candidates:
        raise ValueError("model_selection needs at least one candidate model")
    missing_grids = [name for name in candidates if name not in grids]
    if missing_grids:
        raise KeyError(f"No parameter grid defined for: {missing_grids}")

    # NOTE(review): if the rows of X_train are overlapping sliding windows, plain
    # k-fold CV may give optimistic scores; a time-aware splitter can be supplied
    # through `cv` -- confirm against how the features are built.
    model_results = {}

    for model_name, model in candidates.items():
        grid = GridSearchCV(model, grids[model_name], cv=cv, scoring=scoring, n_jobs=n_jobs)
        grid.fit(X_train, y_train)

        # Save the best model and its score
        model_results[model_name] = {
            "best_model": grid.best_estimator_,
            "best_score": grid.best_score_,
            "best_params": grid.best_params_
        }
        print(f"{model_name} - Best Score: {grid.best_score_}, Best Params: {grid.best_params_}")

    # Select the best model based on the score
    best_model_name = max(model_results, key=lambda x: model_results[x]["best_score"])
    best_model = model_results[best_model_name]["best_model"]
    print(f"Selected Model: {best_model_name}")

    return best_model, model_results

# Example run: search over the candidate models using only the 'stat'
# (summary statistics) training data.
best_model_stat, stat_model_results = model_selection(
    X_train=data_dict['stat']['X_train'],
    y_train=data_dict['stat']['y_train'],
)

# %% [markdown]
# ## Step 5: Testing and Evaluating the Best Model

# Test the best selected model on the test data
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Run `test_model` for a fitted model, report accuracy and plot the result.

    `threshold` is passed straight through to `test_model`. Accuracy is the
    fraction of entries in the result's 'residual' column that equal 0
    (presumably the correctly classified samples -- confirm against
    `test_model`). The result is also handed to `plot_model_result`.

    Returns the computed accuracy.
    """
    outcome = test_model(X_test, y_test, model, threshold)

    # Share of samples whose residual is exactly zero
    zero_residual = outcome['residual'] == 0
    accuracy = zero_residual.mean()
    print(f"Accuracy at threshold {threshold}: {accuracy}")

    plot_model_result(outcome)
    return accuracy

# Score the model selected above on the 'stat' test data (default threshold).
accuracy = evaluate_model(
    best_model_stat,
    X_test=data_dict['stat']['X_test'],
    y_test=data_dict['stat']['y_test'],
)

# %% [markdown]
# ## Step 6: Repeat for Other Data Configurations (Optional)
# You can repeat model selection and evaluation for the '1col' and 'ts' datasets.


ModuleNotFoundError: No module named 'ml_pipeline'