Before running this notebook, start the MLflow tracking server from a terminal: `mlflow server --host 127.0.0.1 --port 8080`
<br><br> Experiment to add `window_size` as a parameter.

In [1]:
import mlflow

from pathlib import Path

import polars as pl
from mlflow.models import infer_signature
from sklearn import metrics
from tqdm import tqdm
import gc

from lisa.config import INTERIM_DATA_DIR, PLOTS_DIR
from lisa.features import sliding_window, standard_scaler, train_test_split
from lisa.modeling import random_forest
from lisa import evaluate

import os

# Point MLflow at the local tracking server started with
# `mlflow server --host 127.0.0.1 --port 8080` (the host was previously
# truncated to "127", which does not address the loopback interface).
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")


[32m2024-09-05 13:05:31.711[0m | [1mINFO    [0m | [36mlisa.config[0m:[36m<module>[0m:[36m15[0m - [1mPROJ_ROOT path is: /Users/tomwilson/code/LISA[0m


In [3]:
# Ensure that mlruns are saved in the correct directory (the parent of the
# directory the kernel started in). The starting directory is remembered the
# first time this runs, so re-executing the cell does not climb one level
# higher each time the way a bare os.chdir("..") does.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = Path.cwd()
os.chdir(_NOTEBOOK_START_DIR.parent)

input_path: Path = INTERIM_DATA_DIR / "labelled_test_data.csv"

original_df = pl.read_csv(input_path)

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment("RF Test")

# Number of training rows stored alongside the model as its input example.
# A handful is enough to document the expected input; passing the whole
# training set would serialise all of it into the model artifact.
INPUT_EXAMPLE_ROWS = 5

# One parent run groups a nested child run per (window, split) combination
with mlflow.start_run() as parent_run:
    windows = [50, 150]
    splits = [0.6, 0.7]
    for window in tqdm(windows, desc="Windows", position=0):
        for split in tqdm(splits, desc="Splits", leave=False, position=1):
            with mlflow.start_run(nested=True, run_name=f"W_{window}:S_{split}"):

                # NOTE(review): this call only depends on `window`, so it is
                # recomputed for every split. It could be hoisted into the
                # outer loop if log=True does not record anything against the
                # active (nested) run - confirm before moving it.
                df = sliding_window(original_df, period=window, log=True)

                X_train, X_test, y_train, y_test = train_test_split(
                    df, train_size=split, gap=window
                )

                scaled_X_train, scaled_X_test, scaler = standard_scaler(X_train, X_test)

                params = {"n_estimators": 100, "max_depth": 128}

                model = random_forest.random_forest_classifier(
                    scaled_X_train, y_train.to_numpy().ravel(), **params
                )

                accuracy = metrics.accuracy_score(y_test, model.predict(scaled_X_test))
                labels = df["ACTIVITY"].unique(maintain_order=True)
                plot_path = PLOTS_DIR / "tmp/confusion_matrix.png"
                cm = evaluate.confusion_matrix(model, labels, scaled_X_test, y_test, plot_path)

                # Log the hyperparameters together with the feature-engineering settings
                params["window"] = window
                params["split"] = split
                mlflow.log_params(params)

                # Log metrics and the confusion-matrix plot written above
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_artifact(plot_path)

                # Set a tag that we can use to remind ourselves what this run was for
                mlflow.set_tag("Training Info", "Basic RF model for labelled test data")

                # Infer the model signature
                signature = infer_signature(
                    scaled_X_train, model.predict(scaled_X_train)
                )

                # Log the model; the example is sliced inline so that no
                # long-lived reference to the training data outlives the call
                mlflow.sklearn.log_model(
                    sk_model=model,
                    artifact_path="rf_model",
                    signature=signature,
                    input_example=scaled_X_train[:INPUT_EXAMPLE_ROWS],
                )

                # Explicitly delete objects to free memory before the next combination
                del df, X_train, X_test, y_train, y_test, scaled_X_train, scaled_X_test, model, cm
                gc.collect()  # Run garbage collection

                                              
Windows:   0%|          | 0/2 [00:00<?, ?it/s]

[32m2024-09-05 12:00:18.438[0m | [1mINFO    [0m | [36mlisa.features[0m:[36msliding_window[0m:[36m134[0m - [1mAggregating data...[0m


                                              
Windows:   0%|          | 0/2 [09:31<?, ?it/s]        

[32m2024-09-05 12:09:50.364[0m | [1mINFO    [0m | [36mlisa.features[0m:[36msliding_window[0m:[36m134[0m - [1mAggregating data...[0m


                                                        
Windows:  50%|█████     | 1/2 [21:55<21:55, 1315.54s/it]

[32m2024-09-05 12:22:13.997[0m | [1mINFO    [0m | [36mlisa.features[0m:[36msliding_window[0m:[36m134[0m - [1mAggregating data...[0m


                                                        
Windows:  50%|█████     | 1/2 [30:44<21:55, 1315.54s/it]

[32m2024-09-05 12:31:02.512[0m | [1mINFO    [0m | [36mlisa.features[0m:[36msliding_window[0m:[36m134[0m - [1mAggregating data...[0m


python(54838) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Windows: 100%|██████████| 2/2 [41:56<00:00, 1258.28s/it]


In [9]:
from loguru import logger
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Ensure that mlruns are saved in the correct directory (the parent of the
# directory the kernel started in). The starting directory is remembered the
# first time this runs, so re-executing the cell does not climb one level
# higher each time the way a bare os.chdir("..") does.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = Path.cwd()
os.chdir(_NOTEBOOK_START_DIR.parent)

input_path: Path = INTERIM_DATA_DIR / "labelled_test_data.csv"

original_df = pl.read_csv(input_path)

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment("RF Test")

# Seed shared by the base estimator and the hyperparameter search so that a
# re-run of this cell reproduces the same candidates and the same forests.
SEARCH_SEED = 42

# Number of training rows stored alongside the model as its input example.
# A handful is enough to document the expected input; passing the whole
# training set would serialise all of it into the model artifact.
INPUT_EXAMPLE_ROWS = 5

# One parent run groups a nested child run per (window, split) combination
with mlflow.start_run() as parent_run:
    windows = [300]
    splits = [0.8]
    for window in tqdm(windows, desc="Windows", position=0):
        for split in tqdm(splits, desc="Splits", leave=False, position=1):
            with mlflow.start_run(nested=True, run_name=f"W_{window}:S_{split}"):

                # Feature engineering with params
                df = sliding_window(original_df, period=window, log=True)

                X_train, X_test, y_train, y_test = train_test_split(
                    df, train_size=split, gap=window
                )

                scaled_X_train, scaled_X_test, scaler = standard_scaler(X_train, X_test)

                # Tune model
                param_dist = {"n_estimators": randint(50, 500), "max_depth": randint(10, 150)}

                rf = RandomForestClassifier(random_state=SEARCH_SEED)
                # Use random search to find the best hyperparameters
                rand_search = RandomizedSearchCV(
                    rf,
                    param_distributions=param_dist,
                    n_iter=5,
                    cv=5,
                    n_jobs=-1,
                    random_state=SEARCH_SEED,
                )
                # Fit on the scaled features: the tuned model is evaluated on
                # scaled_X_test below, so it must be trained in the same
                # feature space. The target is flattened the same way as in
                # the fixed-hyperparameter RF cell.
                rand_search.fit(scaled_X_train, y_train.to_numpy().ravel())

                model = rand_search.best_estimator_

                # Report the best hyperparameters. Loguru formats messages with
                # str.format, so the value needs a "{}" placeholder to appear.
                logger.info("Best hyperparameters: {}", rand_search.best_params_)
                # Copy so that adding run metadata does not mutate the search result
                params = dict(rand_search.best_params_)

                accuracy = metrics.accuracy_score(y_test, model.predict(scaled_X_test))
                labels = df["ACTIVITY"].unique(maintain_order=True)
                plot_path = PLOTS_DIR / "tmp/confusion_matrix.png"
                cm = evaluate.confusion_matrix(model, labels, scaled_X_test, y_test, plot_path)

                # Log the hyperparameters together with the feature-engineering settings
                params["window"] = window
                params["split"] = split
                mlflow.log_params(params)

                # Log metrics and the confusion-matrix plot written above
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_artifact(plot_path)

                # Set a tag that we can use to remind ourselves what this run was for
                mlflow.set_tag("Training Info", "Basic RF model for labelled test data")

                # Infer the model signature
                signature = infer_signature(
                    scaled_X_train, model.predict(scaled_X_train)
                )

                # Log the model; the example is sliced inline so that no
                # long-lived reference to the training data outlives the call
                mlflow.sklearn.log_model(
                    sk_model=model,
                    artifact_path="rf_model",
                    signature=signature,
                    input_example=scaled_X_train[:INPUT_EXAMPLE_ROWS],
                )

                # Explicitly delete objects to free memory before the next combination
                del df, X_train, X_test, y_train, y_test, scaled_X_train, scaled_X_test, model, cm
                gc.collect()  # Run garbage collection

                                              
Windows:   0%|          | 0/1 [00:00<?, ?it/s]--- Logging error in Loguru Handler #1 ---
Record was: {'elapsed': datetime.timedelta(seconds=3578, microseconds=539581), 'exception': None, 'extra': {}, 'file': (name='features.py', path='/Users/tomwilson/code/LISA/lisa/features.py'), 'function': 'sliding_window', 'level': (name='INFO', no=20, icon='ℹ️'), 'line': 134, 'message': 'Aggregating data...', 'module': 'features', 'name': 'lisa.features', 'process': (id=53608, name='MainProcess'), 'thread': (id=8256245568, name='MainThread'), 'time': datetime(2024, 9, 5, 12, 59, 51, 535830, tzinfo=datetime.timezone(datetime.timedelta(seconds=3600), 'BST'))}
Traceback (most recent call last):
  File "/Users/tomwilson/micromamba/envs/LISA/lib/python3.10/site-packages/loguru/_handler.py", line 206, in emit
    self._sink.write(str_record)
  File "/Users/tomwilson/micromamba/envs/LISA/lib/python3.10/site-packages/loguru/_simple_sinks.py", line 122, in wri

[32m2024-09-05 12:59:51.535[0m | [1mINFO    [0m | [36mlisa.features[0m:[36msliding_window[0m:[36m134[0m - [1mAggregating data...[0m


python(55583) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55585) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55590) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55591) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55592) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55594) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55595) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55596) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55597) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55598) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(55599) Malloc

: 

## Repeating the same experiment with logistic regression

In [4]:
from lisa.modeling.logistic_regression import logistic_regression

# Move to the parent of the directory the kernel started in. The starting
# directory is remembered the first time this runs, so re-executing the cell
# does not climb one level higher each time the way a bare os.chdir("..") does.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = Path.cwd()
os.chdir(_NOTEBOOK_START_DIR.parent)

input_path: Path = INTERIM_DATA_DIR / "labelled_test_data.csv"

original_df = pl.read_csv(input_path)

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment("LR Test")

# Number of training rows stored alongside the model as its input example.
# A handful is enough to document the expected input; passing the whole
# training set would serialise all of it into the model artifact.
INPUT_EXAMPLE_ROWS = 5

# One parent run groups a nested child run per (window, split) combination
with mlflow.start_run() as parent_run:
    windows = [300]
    splits = [0.8]
    for window in tqdm(windows, desc="Windows"):
        for split in tqdm(splits, desc="Splits"):
            with mlflow.start_run(nested=True, run_name=f"W_{window}:S_{split}"):

                df = sliding_window(original_df, period=window, log=True)

                X_train, X_test, y_train, y_test = train_test_split(
                    df, train_size=split, gap=window
                )

                scaled_X_train, scaled_X_test, scaler = standard_scaler(X_train, X_test)

                model = logistic_regression(scaled_X_train, y_train)

                accuracy = metrics.accuracy_score(y_test, model.predict(scaled_X_test))
                labels = df["ACTIVITY"].unique(maintain_order=True)
                plot_path = PLOTS_DIR / "tmp/confusion_matrix.png"
                cm = evaluate.confusion_matrix(model, labels, scaled_X_test, y_test, plot_path)

                # Log the feature-engineering settings used for this run
                params = {}
                params["window"] = window
                params["split"] = split
                mlflow.log_params(params)

                # Log metrics and the confusion-matrix plot written above
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_artifact(plot_path)

                # Set a tag that we can use to remind ourselves what this run was for
                mlflow.set_tag("Training Info", "Basic LR model for labelled test data")

                # Infer the model signature
                signature = infer_signature(
                    scaled_X_train, model.predict(scaled_X_train)
                )

                # Log the model; the example is sliced inline so that no
                # long-lived reference to the training data outlives the call
                mlflow.sklearn.log_model(
                    sk_model=model,
                    artifact_path="lr_model",
                    signature=signature,
                    input_example=scaled_X_train[:INPUT_EXAMPLE_ROWS],
                )

                # Explicitly delete objects to free memory before the next combination
                del df, X_train, X_test, y_train, y_test, scaled_X_train, scaled_X_test, model, cm
                gc.collect()  # Run garbage collection

Windows:   0%|          | 0/1 [00:00<?, ?it/s]
Windows:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2024-09-05 13:38:30.179[0m | [1mINFO    [0m | [36mlisa.features[0m:[36msliding_window[0m:[36m134[0m - [1mAggregating data...[0m
