In [37]:
import pandas as pd
import numpy as np
import os
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import joblib
import boto3

In [10]:
# Select the named AWS credentials profile for this process.
# NOTE(review): presumably required so the S3 artifact store
# (s3://parkpulse-mlflow-artifacts, per the experiment listing below) is reachable — confirm.
os.environ["AWS_PROFILE"] = "park_pulse"

# Default MLflow tracking server (port 5000).
# NOTE(review): an EC2 public DNS name may change when the instance is
# stopped/started, so the standard MLFLOW_TRACKING_URI environment variable
# takes precedence when it is set; otherwise the original host is used.
TRACKING_SERVER_HOST = "ec2-54-90-62-154.compute-1.amazonaws.com"
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", f"http://{TRACKING_SERVER_HOST}:5000")
)

In [11]:
# Echo the tracking URI MLflow is currently configured with, as a sanity check.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")


tracking URI: 'http://ec2-54-90-62-154.compute-1.amazonaws.com:5000'


In [12]:
# List the experiments visible through the configured tracking URI; the
# returned list is displayed as the cell output.
mlflow.search_experiments()


[<Experiment: artifact_location='s3://parkpulse-mlflow-artifacts/0', creation_time=1717404637620, experiment_id='0', last_update_time=1717404637620, lifecycle_stage='active', name='Default', tags={}>]

In [13]:
# Make "version 1" the active experiment for the runs started below.
# The recorded output shows MLflow creating the experiment when it does not exist yet.
mlflow.set_experiment("version 1")

2024/06/03 12:02:33 INFO mlflow.tracking.fluent: Experiment with name 'version 1' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://parkpulse-mlflow-artifacts/1', creation_time=1717408953570, experiment_id='1', last_update_time=1717408953570, lifecycle_stage='active', name='version 1', tags={}>

In [14]:
# Load the input dataset from a CSV path relative to the notebook's working directory.
file_path = 'data/checkpoint5_neighbourhood_district_data.csv'
df = pd.read_csv(file_path)

In [15]:
# Initial column selection: the raw predictor columns and the target column.
# These are used next to restrict `df` to the columns of interest.
features = df[[
    'icon',
    'moon_phase',
    'neighbourhood_bcn',
    'precipitation',
    'visibility',
    'cloud_cover',
    'snow',
    'temp',
    'is_weekend',
]]
target = df['estimated_occupancy_rate']

In [16]:
# Keep only the selected feature columns plus the target, then discard every
# row that has a missing value in any of them.
df = df[[*features.columns, target.name]].dropna()

In [17]:
# Min-max scale the continuous weather columns in place on `df`.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# and no visible cell persists `scaler` — confirm how inference-time data is
# expected to be scaled.
scaler = MinMaxScaler()
scaled_columns = ['precipitation', 'visibility', 'cloud_cover', 'snow', 'temp']
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

In [18]:
# Per-column coefficients used to build the `weighted_occupancy` feature.
# NOTE(review): the coefficients add up to 1.15, not 1.0, so the result is a
# weighted sum rather than a true weighted average — confirm this is intended.
weights = {
    'estimated_occupancy_rate': 0.5,
    'precipitation': 0.15,
    'visibility': 0.1,
    'cloud_cover': 0.1,
    'is_weekend': 0.1,
    'snow': 0.1,
    'temp': 0.1,
}

In [19]:
# Build `weighted_occupancy` as the coefficient-weighted sum of the columns
# named in `weights`, accumulated in the dict's insertion order.
# NOTE(review): the sum includes `estimated_occupancy_rate` (the prediction
# target) at weight 0.5, and this column is later used as a model input —
# this looks like target leakage; confirm it is intended.
df['weighted_occupancy'] = sum(
    df[column] * coefficient for column, coefficient in weights.items()
)

In [20]:
# Final model inputs and target. The individual weather columns are replaced
# by the single engineered `weighted_occupancy` column.
features = df[[
    'icon',
    'moon_phase',
    'neighbourhood_bcn',
    'weighted_occupancy',
]]
target = df['estimated_occupancy_rate']

In [21]:
# Assemble the design matrix X and the target vector y from `df`
# (the actual train/test split happens in the next cell).
X, y = df[features.columns], df[target.name]

In [22]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the
# partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Column-wise preprocessing applied inside the model pipeline:
#   * 'num' — standardise the numeric inputs;
#   * 'cat' — one-hot encode the categorical inputs, ignoring categories not
#     seen during fit instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['weighted_occupancy', 'moon_phase']),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['icon', 'neighbourhood_bcn']),
])

In [24]:
# Enable MLflow's scikit-learn autologging for the fits performed below.
# Note that `objective` additionally logs params, metrics and the model by hand.
mlflow.sklearn.autolog()



In [33]:
def objective(params):
    """Hyperopt objective: train and evaluate one RandomForest pipeline in an MLflow run.

    Builds a pipeline from the shared module-level ``preprocessor`` and a
    ``RandomForestRegressor``, fits it on ``X_train``/``y_train``, scores it on
    ``X_test``/``y_test``, and logs the parameters, metrics, model and pickled
    preprocessor to the active MLflow experiment.

    Parameters
    ----------
    params : dict
        Sampled hyperparameters with keys ``n_estimators``, ``max_depth``,
        ``min_samples_split`` and ``min_samples_leaf``. Each value is cast to
        ``int`` before being passed to the regressor.

    Returns
    -------
    dict
        ``{'accuracy': r2, 'loss': mse, 'status': STATUS_OK}``; the ``loss``
        entry is the test-split mean squared error.

    NOTE(review): the loss is computed on ``X_test``, so the hyperparameter
    search is tuned against the held-out split. Consider cross-validating on
    ``X_train`` and keeping the test split for the final evaluation only.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "RandomForest")
        mlflow.log_params(params)

        # Create pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', RandomForestRegressor(
                n_estimators=int(params['n_estimators']),
                max_depth=int(params['max_depth']),
                min_samples_split=int(params['min_samples_split']),
                min_samples_leaf=int(params['min_samples_leaf']),
                random_state=42
            ))
        ])

        # Fit the model
        pipeline.fit(X_train, y_train)

        # Predict on the test set and score the predictions
        y_pred = pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)
        mlflow.sklearn.log_model(pipeline, artifact_path="models_pickle")

        # Save the preprocessor. Create the target directory first, because
        # joblib.dump does not create missing parent directories. Take the
        # step from the pipeline so the pickled object is the one fitted in
        # this run.
        preprocessor_filename = 'scales/preprocessor.pkl'
        os.makedirs(os.path.dirname(preprocessor_filename), exist_ok=True)
        joblib.dump(pipeline.named_steps['preprocessor'], preprocessor_filename)
        mlflow.log_artifact(local_path=preprocessor_filename, artifact_path="scales_pickle")
    return {'accuracy': r2, 'loss': mse, 'status': STATUS_OK}

In [34]:
# Hyperparameter search space: every entry is sampled with hp.quniform on a
# step of 1 between the (low, high) bounds listed here. `objective` casts the
# sampled values to int before use.
search_space = {
    label: hp.quniform(label, low, high, 1)
    for label, (low, high) in {
        'n_estimators': (100, 500),
        'max_depth': (1, 20),
        'min_samples_split': (2, 10),
        'min_samples_leaf': (1, 10),
    }.items()
}

In [38]:
# Run Hyperopt optimization: 50 TPE-guided evaluations of `objective`, with
# every evaluation recorded in `trials`.
trials = Trials()
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    # Seed the search so a re-run proposes the same sequence of trials.
    # NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator here; older
    # releases expect np.random.RandomState(42) instead — confirm the
    # installed version.
    rstate=np.random.default_rng(42),
)

# Print the best hyperparameters. The values come back as floats (see the
# recorded output), so cast them with int() before reusing them.
print("Best hyperparameters:", best_params)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 50/50 [22:43<00:00, 27.26s/trial, best loss: 0.0032213210243405527]
Best hyperparameters: {'max_depth': 18.0, 'min_samples_leaf': 1.0, 'min_samples_split': 3.0, 'n_estimators': 102.0}


#### Save testing data for further inference

In [42]:
# Persist the held-out feature rows. index=False drops the row index, so these
# rows pair with the ground-truth file written below by position only.
X_test.to_csv('data/test_inference_data.csv', index=False)

In [43]:
# Persist the matching held-out targets, also without the row index, so row i
# here corresponds to row i of the saved test features.
y_test.to_csv('data/groundtruth_inference_data.csv', index=False)