In [None]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2,
# so edits to imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import sys 
import os

# Put the project root at the front of sys.path so the `src.*` imports below resolve.
# The location can be overridden with the NYC_TAXI_REPO_ROOT environment variable;
# the previously hard-coded path stays as the default so existing setups keep working.
repo_root = os.environ.get('NYC_TAXI_REPO_ROOT', 'E:\\repos\\NYC_tax_demand_predictor')

# Re-running this cell must not pile up duplicate entries: drop any earlier
# occurrence first, then insert at position 0 as before.
if repo_root in sys.path:
    sys.path.remove(repo_root)
sys.path.insert(0, repo_root)

In [None]:
import hopsworks
import src.config as config

In [None]:
# Show which Hopsworks project name the configuration module provides.
print(config.HOPSWORKS_PROJECT)

In [None]:
# Log in to Hopsworks with the project name and API key taken from src.config.
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT,
    api_key_value=config.HOPSWORKS_API_KEY,
)

# Handle to the feature store of the logged-in project.
feature_store = project.get_feature_store()

# Feature group selected by the configured name and version.
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)

In [None]:
# Create the feature view over all columns of the feature group.
# This stays best-effort (a failure is reported, not raised), but only
# `Exception` is caught so KeyboardInterrupt / SystemExit still propagate,
# and the underlying error is printed instead of assuming the view already exists.
try:
    feature_store.create_feature_view(
        name=config.FEATURE_VIEW_NAME,
        version=config.FEATURE_VIEW_VERSION,
        query=feature_group.select_all(),
    )
except Exception as error:
    print(f"Feature view not created (it may already exist). skipping creation: {error!r}")



In [None]:
# Fetch the feature view for the configured name/version, creating it from all
# columns of the feature group only when it does not exist yet. Unlike a plain
# create_feature_view call, this does not fail when the cell is run again.
feature_store.get_or_create_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION,
    query=feature_group.select_all(),
)

In [None]:
# Look up the feature view by the configured name and version.
feature_view = feature_store.get_feature_view(
    name=config.FEATURE_VIEW_NAME,
    version=config.FEATURE_VIEW_VERSION,
)

In [None]:
# Read the feature view's training data. The call returns a pair; only the
# first element is kept, the second is discarded.
ts_data, _ = feature_view.training_data(
    description='time series data - hourly taxi rides',
    statistics_config=False,
)

In [None]:
# Preview the first rows of the data read from the feature view.
ts_data.head()

In [None]:
# Order rows by pu_location, then pu_hour. The sorted frame is rebound to the
# same name instead of using inplace=True, so the object shown by the preview
# cell above is not mutated behind the reader's back.
ts_data = ts_data.sort_values(by=['pu_location', 'pu_hour'])

In [None]:
# Preview the first rows again, now in sorted order.
ts_data.head()

In [None]:
from src.data import load_raw_data,transform_to_ts_data

# Load the raw 2022 rides and convert them to time-series form in one step.
# NOTE(review): this rebinds `ts_data`, replacing the frame that was read from
# the feature view earlier in the notebook; confirm that is intended.
ts_data = transform_to_ts_data(load_raw_data(year=2022))

In [None]:
# Display the transformed time-series frame as the cell output.
ts_data

In [None]:
from src.data import transform_ts_data_to_features_and_targets

# Turn the time series into a (features, targets) pair.
# no_features = 24 * 28 is presumably 28 days of hourly values; confirm against
# transform_ts_data_to_features_and_targets.
features, targets = transform_ts_data_to_features_and_targets(
    ts_data=ts_data,
    no_features=24 * 28,
    step_size=23,
)

# Single table with every feature column plus a 'target' column.
# assign() works on a copy, so `features` itself is left unchanged.
features_and_targets = features.assign(target=targets)

print(f'shape {features_and_targets.shape}')


In [None]:
# Preview the first rows of the combined features/target table.
features_and_targets.head()

In [None]:
from datetime import date , timedelta
from src.data_split import train_test_split
import pandas as pd


# The cut-off is 200 days before today's date, so the split depends on the day
# the notebook is run.
# NOTE(review): the raw data loaded earlier in this notebook is for year=2022;
# confirm the cut-off still falls inside the data range, otherwise one side of
# the split may be empty.
cut_off_date = pd.to_datetime(date.today() - timedelta(days=200))

print(f'cutoff date {cut_off_date}')

# Split around the cut-off date; 'target' is the column separated out as y.
X_train, X_test, y_train, y_test = train_test_split(
    features_and_targets,
    cut_off_year=cut_off_date.year,
    cut_off_day=cut_off_date.day,
    cut_off_month=cut_off_date.month,
    target_column='target',
)

# Each label now matches the variable it reports (previously the X_test and
# y_train shapes were printed under each other's labels).
print(f'training features {X_train.shape}')
print(f'training targets {y_train.shape}')
print(f'testing features {X_test.shape}')
print(f'test target {y_test.shape}')

In [None]:
from src.model import training_pipeline
import numpy as np

import optuna
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean validation MAE over a 2-fold time-series split.

    Args:
        trial: Optuna trial used to draw ``num_leaves``, ``feature_fraction``,
            ``bagging_fraction`` and ``min_child_samples``.

    Returns:
        The mean of the per-fold mean absolute errors (lower is better).

    Reads the notebook-level ``X_train`` and ``y_train``; a new pipeline is
    built with ``training_pipeline`` and fitted for every fold.
    """
    # Fixed settings first, then the tuned values in a stable suggestion order.
    params = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    def _fold_error(fit_rows, val_rows):
        # Slice both frames by position before fitting, as one fold's train/validation sets.
        X_fit, X_val = X_train.iloc[fit_rows, :], X_train.iloc[val_rows, :]
        y_fit, y_val = y_train.iloc[fit_rows], y_train.iloc[val_rows]

        model = training_pipeline(**params)
        model.fit(X_fit, y_fit)
        return mae(y_val, model.predict(X_val))

    splitter = TimeSeriesSplit(n_splits=2)
    fold_errors = [
        _fold_error(fit_rows, val_rows)
        for fit_rows, val_rows in splitter.split(X_train)
    ]

    return np.array(fold_errors).mean()



In [None]:
# Minimise the objective over 3 trials. The sampler is seeded so the suggested
# hyperparameters are the same on every Restart & Run All.
optuna_study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
optuna_study.optimize(objective, n_trials=3)

In [None]:
# Keep the study's best hyperparameters for the final fit and show them as the cell output.
best_params = optuna_study.best_params
best_params

In [None]:
# Refit a fresh pipeline on the whole training split (X_train / y_train) using
# the best hyperparameters from the study.
# NOTE(review): best_params presumably holds only the values drawn via
# trial.suggest_*, so the fixed "metric"/"verbose" settings used inside
# `objective` are not passed here; confirm that is intended.

pipeline = training_pipeline(**best_params)
pipeline.fit(X_train,y_train)

In [None]:
# Show the dimensions of the test feature set.
X_test.shape


In [None]:
# Predict on the test features and report the mean absolute error against
# y_test, printed with four decimals.
predictions = pipeline.predict(X_test)
test_mae = mae(y_test, predictions)
print(f'{test_mae=:.4f}')

In [None]:
import joblib
from src.paths import MODEL_DIR

# Serialise the fitted pipeline to model_1.pkl under MODEL_DIR.
model_path = MODEL_DIR / 'model_1.pkl'
joblib.dump(pipeline, model_path)

In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Build the model schema from the training data: the input schema from X_train
# and the output schema from y_train.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema=input_schema, output_schema = output_schema)



In [None]:
import src.config as config
import hopsworks

# Log in to Hopsworks with the configured project name and API key.
# NOTE(review): a login with the same config values already happens near the
# top of this notebook and this rebinds `project`; confirm a second login is needed.
project =  hopsworks.login(
    project = config.HOPSWORKS_PROJECT,
    api_key_value= config.HOPSWORKS_API_KEY
)


In [None]:
# Handle to the model registry of the logged-in project.
model_registry = project.get_model_registry()

In [None]:
# Describe the model for the registry: name, test MAE metric, description,
# one randomly sampled training row as input example, and the model schema.
model = model_registry.sklearn.create_model(
    name="model_to_predict_taxi_demand_for_the_next_hour",
    metrics={"test_mae": test_mae},
    description='Lgbm regressor with optuna tuned hyperparameters',
    input_example=X_train.sample(),
    model_schema=model_schema,
)

# Save the registry entry, pointing it at the pickled pipeline under MODEL_DIR.
model.save(MODEL_DIR / 'model_1.pkl')