In [31]:
from datetime import datetime, timedelta
from feast import FeatureStore
import pandas as pd
import logging

FEATURE_SERVICE = "crypto_stats"
ENTITY = "symbol"
ENTITY_IDs = ["BTC/USD", "ETH/USD" ]
FROM = ""
TO = datetime.utcnow().timestamp()
FREQUENCY= "5T"
try:
    FROM = float(FROM)
except ValueError:
    FROM = (datetime.utcnow() - timedelta(minutes = 150 * 5)).timestamp()

entity_dfs = []
for entity_id in ENTITY_IDs:
    entity_df = pd.DataFrame.from_dict(
        {
            ENTITY: entity_id,
            "event_timestamp": [item for item in pd.date_range(datetime.fromtimestamp(FROM), datetime.utcnow(), freq=FREQUENCY)]
        }
    )
    entity_dfs.append(entity_df)

feature_store = FeatureStore("../platform/feature-store/feature_store/feature_repo")  # Initialize the feature store
feature_service = feature_store.get_feature_service(FEATURE_SERVICE)

training_dfs = []
for entity_df in entity_dfs:
    training_df = feature_store.get_historical_features(features=feature_service, entity_df=entity_df).to_df()
    training_df = training_df.set_index(pd.DatetimeIndex(training_df['event_timestamp']))
    training_dfs.append(training_df)

for i in range(1, len(training_dfs)):
    df = training_dfs[0].join(training_dfs[i], lsuffix="_"+ENTITY_IDs[0][:3].lower(), rsuffix="_"+ENTITY_IDs[i][:3].lower())
    
    

df = df.dropna()
df['y'] = df['close_btc'].shift(-1)
df = df.iloc[:-1]

first = df.index[0].timestamp()
last = df.index[-1].timestamp()


#logging.warning(df.head())
keys = [x for x in df.columns if "symbol" not in x and "timestamp" not in x]
df = df[keys].drop_duplicates()
logging.warning(df.shape)



In [17]:
df.shape

(150, 9)

In [19]:
df.columns

Index(['open_btc', 'high_btc', 'low_btc', 'close_btc', 'open_eth', 'high_eth',
       'low_eth', 'close_eth', 'y'],
      dtype='object')

In [40]:
from sklearn.model_selection import train_test_split
import pickle
import os
import argparse
import logging
import numpy as np
import pandas as pd
import mlflow
from mlflow.models.signature import infer_signature
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split

os.environ["AWS_ACCESS_KEY_ID"] = "mlflow"
os.environ["AWS_SECRET_ACCESS_KEY"] = "mlflow123"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = f"http://mlflow-minio.mlflow.svc.cluster.local:9000/"

mlflow.set_tracking_uri("http://mlflow.mlflow.svc.cluster.local")
mlflow.set_experiment("bitcoin-xgb-tests")

df.index = pd.to_datetime(df.index)
first = df.index[0].timestamp()
last = df.index[-1].timestamp()
keys = list(df.columns)
keys.remove("y")
X = df[keys]
y = df["y"]

random_state=int("42")
train_size = 0.9
X = df[['open_btc', 'high_btc', 'low_btc', 'close_btc', 'open_eth', 'high_eth', 'low_eth', 'close_eth']]
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=random_state)

np.random.seed(random_state)

def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

parameters = {
    'n_estimators': [600, 800, 1000],
    'learning_rate': [0.01, 0.2],
    'max_depth': [2, 3, 4],
    'gamma': [0.0001, 0.001],
    'random_state': [42]
}
model = XGBRegressor(objective='reg:squarederror')
clf = GridSearchCV(model, parameters, n_jobs=-1)
clf.fit(X_train, y_train)

with mlflow.start_run():
    mlflow.xgboost.autolog()
    model = XGBRegressor(**clf.best_params_, objective='reg:squarederror')
    model.fit(X_train, y_train, verbose=False)
    logging.warning(clf.best_params_)
    # Evaluate the best model with testing data.
    y_hat = model.predict(X_test)
    (rmse, mae, r2) = eval_metrics(y_test, y_hat)
    mlflow.log_param("data_from", first)
    mlflow.log_param("data_to", last)
    mlflow.log_param("feature_view", "crypto_stats")
    mlflow.log_param("framework", "xgboost")
    mlflow.log_param("random_state", random_state)
    mlflow.log_param("num_items", len(df))
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    model_signature = infer_signature(X_train, y_train)
    #info = mlflow.xgboost.log_model(model, "model", registered_model_name="BitcoinForecast", signature=model_signature)
    uri = mlflow.get_artifact_uri()
    storage_uri = f"{uri}/model"

2022/10/12 14:14:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '02bbb5bad7b34d4698a38b252169ce98', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


In [41]:
y_test, y_hat

(event_timestamp
 2022-10-12 05:27:09.345485    19087.22
 2022-10-12 00:52:09.345485    19094.23
 2022-10-12 09:12:09.345485    19156.71
 2022-10-12 05:52:09.345485    19111.91
 2022-10-12 05:42:09.345485    19107.92
 2022-10-12 01:57:09.345485    19059.97
 2022-10-12 04:42:09.345485    19107.10
 2022-10-12 11:07:09.345485    19141.88
 2022-10-12 05:02:09.345485    19089.50
 2022-10-12 06:12:09.345485    19135.16
 2022-10-12 08:32:09.345485    19130.80
 2022-10-12 00:22:09.345485    19118.54
 2022-10-12 02:22:09.345485    19065.58
 2022-10-12 00:07:09.345485    19032.20
 2022-10-12 00:57:09.345485    19087.94
 Name: y, dtype: float64,
 array([19086.85 , 19102.658, 19151.89 , 19125.459, 19081.594, 19056.05 ,
        19089.908, 19151.791, 19095.271, 19115.363, 19133.945, 19076.47 ,
        19053.371, 19057.771, 19093.463], dtype=float32))

In [42]:
rmse, mae, r2

(17.16312002823476, 13.240088541666774, 0.7157082243771994)