<span style="font-weight:bold; font-size: 3rem; color:#333;">- Part 01: Feature Backfill for Stock Market Data</span>




## Imports

In [1]:
import datetime
import requests
import pandas as pd
import hopsworks
import datetime
from pathlib import Path
from functions import util
import json
import re
import os
import warnings
warnings.filterwarnings("ignore")

### To wipe out all features and models, run the cell below

In [2]:
# WARNING: this cell is intentionally commented out. Uncommenting it logs in to
# Hopsworks and calls util.purge_project on that project, which (per the heading
# above) wipes out all features and models. Only run it for a clean restart.
# If you haven't set the env variable 'HOPSWORKS_API_KEY', then uncomment the next line and enter your API key
# os.environ["HOPSWORKS_API_KEY"] = ""
# proj = hopsworks.login()
# util.purge_project(proj)

---

## Alphavantage API Key

In [3]:
# Location of the text file holding the Alphavantage API key.
api_key_file = '../data/alphavantage-api-key.txt'

# Project helper; its printed output reports whether the file was found.
util.check_file_path(api_key_file)

# Read the whole file and strip trailing whitespace (e.g. the final newline)
# so the key can be passed to the API unchanged.
ALPHAVANTAGE_API_KEY = Path(api_key_file).read_text().rstrip()

File successfully found at the path: ../data/alphavantage-api-key.txt


## Hopsworks API Key


In [4]:
# Load the Hopsworks API key from disk into the environment (trailing
# whitespace stripped), then use it to log in to the 'StockPrediction' project.
hopsworks_key_path = Path('../data/hopsworks-api-key.txt')
os.environ["HOPSWORKS_API_KEY"] = hopsworks_key_path.read_text().rstrip()

project = hopsworks.login(
    project='StockPrediction',
    api_key_value=os.environ["HOPSWORKS_API_KEY"],
)

# Bare expression so the notebook displays the project handle.
project

2025-01-05 17:12:32,020 INFO: Initializing external client
2025-01-05 17:12:32,020 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-05 17:12:34,499 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1205424


Project('StockPrediction', 'theresa.hoesl@t-online.de', 'Default project')

In [5]:
# Display the name of the project returned by hopsworks.login().
project.name

'StockPrediction'

In [6]:
# Store the Alphavantage key as the Hopsworks secret 'AV_API_KEY'.
secrets_api = hopsworks.get_secrets_api()
try:
    secrets_api.create_secret('AV_API_KEY', ALPHAVANTAGE_API_KEY)
except hopsworks.RestAPIError:
    # NOTE(review): presumably raised when the secret already exists - confirm.
    # In that case the value stored in Hopsworks replaces the one read from the
    # local file, so a changed local key is NOT picked up here.
    ALPHAVANTAGE_API_KEY = secrets_api.get_secret("AV_API_KEY").value

### Validate that ALPHAVANTAGE_API_KEY works

In [7]:
# Smoke-test the Alphavantage key by fetching a price for one symbol (SPOT).
# NOTE(review): util.get_stock_price is a project helper that presumably calls
# the Alphavantage HTTP API through `requests` - confirm which exceptions it
# can raise and extend the tuple below if needed.
try:
    sp_SPOT_df = util.get_stock_price('SPOT', ALPHAVANTAGE_API_KEY)
except (hopsworks.RestAPIError, requests.exceptions.RequestException):
    print("It looks like the ALPHAVANTAGE_API_KEY doesn't work for the symbol 'SPOT'. Is the API key correct? Is the ticker symbol correct?")
    # Re-raise so the cell stops on the real error. Otherwise execution would
    # continue to the line below and fail with a misleading NameError because
    # sp_SPOT_df was never assigned.
    raise

# Show the first rows of the fetched prices as a visual confirmation.
sp_SPOT_df.head()

Unnamed: 0,date,price
0,2025-01-03,466.690002


## Read CSV files into DataFrames

In [8]:
# Relative paths of the three price CSV files (Spotify, Google, Bitcoin/USD)
# that are loaded with pd.read_csv further down.
csv_file_SPOT, csv_file_GOOGL, csv_file_BTC = (
    "../data/daily_SPOT.csv",
    "../data/daily_GOOGL.csv",
    "../data/currency_daily_BTC_USD.csv",
)

In [9]:
# Load each CSV into its own DataFrame, in the order SPOT, GOOGL, BTC.
df_SPOT, df_GOOGL, df_BTC = (
    pd.read_csv(csv_path)
    for csv_path in (csv_file_SPOT, csv_file_GOOGL, csv_file_BTC)
)

## Data cleaning

In [10]:
def _to_closing_prices(raw_df):
    """Return a new frame reduced to the closing price per timestamp.

    Drops the 'open', 'high', 'low' and 'volume' columns, renames 'close' to
    'price', and parses 'timestamp' into pandas datetimes.

    The input frame is not modified, and columns that are already absent are
    ignored, so the cell can be re-run on its own output without raising
    KeyError (the in-place version failed on a second execution).
    """
    return (
        raw_df
        .drop(columns=['open', 'high', 'low', 'volume'], errors='ignore')
        .rename(columns={'close': 'price'})
        .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    )


# Apply the same cleaning to all three assets.
df_SPOT = _to_closing_prices(df_SPOT)
df_GOOGL = _to_closing_prices(df_GOOGL)
df_BTC = _to_closing_prices(df_BTC)

Unnamed: 0,timestamp,price
0,2024-12-16,484.90
1,2024-12-13,483.31
2,2024-12-12,480.11
3,2024-12-11,476.91
4,2024-12-10,471.58
...,...,...
1684,2018-04-09,150.00
1685,2018-04-06,147.92
1686,2018-04-05,143.99
1687,2018-04-04,144.22


In [11]:
# Remove every row that contains a missing value from each price frame.
df_SPOT = df_SPOT.dropna()
df_GOOGL = df_GOOGL.dropna()
df_BTC = df_BTC.dropna()

## Define Data Validation Rules 


In [12]:
import great_expectations as ge

# Rule: the smallest value in the "price" column must be at least 0.01,
# i.e. no zero or negative prices are accepted.
price_floor_expectation = ge.core.ExpectationConfiguration(
    expectation_type="expect_column_min_to_be_between",
    kwargs={
        "column": "price",
        "min_value": 0.01,  # lower bound only; no max_value is set
    },
)

# Expectation suite that is attached to the price feature groups.
sp_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="sp_expectation_suite"
)

# Last expression of the cell, so the registered expectation is displayed.
sp_expectation_suite.add_expectation(price_floor_expectation)


{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "price", "min_value": 0.01}, "meta": {}}

## Connect to Hopsworks

In [13]:
# Show the logged-in project object again before fetching its feature store.
project

Project('StockPrediction', 'theresa.hoesl@t-online.de', 'Default project')

In [14]:
# Handle to the project's feature store; used to create the feature groups.
fs = project.get_feature_store() 

## Create the Feature Groups and insert the DataFrames into them

In [21]:
# Settings shared by all three price feature groups.
#
# The primary key is the trading day ("timestamp"), not the price: "price" is
# the feature value itself and is not unique (the same closing price can occur
# on several days), so it cannot identify a row.
# NOTE(review): get_or_create_feature_group presumably returns an already
# existing version-1 group unchanged. If the groups were created earlier with
# primary_key=['price'], delete them (or bump `version`) for the corrected key
# to take effect - confirm against the Hopsworks documentation.
price_fg_settings = dict(
    version=1,
    primary_key=['timestamp'],
    event_time="timestamp",
    expectation_suite=sp_expectation_suite,  # validated on every insert
)

# One feature group per asset; names and descriptions are unchanged.
SPOT_fg = fs.get_or_create_feature_group(
    name='spot',
    description='Spotify Stock Prices',
    **price_fg_settings,
)

GOOGL_fg = fs.get_or_create_feature_group(
    name='googl',
    description='Google Stock Prices',
    **price_fg_settings,
)

BTC_fg = fs.get_or_create_feature_group(
    name='btc',
    description='Bitcoin Prices',
    **price_fg_settings,
)

In [22]:
# Upload each cleaned DataFrame into its feature group. Per the cell output,
# every insert first runs the attached expectation suite and then launches an
# offline materialization job.
SPOT_fg.insert(df_SPOT)
GOOGL_fg.insert(df_GOOGL)
# Last expression of the cell: its return value (job, validation report) is displayed.
BTC_fg.insert(df_BTC)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1205424/fs/1194062/fg/1394596
2025-01-05 17:16:52,232 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1205424/fs/1194062/fg/1394596


Uploading Dataframe: 100.00% |██████████| Rows 1689/1689 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: spot_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1205424/jobs/named/spot_1_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1205424/fs/1194062/fg/1393570
2025-01-05 17:17:08,194 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1205424/fs/1194062/fg/1393570


Uploading Dataframe: 100.00% |██████████| Rows 5117/5117 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: googl_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1205424/jobs/named/googl_1_offline_fg_materialization/executions
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1205424/fs/1194062/fg/1394597
2025-01-05 17:17:24,090 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1205424/fs/1194062/fg/1394597


Uploading Dataframe: 100.00% |██████████| Rows 350/350 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: btc_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1205424/jobs/named/btc_1_offline_fg_materialization/executions


(Job('btc_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "price",
           "min_value": 0.01
         },
         "meta": {
           "expectationId": 694459
         }
       },
       "result": {
         "observed_value": 39524.27,
         "element_count": 350,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2025-01-05T04:17:24.000090Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,
     "successful_expectations": 1,
     "unsuccessful_expectations": 0,
     "success_pe

#### Enter a description for each feature in the Feature Group

In [84]:
# Attach a human-readable description to both features ("timestamp", "price")
# of every feature group; the same wording is used for all three assets.
SPOT_fg.update_feature_description("timestamp", "Day of data")
SPOT_fg.update_feature_description("price", "closing price")

GOOGL_fg.update_feature_description("timestamp", "Day of data")
GOOGL_fg.update_feature_description("price", "closing price")

BTC_fg.update_feature_description("timestamp", "Day of data")
# Last expression of the cell: the returned feature group object is displayed.
BTC_fg.update_feature_description("price", "closing price")


<hsfs.feature_group.FeatureGroup at 0x26cbe4e7250>

## <span style="color:#ff5f27;">⏭️ **Next:** Part 02: Daily Feature Pipeline</span>


---