In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/fingrid from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('fingrid',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: c:\Users\patri\VScodeProjects\25-ID2223-mlfs-book
Added the following directory to the PYTHONPATH: c:\Users\patri\VScodeProjects\25-ID2223-mlfs-book
HopsworksSettings initialized!


<span style="font-width:bold; font-size: 3rem; color:#333;">- Part 01: Feature Backfill for Fingrid Energy Consumption Data</span>


### <span style='color:#ff5f27'> Imports

In [2]:
import datetime
import requests
import pandas as pd
import numpy as np
import hopsworks
from datetime import datetime, timedelta
from pathlib import Path
import os
import time
import warnings
warnings.filterwarnings("ignore")

# Define today's date
today = datetime.today()

# Get API keys from settings
FINGRID_API_KEY = settings.FINGRID_KEY
if FINGRID_API_KEY is None:
    print("Warning: FINGRID_KEY not found in .env file. You'll need it to fetch energy data.")
else:
    # Convert SecretStr to plain string for use with requests library
    FINGRID_API_KEY = FINGRID_API_KEY.get_secret_value()

## <span style='color:#ff5f27'> STEP 1: Get your Fingrid API Token and Store it in .env file</span>

You need to get your Fingrid API key from https://data.fingrid.fi/en/

Once you have your API key, save it to the .env file in the root directory of your project:

 * mv .env.example .env
 * edit .env

In the .env file, add or update:

`FINGRID_KEY="put your Fingrid API KEY value in this string"`

In [3]:
project = hopsworks.login()

2026-01-01 20:41:52,160 INFO: Initializing external client
2026-01-01 20:41:52,161 INFO: Base URL: https://c.app.hopsworks.ai:443






2026-01-01 20:41:54,289 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1286359


## Hopsworks API Key

You need to have registered an account on app.hopsworks.ai.

Save the HOPSWORKS_API_KEY to the .env file in the root directory of your project:

 * mv .env.example .env
 * edit .env

In the .env file, update HOPSWORKS_API_KEY:

`HOPSWORKS_API_KEY="put API KEY value in this string"`

In [4]:
# Configuration
FINGRID_BASE_URL = "https://data.fingrid.fi/api/datasets"
DATASET_ID = "193"  # Electricity consumption in Finland (MW)

# Weather location (Helsinki coordinates as representative measurement point for Finland)
COUNTRY = "Finland"
CITY = "Helsinki"
LATITUDE = 60.1699
LONGITUDE = 24.9384

# Backfill configuration - how many days of historical data to download
BACKFILL_DAYS = 730  # 2 years of data for complete seasonal coverage
end_date = today
start_date = end_date - timedelta(days=BACKFILL_DAYS)

# Local storage configuration
DATA_DIR = f"{root_dir}/data"
os.makedirs(DATA_DIR, exist_ok=True)

print(f"Country: {COUNTRY}")
print(f"Weather measurement location: {CITY} ({LATITUDE}, {LONGITUDE})")
print(f"Backfill period: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
print(f"Local data directory: {DATA_DIR}")

Country: Finland
Weather measurement location: Helsinki (60.1699, 24.9384)
Backfill period: 2024-01-02 to 2026-01-01
Local data directory: c:\Users\patri\VScodeProjects\25-ID2223-mlfs-book/data


---

## <span style='color:#ff5f27'> STEP 2: Download Historical Energy Consumption Data from Fingrid</span>

We will fetch historical electricity consumption data from Fingrid's open data API.
The data is measured every 3 hours and represents Finland's nationwide electricity consumption in megawatts (MW).

In [5]:
def fetch_fingrid_data(dataset_id, start_date, end_date, api_key):
    """
    Fetch historical data from Fingrid API.
    
    Args:
        dataset_id: Fingrid dataset ID
        start_date: Start date (datetime)
        end_date: End date (datetime)
        api_key: Fingrid API key
    
    Returns:
        pandas DataFrame with timestamp and value columns
    """
    url = f"{FINGRID_BASE_URL}/{dataset_id}/data"
    
    params = {
        "startTime": start_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "endTime": end_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "format": "json",
        "pageSize": 20000  # Max records per request
    }
    
    headers = {
        "x-api-key": api_key,
        "Accept": "application/json"
    }
    
    print(f"Fetching energy data from {params['startTime']} to {params['endTime']}...")
    
    try:
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        
        if 'data' in data:
            df = pd.DataFrame(data['data'])
            print(f"Fetched {len(df)} records from Fingrid API")
            return df
        else:
            print(f"Warning: No 'data' field in response: {list(data.keys())}")
            return pd.DataFrame()
            
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

def fetch_weather_data(latitude, longitude, start_date, end_date):
    """
    Fetch historical weather data from Open-Meteo API.
    Free API - no key required!
    
    Args:
        latitude: Location latitude
        longitude: Location longitude
        start_date: Start date (datetime)
        end_date: End date (datetime)
    
    Returns:
        pandas DataFrame with weather variables
    """
    url = "https://archive-api.open-meteo.com/v1/archive"
    
    # Format dates for API
    start_str = start_date.strftime("%Y-%m-%d")
    end_str = end_date.strftime("%Y-%m-%d")
    
    # Weather variables optimized for energy forecasting
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_str,
        "end_date": end_str,
        "hourly": [
            "temperature_2m",
            "precipitation",
            "cloud_cover",
            "wind_speed_10m",
            "wind_speed_100m",
            "wind_direction_10m",
            "surface_pressure",
            "shortwave_radiation"
        ],
        "timezone": "Europe/Helsinki"
    }
    
    print(f"Fetching weather data from {start_str} to {end_str}...")
    
    try:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        
        if 'hourly' in data:
            # Convert to DataFrame
            hourly = data['hourly']
            df = pd.DataFrame({
                'date': pd.to_datetime(hourly['time']),
                'temperature_2m': hourly['temperature_2m'],
                'precipitation': hourly['precipitation'],
                'cloud_cover': hourly['cloud_cover'],
                'wind_speed_10m': hourly['wind_speed_10m'],
                'wind_speed_100m': hourly['wind_speed_100m'],
                'wind_direction_10m': hourly['wind_direction_10m'],
                'surface_pressure': hourly['surface_pressure'],
                'shortwave_radiation': hourly['shortwave_radiation']
            })
            
            print(f"Fetched {len(df)} hourly weather records")
            return df
        else:
            print(f"Warning: No 'hourly' field in response: {list(data.keys())}")
            return pd.DataFrame()
            
    except requests.exceptions.RequestException as e:
        print(f"Error fetching weather data: {e}")
        return pd.DataFrame()

print("Data fetching functions defined")

Data fetching functions defined


### <span style='color:#ff5f27'> Fetch Energy Data from Fingrid API</span>

We fetch data in chunks to respect API rate limits and avoid timeouts.

In [6]:
# Fetch data in chunks if needed (Fingrid API may have limits)
all_data = []
current_start = start_date
chunk_days = 7  # Fetch 7 days at a time

while current_start < end_date:
    current_end = min(current_start + timedelta(days=chunk_days), end_date)
    
    df_chunk = fetch_fingrid_data(
        dataset_id=DATASET_ID,
        start_date=current_start,
        end_date=current_end,
        api_key=FINGRID_API_KEY
    )
    
    if not df_chunk.empty:
        all_data.append(df_chunk)
    
    current_start = current_end
    
    # Add delay to respect rate limits
    if current_start < end_date:
        time.sleep(2)

# Combine all chunks
if all_data:
    df_raw = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal records fetched: {len(df_raw)}")
    df_raw.head()
else:
    raise ValueError("No data fetched. Check API key and date range.")

Fetching energy data from 2024-01-02T20:41:52Z to 2024-01-09T20:41:52Z...
Fetched 3360 records from Fingrid API
Fetching energy data from 2024-01-09T20:41:52Z to 2024-01-16T20:41:52Z...
Fetched 6420 records from Fingrid API
Fetching energy data from 2024-01-16T20:41:52Z to 2024-01-23T20:41:52Z...
Fetched 6678 records from Fingrid API
Fetching energy data from 2024-01-23T20:41:52Z to 2024-01-30T20:41:52Z...
Fetched 6718 records from Fingrid API
Fetching energy data from 2024-01-30T20:41:52Z to 2024-02-06T20:41:52Z...
Fetched 6712 records from Fingrid API
Fetching energy data from 2024-02-06T20:41:52Z to 2024-02-13T20:41:52Z...
Fetched 6716 records from Fingrid API
Fetching energy data from 2024-02-13T20:41:52Z to 2024-02-20T20:41:52Z...
Fetched 6664 records from Fingrid API
Fetching energy data from 2024-02-20T20:41:52Z to 2024-02-27T20:41:52Z...
Fetched 6716 records from Fingrid API
Fetching energy data from 2024-02-27T20:41:52Z to 2024-03-05T20:41:52Z...
Fetched 6553 records from Fing

---

## <span style='color:#ff5f27'> STEP 3: Download Historical Weather Data </span>

We download historical weather data for Helsinki (representative location for Finland) using the free Open-Meteo API.

The weather features we download are optimized for energy forecasting:
 * `temperature` - impacts heating/cooling demand
 * `precipitation` - affects hydropower generation
 * `wind speed` - important for wind power generation
 * `solar radiation` - affects solar power generation
 * `cloud cover` - impacts solar generation
 * `surface pressure` - general weather indicator

In [7]:
# Fetch weather data
df_weather_raw = fetch_weather_data(
    latitude=LATITUDE,
    longitude=LONGITUDE,
    start_date=start_date,
    end_date=end_date
)

if not df_weather_raw.empty:
    print(f"\nTotal weather records fetched: {len(df_weather_raw)}")
    print(f"Date range: {df_weather_raw['date'].min()} to {df_weather_raw['date'].max()}")
    df_weather_raw.head()
else:
    raise ValueError("No weather data fetched. Check date range and network connection.")

Fetching weather data from 2024-01-02 to 2026-01-01...
Fetched 17544 hourly weather records

Total weather records fetched: 17544
Date range: 2024-01-02 00:00:00 to 2026-01-01 23:00:00


## <span style='color:#ff5f27'> STEP 4: Process Weather Data </span>

We resample the hourly weather data to 3-hour intervals to match the frequency of the energy consumption data.

In [8]:
def process_weather_data(df):
    """
    Process weather data to match energy consumption frequency.
    Resample hourly data to 3-hour intervals to match Fingrid data.
    """
    df = df.copy()
    
    # Set date as index for resampling
    df.set_index('date', inplace=True)
    
    # Resample to 3-hour intervals
    df_resampled = df.resample('3H').agg({
        'temperature_2m': 'mean',
        'precipitation': 'sum',
        'cloud_cover': 'mean',
        'wind_speed_10m': 'mean',
        'wind_speed_100m': 'mean',
        'wind_direction_10m': 'mean',
        'surface_pressure': 'mean',
        'shortwave_radiation': 'mean'
    })
    
    # Reset index to make date a column
    df_resampled.reset_index(inplace=True)
    
    # Add country identifier
    df_resampled['country'] = COUNTRY
    
    # Drop any NaN rows
    df_resampled = df_resampled.dropna()
    
    return df_resampled

# Process the weather data
df_weather_processed = process_weather_data(df_weather_raw)

print(f"Weather data processed: {len(df_weather_processed)} records (3-hour intervals)")
df_weather_processed.head()

Weather data processed: 5848 records (3-hour intervals)


Unnamed: 0,date,temperature_2m,precipitation,cloud_cover,wind_speed_10m,wind_speed_100m,wind_direction_10m,surface_pressure,shortwave_radiation,country
0,2024-01-02 00:00:00,-15.666667,0.0,100.0,25.233333,38.633333,37.333333,1022.1,0.0,Finland
1,2024-01-02 03:00:00,-15.233333,0.0,100.0,26.666667,40.9,41.0,1021.9,0.0,Finland
2,2024-01-02 06:00:00,-15.066667,0.0,99.333333,25.966667,39.8,42.0,1021.866667,0.0,Finland
3,2024-01-02 09:00:00,-15.833333,0.0,100.0,24.033333,38.266667,39.0,1022.5,8.666667,Finland
4,2024-01-02 12:00:00,-15.133333,0.0,100.0,22.866667,36.4,39.0,1022.433333,41.0,Finland


---

## <span style='color:#ff5f27'> STEP 5: Process Energy Consumption Data </span>

We process the raw energy data and add temporal features to help with forecasting.

In [9]:
def process_energy_data(df):
    """
    Process and engineer features from raw Fingrid data.
    """
    df = df.copy()
    
    # Parse timestamp
    timestamp_col = 'start_time' if 'start_time' in df.columns else 'startTime'
    if timestamp_col not in df.columns:
        timestamp_col = df.columns[0]
    
    df['date'] = pd.to_datetime(df[timestamp_col])
    
    # Get consumption value
    value_col = 'value' if 'value' in df.columns else df.columns[1]
    df['consumption_mw'] = pd.to_numeric(df[value_col], errors='coerce')
    
    # Remove duplicates and sort
    df = df.drop_duplicates(subset=['date'])
    df = df.sort_values('date')
    
    # Extract temporal features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    df['week_of_year'] = df['date'].dt.isocalendar().week
    
    # Create lag features
    df['consumption_lag_1'] = df['consumption_mw'].shift(1)
    df['consumption_lag_2'] = df['consumption_mw'].shift(2)
    df['consumption_lag_8'] = df['consumption_mw'].shift(8)
    
    # Rolling statistics
    df['consumption_rolling_mean_24h'] = df['consumption_mw'].rolling(window=8, min_periods=1).mean()
    df['consumption_rolling_std_24h'] = df['consumption_mw'].rolling(window=8, min_periods=1).std()
    
    # Drop rows with NaN
    df = df.dropna(subset=['date', 'consumption_mw'])
    
    # Add country identifier
    df['country'] = COUNTRY
    
    # Select final columns
    feature_columns = [
        'date',
        'consumption_mw',
        'country',
        'year',
        'month',
        'day',
        'hour',
        'day_of_week',
        'is_weekend',
        'week_of_year',
        'consumption_lag_1',
        'consumption_lag_2',
        'consumption_lag_8',
        'consumption_rolling_mean_24h',
        'consumption_rolling_std_24h'
    ]
    
    df = df[feature_columns]
    
    return df

df_processed = process_energy_data(df_raw)

print(f"Data processed: {len(df_processed)} records")
df_processed.head()

Data processed: 383044 records


Unnamed: 0,date,consumption_mw,country,year,month,day,hour,day_of_week,is_weekend,week_of_year,consumption_lag_1,consumption_lag_2,consumption_lag_8,consumption_rolling_mean_24h,consumption_rolling_std_24h
3359,2024-01-02 20:43:00+00:00,14400.0,Finland,2024,1,2,20,1,0,1,,,,14400.0,
3358,2024-01-02 20:46:00+00:00,14366.0,Finland,2024,1,2,20,1,0,1,14400.0,,,14383.0,24.041631
3357,2024-01-02 20:49:00+00:00,14331.0,Finland,2024,1,2,20,1,0,1,14366.0,14400.0,,14365.666667,34.501208
3356,2024-01-02 20:52:00+00:00,14277.0,Finland,2024,1,2,20,1,0,1,14331.0,14366.0,,14343.5,52.526184
3355,2024-01-02 20:55:00+00:00,14385.0,Finland,2024,1,2,20,1,0,1,14277.0,14331.0,,14351.8,49.129421


---

## <span style='color:#ff5f27'> STEP 6: Define Data Validation Rules </span>

We define data validation rules (expectations) to ensure data quality before writing to Hopsworks.
This prevents garbage-in, garbage-out scenarios.

In [10]:
import great_expectations as ge

energy_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="energy_expectation_suite"
)

energy_expectation_suite.add_expectation(
    ge.core.ExpectationConfiguration(
        expectation_type="expect_column_min_to_be_between",
        kwargs={
            "column": "consumption_mw",
            "min_value": 0.0,
            "max_value": 20000.0,
            "strict_min": True
        }
    )
)

{"expectation_type": "expect_column_min_to_be_between", "kwargs": {"column": "consumption_mw", "min_value": 0.0, "max_value": 20000.0, "strict_min": true}, "meta": {}}

In [11]:
weather_expectation_suite = ge.core.ExpectationSuite(
    expectation_suite_name="weather_expectation_suite"
)

def expect_reasonable_weather(col, min_val, max_val):
    weather_expectation_suite.add_expectation(
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_min_to_be_between",
            kwargs={
                "column": col,
                "min_value": min_val,
                "max_value": max_val,
                "strict_min": True
            }
        )
    )

expect_reasonable_weather("temperature_2m", -50.0, 50.0)
expect_reasonable_weather("precipitation", 0.0, 500.0)
expect_reasonable_weather("wind_speed_10m", 0.0, 100.0)

---

## <span style='color:#ff5f27'> STEP 7: Save Processed Data Locally </span>

Before uploading to Hopsworks, save the processed dataframes as CSV files in the data directory for backup and offline analysis.

The files will be saved as:
 * `energy_consumption_finland.csv` - Energy consumption with features (2 years)
 * `weather_finland_historical.csv` - Historical weather data (2 years)

In [12]:
# Save processed data to CSV files in the data directory
# Using unique names to avoid conflicts with other project data
energy_file = f"{DATA_DIR}/energy_consumption_finland.csv"
weather_file = f"{DATA_DIR}/weather_finland_historical.csv"

df_processed.to_csv(energy_file, index=False)
df_weather_processed.to_csv(weather_file, index=False)

print(f"✓ Saved energy data to: {energy_file}")
print(f"  - {len(df_processed)} records from {df_processed['date'].min()} to {df_processed['date'].max()}")
print(f"✓ Saved weather data to: {weather_file}")
print(f"  - {len(df_weather_processed)} records from {df_weather_processed['date'].min()} to {df_weather_processed['date'].max()}")
print(f"\n✓ All data saved successfully in: {DATA_DIR}")

✓ Saved energy data to: c:\Users\patri\VScodeProjects\25-ID2223-mlfs-book/data/energy_consumption_finland.csv
  - 383044 records from 2024-01-02 20:43:00+00:00 to 2026-01-01 19:42:00+00:00
✓ Saved weather data to: c:\Users\patri\VScodeProjects\25-ID2223-mlfs-book/data/weather_finland_historical.csv
  - 5848 records from 2024-01-02 00:00:00 to 2026-01-01 21:00:00

✓ All data saved successfully in: c:\Users\patri\VScodeProjects\25-ID2223-mlfs-book/data


---

## <span style="color:#ff5f27;"> STEP 8: Connect to Hopsworks Feature Store</span>

Now we'll connect to Hopsworks and upload the processed data to feature groups.

In [13]:
fs = project.get_feature_store() 

try:
    secrets = project.get_secrets_api()
except AttributeError:
    try:
        secrets = project.get_secret_store()
    except AttributeError:
        secrets = hopsworks.get_secrets_api()

print(f"Connected to Hopsworks Feature Store: {fs.name}")

Connected to Hopsworks Feature Store: id2223_10_featurestore


## <span style='color:#ff5f27'> STEP 9: Create Feature Groups and Upload Data </span>

### <span style='color:#ff5f27'> Energy Consumption Data

In [14]:
energy_fg = fs.get_or_create_feature_group(
    name="energy_consumption",
    version=1,
    description="Historical electricity consumption data from Fingrid with temporal and lag features",
    primary_key=['country'],
    event_time='date',
    expectation_suite=energy_expectation_suite
)

In [15]:
energy_fg.insert(df_processed, write_options={"wait_for_job": True})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286359/fs/1284182/fg/1880554
2026-01-01 20:47:02,655 INFO: 	1 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286359/fs/1284182/fg/1880554


Uploading Dataframe: 100.00% |██████████| Rows 383044/383044 | Elapsed Time: 00:13 | Remaining Time: 00:00


Launching job: energy_consumption_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286359/jobs/named/energy_consumption_1_offline_fg_materialization/executions
2026-01-01 20:47:31,523 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2026-01-01 20:47:34,729 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-01 20:49:54,822 INFO: Waiting for log aggregation to finish.
2026-01-01 20:50:06,807 INFO: Execution finished successfully.


(Job('energy_consumption_1_offline_fg_materialization', 'SPARK'),
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "consumption_mw",
           "min_value": 0.0,
           "max_value": 20000.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 801843
         }
       },
       "result": {
         "observed_value": 5685.8,
         "element_count": 383044,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2026-01-01T07:47:02.000654Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     }
   ],
   "evaluation_parameters": {},
   "statistics": {
     "evaluated_expectations": 1,


In [16]:
energy_fg.update_feature_description("date", "Timestamp of measurement (3-hour intervals)")
energy_fg.update_feature_description("consumption_mw", "Electricity consumption in Finland (megawatts)")
energy_fg.update_feature_description("country", "Country where consumption was measured")
energy_fg.update_feature_description("year", "Year extracted from date")
energy_fg.update_feature_description("month", "Month of year (1-12)")
energy_fg.update_feature_description("day", "Day of month")
energy_fg.update_feature_description("hour", "Hour of day")
energy_fg.update_feature_description("day_of_week", "Day of week (0=Monday, 6=Sunday)")
energy_fg.update_feature_description("is_weekend", "1 if weekend, 0 if weekday")
energy_fg.update_feature_description("week_of_year", "Week number in the year")
energy_fg.update_feature_description("consumption_lag_1", "Consumption 3 hours ago")
energy_fg.update_feature_description("consumption_lag_2", "Consumption 6 hours ago")
energy_fg.update_feature_description("consumption_lag_8", "Consumption 24 hours ago")
energy_fg.update_feature_description("consumption_rolling_mean_24h", "24-hour rolling mean consumption")
energy_fg.update_feature_description("consumption_rolling_std_24h", "24-hour rolling standard deviation")

<hsfs.feature_group.FeatureGroup at 0x21898aff410>

### <span style='color:#ff5f27'> Weather Data


In [17]:
weather_fg = fs.get_or_create_feature_group(
    name="weather_finland",
    version=1,
    description="Historical weather data from Open-Meteo for Finland (measured in Helsinki)",
    primary_key=['country'],
    event_time='date',
    expectation_suite=weather_expectation_suite
)

In [18]:
weather_fg.insert(df_weather_processed, write_options={"wait_for_job": True})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1286359/fs/1284182/fg/1880556
2026-01-01 20:50:19,972 INFO: 	3 expectation(s) included in expectation_suite.
Validation failed.
Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/1286359/fs/1284182/fg/1880556


Uploading Dataframe: 100.00% |██████████| Rows 5848/5848 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: weather_finland_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1286359/jobs/named/weather_finland_1_offline_fg_materialization/executions
2026-01-01 20:50:37,980 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2026-01-01 20:50:41,157 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-01 20:52:25,936 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2026-01-01 20:52:26,099 INFO: Waiting for log aggregation to finish.
2026-01-01 20:52:34,771 INFO: Execution finished successfully.


(Job('weather_finland_1_offline_fg_materialization', 'SPARK'),
 {
   "success": false,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_min_to_be_between",
         "kwargs": {
           "column": "temperature_2m",
           "min_value": -50.0,
           "max_value": 50.0,
           "strict_min": true
         },
         "meta": {
           "expectationId": 801844
         }
       },
       "result": {
         "observed_value": -25.46666666666667,
         "element_count": 5848,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2026-01-01T07:50:19.000972Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expectation

In [19]:
weather_fg.update_feature_description("date", "Timestamp of weather measurement")
weather_fg.update_feature_description("country", "Country where weather is measured")
weather_fg.update_feature_description("temperature_2m", "Temperature in Celsius at 2m height")
weather_fg.update_feature_description("precipitation", "Precipitation in mm (3-hour total)")
weather_fg.update_feature_description("cloud_cover", "Cloud cover percentage (0-100)")
weather_fg.update_feature_description("wind_speed_10m", "Wind speed at 10m above ground (km/h)")
weather_fg.update_feature_description("wind_speed_100m", "Wind speed at 100m above ground (km/h) - important for wind turbines")
weather_fg.update_feature_description("wind_direction_10m", "Wind direction in degrees")
weather_fg.update_feature_description("surface_pressure", "Surface atmospheric pressure in hPa")
weather_fg.update_feature_description("shortwave_radiation", "Solar radiation in W/m²")

<hsfs.feature_group.FeatureGroup at 0x21895b7c150>

---

In [20]:
# Verify feature groups exist and have data
print("Energy Consumption Feature Group:")
print(f"  Name: {energy_fg.name}")
print(f"  Version: {energy_fg.version}")
print(f"  Features: {[f.name for f in energy_fg.features]}")

print("\nWeather Feature Group:")
print(f"  Name: {weather_fg.name}")
print(f"  Version: {weather_fg.version}")
print(f"  Features: {[f.name for f in weather_fg.features]}")

# Check if both have 'country' as join key
print("\nJoin keys:")
print(f"  Energy FG primary keys: {energy_fg.primary_key}")
print(f"  Weather FG primary keys: {weather_fg.primary_key}")
print("\n✓ Both feature groups created successfully!")

Energy Consumption Feature Group:
  Name: energy_consumption
  Version: 1
  Features: ['date', 'consumption_mw', 'country', 'year', 'month', 'day', 'hour', 'day_of_week', 'is_weekend', 'week_of_year', 'consumption_lag_1', 'consumption_lag_2', 'consumption_lag_8', 'consumption_rolling_mean_24h', 'consumption_rolling_std_24h']

Weather Feature Group:
  Name: weather_finland
  Version: 1
  Features: ['date', 'temperature_2m', 'precipitation', 'cloud_cover', 'wind_speed_10m', 'wind_speed_100m', 'wind_direction_10m', 'surface_pressure', 'shortwave_radiation', 'country']

Join keys:
  Energy FG primary keys: ['country']
  Weather FG primary keys: ['country']

✓ Both feature groups created successfully!


## <span style='color:#ff5f27'> Verify Feature Groups Created </span>

Let's verify both feature groups were created successfully.