In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hopsworks

In [None]:
from sqlalchemy import create_engine, inspect
from utils.database_utils import *
from utils.config_utils import *
from constants import *

# Build the database settings from the project config file
# (read_config / get_db_config come from the star imports above).
db_config = get_db_config(
    read_config(CONFIG_FILE_PATH)
)

# Connect, then wrap the resulting engine in a SQLAlchemy inspector so the
# schema can be queried.
engine = connect(db_config)
inspector = inspect(engine)

# Names of the tables reported by the inspector; the bare expression on the
# last line renders the list as the cell output.
table_names = inspector.get_table_names()
table_names

Connection to PostgreSQL successful


['routes_table',
 'routes_weather',
 'drivers_table',
 'trucks_table',
 'city_weather',
 'truck_schedule_table',
 'traffic_table']

In [3]:
# Human-readable descriptions for the features of every feature group.
# Authored compactly as {feature_group: {feature_name: description}} and then
# expanded below into the list-of-records layout
# [{"name": ..., "description": ...}, ...], preserving declaration order.
_descriptions_by_group = {
    "city_weather_fg": {
        "id": "unique identification for each weather record",
        "city_id": "unique identification for each city",
        "date": "date of the weather observation",
        "hour": "hour of the weather observation (military time, 0-2300)",
        "temp": "temperature at the time of the weather observation, in Fahrenheit",
        "wind_speed": "wind speed during the observation, in miles per hour",
        "description": "brief description of the weather condition (e.g., Sunny, Cloudy)",
        "precip": "precipitation level during the observation, in inches",
        "humidity": "humidity percentage during the observation",
        "visibility": "visibility in miles at the time of the observation",
        "pressure": "atmospheric pressure at the time of the observation, in millibars",
        "chanceofrain": "chance of rain during the observation, as a percentage",
        "chanceoffog": "chance of fog during the observation, as a percentage",
        "chanceofsnow": "chance of snow during the observation, as a percentage",
        "chanceofthunder": "chance of thunder during the observation, as a percentage",
        "event_time": "dummy event time for this weather record",
    },
    "drivers_table_fg": {
        "driver_id": "unique identification for each driver",
        "name": "name of the truck driver",
        "gender": "gender of the truck driver",
        "age": "age of the truck driver",
        "experience": "experience of the truck driver in years",
        "driving_style": "driving style of the truck driver, conservative or proactive",
        "ratings": "average rating of the truck driver on a scale of 1 to 10",
        "vehicle_no": "the number of the driver’s truck",
        "average_speed_mph": "average speed of the truck driver in miles per hour",
        "event_time": "dummy event time",
    },
    "trucks_table_fg": {
        "id": "unique identification for each truck record",
        "truck_id": "unique identification for each truck",
        "truck_age": "age of the truck in years",
        "load_capacity_pounds": "maximum load capacity of the truck in pounds (some values may be missing)",
        "mileage_mpg": "truck's fuel efficiency measured in miles per gallon",
        "fuel_type": "type of fuel used by the truck (e.g., gas, diesel)",
        "event_time": "the timestamp when the event or record was created",
    },
    "routes_table_fg": {
        "id": "unique identification for each route record",
        "route_id": "unique identification for each route",
        "origin_id": "unique identification for the origin city or location",
        "destination_id": "unique identification for the destination city or location",
        "distance": "distance between origin and destination in miles",
        "average_hours": "average travel time between origin and destination in hours",
        "event_time": "the timestamp when the event or record was created",
    },
    "traffic_table_fg": {
        "id": "unique identification for each route activity record",
        "route_id": "unique identification for each route",
        "date": "date of the route activity",
        "hour": "hour of the activity (military time, e.g., 500 = 5:00 AM)",
        "no_of_vehicles": "number of vehicles on the route during the recorded hour",
        "accident": "whether an accident occurred (0 for no accident, 1 for accident)",
        "event_time": "the timestamp when the event or record was created",
    },
    "truck_schedule_table_fg": {
        "id": "unique identification for each truck schedule record",
        "truck_id": "unique identification for each truck",
        "route_id": "unique identification for each route",
        "departure_date": "the departure date and time of the truck",
        "estimated_arrival": "the estimated arrival date and time of the truck",
        "delay": "whether the truck was delayed (0 for no delay, 1 for delayed)",
        "event_time": "the timestamp when the event or record was created",
    },
    "routes_weather_fg": {
        "id": "unique identification for each weather record on the route",
        "route_id": "unique identification for each route",
        "date": "date and time of the weather observation",
        "temp": "temperature at the time of the weather observation, in Fahrenheit",
        "wind_speed": "wind speed during the observation, in miles per hour",
        "description": "brief description of the weather condition (e.g., Sunny, Rain Shower)",
        "precip": "precipitation level during the observation, in inches",
        "humidity": "humidity percentage during the observation",
        "visibility": "visibility in miles at the time of the observation",
        "pressure": "atmospheric pressure at the time of the observation, in millibars",
        "chanceofrain": "chance of rain during the observation, as a percentage",
        "chanceoffog": "chance of fog during the observation, as a percentage",
        "chanceofsnow": "chance of snow during the observation, as a percentage",
        "chanceofthunder": "chance of thunder during the observation, as a percentage",
        "event_time": "the timestamp when the event or record was created",
    },
}

# Expand each {feature_name: description} mapping into the list of
# {"name", "description"} records that downstream cells iterate over.
feature_descriptions = {
    group_name: [
        {"name": feature_name, "description": feature_text}
        for feature_name, feature_text in features.items()
    ]
    for group_name, features in _descriptions_by_group.items()
}

In [None]:
import os

import hopsworks
import pandas as pd

# SECURITY: never hardcode the Hopsworks API key in the notebook. It is read
# from the HOPSWORKS_API_KEY environment variable instead. The key that was
# previously committed in this cell is exposed and should be revoked/rotated.
# If the variable is unset, None is passed and hopsworks.login() uses its own
# credential lookup.
# NOTE(review): confirm that fallback (key file / prompt) for the installed
# hopsworks version.
project = hopsworks.login(api_key_value=os.environ.get("HOPSWORKS_API_KEY"))
fs = project.get_feature_store()

# Feature groups are named after the source tables with an "_fg" suffix.
feature_groups = [table + '_fg' for table in table_names]

# Version of the feature groups to update. Pinned explicitly rather than
# relying on the client's implicit default; the statistics job names in the
# recorded output ("<name>_fg_1_compute_stats_...") show version 1 in use.
FEATURE_GROUP_VERSION = 1

# Statistics settings applied to every feature group (loop-invariant).
STATISTICS_CONFIG = {
    "enabled": True,
    "histograms": True,
    "correlations": True,
}

for fg_name in feature_groups:
    fg = fs.get_feature_group(fg_name, version=FEATURE_GROUP_VERSION)

    # Attach the per-feature descriptions; groups without an entry in
    # feature_descriptions are left unchanged.
    for desc in feature_descriptions.get(fg_name, []):
        fg.update_feature_description(desc["name"], desc["description"])

    # Assign a fresh copy so no dict object is shared between groups, persist
    # the config, then trigger the statistics computation job.
    fg.statistics_config = dict(STATISTICS_CONFIG)
    fg.update_statistics_config()
    fg.compute_statistics()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1022104
Connected. Call `.close()` to terminate connection gracefully.

Statistics Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1022104/jobs/named/routes_table_fg_1_compute_stats_26092024185237/executions

Statistics Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1022104/jobs/named/routes_weather_fg_1_compute_stats_26092024185345/executions

Statistics Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1022104/jobs/named/drivers_table_fg_1_compute_stats_26092024185518/executions

Statistics Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1022104/jobs/named/trucks_table_fg_1_compute_stats_26092024185623/executions

Statistics Job started successfully, you can follow the progress at