In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [None]:

import sys
import os

# Add the project root to sys.path (adjust the path as needed)
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)

# Now try to import the DataIngestion class
from src.components.data_ingestion import DataIngestion

# Initialize the DataIngestion object
ingestion = DataIngestion()

# Load data from the database
df_truck_schedule = ingestion.load_dataframe('truck_schedule_table')

# Display the data
print(df_truck_schedule.head())



In [3]:

df_truck_schedule = df

Unnamed: 0,truck_id,route_id,departure_date,estimated_arrival,delay
0,30312694,R-b236e347,2019-01-01 07:00:00,2019-01-01 13:13:12.000000000,0
1,59856374,R-29ea762e,2019-01-01 07:00:00,2019-01-02 04:01:12.000000000,0
2,12602955,R-a3d67783,2019-01-01 07:00:00,2019-01-01 07:45:36.000000000,0
3,46619422,R-31ec9310,2019-01-01 07:00:00,2019-01-01 20:46:48.000000000,0
4,10140178,R-a07c5dbd,2019-01-01 07:00:00,2019-01-01 21:34:11.999999999,0
...,...,...,...,...,...
12303,31047945,R-1484a7ea,2019-02-06 07:00:00,2019-02-13 22:40:12.000000000,0
12304,14758432,R-927cf900,2019-02-06 07:00:00,2019-02-13 18:52:48.000000000,1
12305,31370619,R-5a83ad98,2019-02-06 07:00:00,2019-02-14 08:01:48.000000000,1
12306,67332883,R-991530bc,2019-02-06 07:00:00,2019-02-13 11:14:24.000000000,1


In [4]:
df.shape

(12308, 5)

In [5]:
df.info

<bound method DataFrame.info of        truck_id    route_id       departure_date  \
0      30312694  R-b236e347  2019-01-01 07:00:00   
1      59856374  R-29ea762e  2019-01-01 07:00:00   
2      12602955  R-a3d67783  2019-01-01 07:00:00   
3      46619422  R-31ec9310  2019-01-01 07:00:00   
4      10140178  R-a07c5dbd  2019-01-01 07:00:00   
...         ...         ...                  ...   
12303  31047945  R-1484a7ea  2019-02-06 07:00:00   
12304  14758432  R-927cf900  2019-02-06 07:00:00   
12305  31370619  R-5a83ad98  2019-02-06 07:00:00   
12306  67332883  R-991530bc  2019-02-06 07:00:00   
12307  26630473  R-3f49bd43  2019-02-06 07:00:00   

                   estimated_arrival  delay  
0      2019-01-01 13:13:12.000000000      0  
1      2019-01-02 04:01:12.000000000      0  
2      2019-01-01 07:45:36.000000000      0  
3      2019-01-01 20:46:48.000000000      0  
4      2019-01-01 21:34:11.999999999      0  
...                              ...    ...  
12303  2019-02-13 22:

In [6]:
df.describe

<bound method NDFrame.describe of        truck_id    route_id       departure_date  \
0      30312694  R-b236e347  2019-01-01 07:00:00   
1      59856374  R-29ea762e  2019-01-01 07:00:00   
2      12602955  R-a3d67783  2019-01-01 07:00:00   
3      46619422  R-31ec9310  2019-01-01 07:00:00   
4      10140178  R-a07c5dbd  2019-01-01 07:00:00   
...         ...         ...                  ...   
12303  31047945  R-1484a7ea  2019-02-06 07:00:00   
12304  14758432  R-927cf900  2019-02-06 07:00:00   
12305  31370619  R-5a83ad98  2019-02-06 07:00:00   
12306  67332883  R-991530bc  2019-02-06 07:00:00   
12307  26630473  R-3f49bd43  2019-02-06 07:00:00   

                   estimated_arrival  delay  
0      2019-01-01 13:13:12.000000000      0  
1      2019-01-02 04:01:12.000000000      0  
2      2019-01-01 07:45:36.000000000      0  
3      2019-01-01 20:46:48.000000000      0  
4      2019-01-01 21:34:11.999999999      0  
...                              ...    ...  
12303  2019-02-13 2

In [7]:
df.head()

Unnamed: 0,truck_id,route_id,departure_date,estimated_arrival,delay
0,30312694,R-b236e347,2019-01-01 07:00:00,2019-01-01 13:13:12.000000000,0
1,59856374,R-29ea762e,2019-01-01 07:00:00,2019-01-02 04:01:12.000000000,0
2,12602955,R-a3d67783,2019-01-01 07:00:00,2019-01-01 07:45:36.000000000,0
3,46619422,R-31ec9310,2019-01-01 07:00:00,2019-01-01 20:46:48.000000000,0
4,10140178,R-a07c5dbd,2019-01-01 07:00:00,2019-01-01 21:34:11.999999999,0


In [8]:
#Checking the null values
print(df.isnull().sum())

truck_id             0
route_id             0
departure_date       0
estimated_arrival    0
delay                0
dtype: int64


In [9]:
#checking the duplicate values
df.duplicated().sum()

np.int64(0)

Hence no duplicate values

In [10]:
#Assessing the columns to remove outliers

df.delay.value_counts()

delay
0    8014
1    4294
Name: count, dtype: int64

In [11]:
df

Unnamed: 0,truck_id,route_id,departure_date,estimated_arrival,delay
0,30312694,R-b236e347,2019-01-01 07:00:00,2019-01-01 13:13:12.000000000,0
1,59856374,R-29ea762e,2019-01-01 07:00:00,2019-01-02 04:01:12.000000000,0
2,12602955,R-a3d67783,2019-01-01 07:00:00,2019-01-01 07:45:36.000000000,0
3,46619422,R-31ec9310,2019-01-01 07:00:00,2019-01-01 20:46:48.000000000,0
4,10140178,R-a07c5dbd,2019-01-01 07:00:00,2019-01-01 21:34:11.999999999,0
...,...,...,...,...,...
12303,31047945,R-1484a7ea,2019-02-06 07:00:00,2019-02-13 22:40:12.000000000,0
12304,14758432,R-927cf900,2019-02-06 07:00:00,2019-02-13 18:52:48.000000000,1
12305,31370619,R-5a83ad98,2019-02-06 07:00:00,2019-02-14 08:01:48.000000000,1
12306,67332883,R-991530bc,2019-02-06 07:00:00,2019-02-13 11:14:24.000000000,1


In [12]:
#Hence no columns to remove outliers from

In [13]:
#rename the df
df6 = df

# Adding a new index column with a different name
df6.insert(0, 'RowIndex', range(1, 1 + len(df6)))

# Display the DataFrame with the new index column
print(df6)

       RowIndex  truck_id    route_id       departure_date  \
0             1  30312694  R-b236e347  2019-01-01 07:00:00   
1             2  59856374  R-29ea762e  2019-01-01 07:00:00   
2             3  12602955  R-a3d67783  2019-01-01 07:00:00   
3             4  46619422  R-31ec9310  2019-01-01 07:00:00   
4             5  10140178  R-a07c5dbd  2019-01-01 07:00:00   
...         ...       ...         ...                  ...   
12303     12304  31047945  R-1484a7ea  2019-02-06 07:00:00   
12304     12305  14758432  R-927cf900  2019-02-06 07:00:00   
12305     12306  31370619  R-5a83ad98  2019-02-06 07:00:00   
12306     12307  67332883  R-991530bc  2019-02-06 07:00:00   
12307     12308  26630473  R-3f49bd43  2019-02-06 07:00:00   

                   estimated_arrival  delay  
0      2019-01-01 13:13:12.000000000      0  
1      2019-01-02 04:01:12.000000000      0  
2      2019-01-01 07:45:36.000000000      0  
3      2019-01-01 20:46:48.000000000      0  
4      2019-01-01 21:34:1

In [15]:
df6['event_time']= pd.to_datetime('2024-09-17')

In [16]:
df6

Unnamed: 0,RowIndex,truck_id,route_id,departure_date,estimated_arrival,delay,event_time
0,1,30312694,R-b236e347,2019-01-01 07:00:00,2019-01-01 13:13:12.000000000,0,2024-09-17
1,2,59856374,R-29ea762e,2019-01-01 07:00:00,2019-01-02 04:01:12.000000000,0,2024-09-17
2,3,12602955,R-a3d67783,2019-01-01 07:00:00,2019-01-01 07:45:36.000000000,0,2024-09-17
3,4,46619422,R-31ec9310,2019-01-01 07:00:00,2019-01-01 20:46:48.000000000,0,2024-09-17
4,5,10140178,R-a07c5dbd,2019-01-01 07:00:00,2019-01-01 21:34:11.999999999,0,2024-09-17
...,...,...,...,...,...,...,...
12303,12304,31047945,R-1484a7ea,2019-02-06 07:00:00,2019-02-13 22:40:12.000000000,0,2024-09-17
12304,12305,14758432,R-927cf900,2019-02-06 07:00:00,2019-02-13 18:52:48.000000000,1,2024-09-17
12305,12306,31370619,R-5a83ad98,2019-02-06 07:00:00,2019-02-14 08:01:48.000000000,1,2024-09-17
12306,12307,67332883,R-991530bc,2019-02-06 07:00:00,2019-02-13 11:14:24.000000000,1,2024-09-17


In [18]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12308 entries, 0 to 12307
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   RowIndex           12308 non-null  int64         
 1   truck_id           12308 non-null  int64         
 2   route_id           12308 non-null  object        
 3   departure_date     12308 non-null  object        
 4   estimated_arrival  12308 non-null  object        
 5   delay              12308 non-null  int64         
 6   event_time         12308 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 673.2+ KB


In [26]:
# Convert 'estimated_arrival' to datetime format if not already
df['estimated_arrival'] = pd.to_datetime(df['estimated_arrival'])

# Set minutes and seconds to '00:00:00' and format as 'YYYY-MM-DD HH:MM:SS'
df['estimated_arrival'] = df['estimated_arrival'].dt.floor('H')

# Verify the changes
print(df['estimated_arrival'].head())


0   2019-01-01 13:00:00
1   2019-01-02 04:00:00
2   2019-01-01 07:00:00
3   2019-01-01 20:00:00
4   2019-01-01 21:00:00
Name: estimated_arrival, dtype: datetime64[ns]


  df['estimated_arrival'] = df['estimated_arrival'].dt.floor('H')


In [27]:
df6

Unnamed: 0,RowIndex,truck_id,route_id,departure_date,estimated_arrival,delay,event_time
0,1,30312694,R-b236e347,2019-01-01 07:00:00,2019-01-01 13:00:00,0,2024-09-17
1,2,59856374,R-29ea762e,2019-01-01 07:00:00,2019-01-02 04:00:00,0,2024-09-17
2,3,12602955,R-a3d67783,2019-01-01 07:00:00,2019-01-01 07:00:00,0,2024-09-17
3,4,46619422,R-31ec9310,2019-01-01 07:00:00,2019-01-01 20:00:00,0,2024-09-17
4,5,10140178,R-a07c5dbd,2019-01-01 07:00:00,2019-01-01 21:00:00,0,2024-09-17
...,...,...,...,...,...,...,...
12303,12304,31047945,R-1484a7ea,2019-02-06 07:00:00,2019-02-13 22:00:00,0,2024-09-17
12304,12305,14758432,R-927cf900,2019-02-06 07:00:00,2019-02-13 18:00:00,1,2024-09-17
12305,12306,31370619,R-5a83ad98,2019-02-06 07:00:00,2019-02-14 08:00:00,1,2024-09-17
12306,12307,67332883,R-991530bc,2019-02-06 07:00:00,2019-02-13 11:00:00,1,2024-09-17


In [30]:
!pip install hopsworks




In [31]:
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1043610

Connected. Call `.close()` to terminate connection gracefully.


In [32]:
truck_schedule_fg = fs.get_or_create_feature_group(
    name="truck_schedule",
    version=1,
    description="truck_schedule",
    online_enabled=True,
    primary_key=['RowIndex'],
    event_time='event_time',
    
)

In [33]:
truck_schedule_fg.insert(df6)


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1043610/fs/1035337/fg/1204565


Uploading Dataframe: 0.00% |          | Rows 0/12308 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: truck_schedule_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/1043610/jobs/named/truck_schedule_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x24f54f43dd0>, None)

In [29]:
df6.to_csv('df6.csv', index=False)