In [2]:
import pandas as pd
import numpy as np
import datetime
import holidays
import pytz
from dateutil import parser

# Load the raw building control-command export
file_path = 'building_ctrl_cmd.csv'
df = pd.read_csv(file_path)

# Discard every row whose 'sourceid' differs from 'HC-AI200-AL103'
other_sources = df.index[(df['sourceid'] != 'HC-AI200-AL103').to_numpy()]
df = df.drop(index=other_sources)

# Floor-divide the raw stamps by 1e9 and read the quotient as seconds since the
# Unix epoch (presumably the CSV stores nanoseconds -- TODO confirm)
epoch_seconds = df['timestamp'] // 1e9
df['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')
def convert_to_finland(timestamp):
    """Convert a single timestamp to Europe/Helsinki local time.

    Parameters
    ----------
    timestamp : pandas.Timestamp or number
        A ``pandas.Timestamp`` is used as-is; a number is read as seconds
        since the Unix epoch. Timezone-naive values are interpreted as UTC,
        timezone-aware values are converted from whatever zone they carry.

    Returns
    -------
    pandas.Timestamp
        Timezone-aware timestamp expressed in Europe/Helsinki.
    """
    moment = pd.to_datetime(timestamp, unit='s', origin='unix')
    # tz_localize raises on values that already carry a timezone, so only
    # attach UTC when the value is naive.
    if moment.tzinfo is None:
        moment = moment.tz_localize('UTC')
    return moment.tz_convert('Europe/Helsinki')

# Convert the naive UTC stamps to Helsinki local time in one vectorised pass
# instead of calling a Python function once per row
df['timestampFinland'] = df['timestamp'].dt.tz_localize('UTC').dt.tz_convert('Europe/Helsinki')

# Keep the three columns used from here on, ordered chronologically with a
# fresh 0..n-1 index. kind='stable' keeps rows that share a timestamp in their
# existing relative order, so later "keep first" de-duplication is repeatable.
df = (
    df[['timestamp', 'timestampFinland', 'value']]
    .sort_values(by='timestampFinland', kind='stable')
    .reset_index(drop=True)
)

# Helsinki calendar date of every row
df['date'] = df['timestampFinland'].dt.date

In [3]:

# Period covered by the data: the 'date' at index label 0 and the 'date' in the
# last row (df is expected to be in chronological order at this point)
date_column = df['date']
start_date = date_column[0]
end_date = date_column.iloc[-1]

# One row per calendar day from start_date through end_date
df_holiday = pd.DataFrame({"date": pd.date_range(start=start_date, end=end_date)})

# Holiday calendars that have already been built, keyed by (country_code, sub_dev),
# so that calling detect_holiday once per row does not rebuild the calendar each time
_HOLIDAY_CALENDARS = {}


def detect_holiday(desired_date, country_code="FI", sub_dev=None):
    """Tell whether ``desired_date`` is a public holiday.

    Parameters
    ----------
    desired_date : date-like
        The day to look up; it is passed to the calendar's ``in`` test as-is.
    country_code : str, default "FI"
        Country code handed to the ``holidays`` package.
    sub_dev : str or None, default None
        Optional subdivision, forwarded as the ``subdiv`` argument.

    Returns
    -------
    bool
        True when the date is contained in the holiday calendar.
    """
    cache_key = (country_code, sub_dev)
    calendar = _HOLIDAY_CALENDARS.get(cache_key)
    if calendar is None:
        # `CountryHoliday` is the deprecated spelling of `country_holidays`;
        # prefer the current name and fall back only on releases that lack it.
        factory = getattr(holidays, "country_holidays", None) or holidays.CountryHoliday
        calendar = factory(country_code, subdiv=sub_dev)
        _HOLIDAY_CALENDARS[cache_key] = calendar

    return desired_date in calendar

# Evaluate detect_holiday (with its default arguments) for every day in the range
holiday_mask = df_holiday["date"].apply(detect_holiday)

# Retain only the days flagged as holidays; no helper column is left behind
df_holiday = df_holiday[holiday_mask].copy()


In [4]:
# Bucket every reading into its Helsinki wall-clock hour; the string round-trip
# yields a timezone-naive datetime at the top of that hour
hour_labels = df['timestampFinland'].dt.strftime("%Y-%m-%dT%H:00")
df['formattedTimestamp'] = pd.to_datetime(hour_labels)

df = df.rename(columns={'value': 'ctrlCmd'})

# Hour of the week: dayofweek (Monday = 0) * 24 + hour, i.e. 0 .. 167
df['hour_week'] = df['formattedTimestamp'].dt.dayofweek * 24 + df['formattedTimestamp'].dt.hour
# Flag hours 113..167 inclusive, i.e. Friday 17:00 through Sunday 23:00
in_weekend_window = df['hour_week'].between(113, 167)
df['isWeekend'] = in_weekend_window.astype(int)

# Normalise both 'date' columns to plain datetime.date objects so they can be compared
df_holiday['date'] = pd.to_datetime(df_holiday['date']).dt.date
df['date'] = pd.to_datetime(df['date']).dt.date

# 1 when the row's date appears among the holiday dates, otherwise 0
df['isHoliday'] = df['date'].isin(df_holiday['date']).astype(int)

# Keep a single row per hour bucket: the first one in the current row order
df = df.drop_duplicates(subset='formattedTimestamp')




In [5]:
# Load the live (observed) weather readings
file_path = 'weather_live.csv'
df_weather_live = pd.read_csv(file_path)

# Floor-divide the raw stamps by 1e9 and read the quotient as seconds since the
# Unix epoch (presumably the CSV stores nanoseconds -- TODO confirm)
df_weather_live['timestamp'] = pd.to_datetime(df_weather_live['timestamp'] // 1e9, unit='s')

# Vectorised UTC -> Helsinki conversion (no per-row Python call)
df_weather_live['timestampFinland'] = (
    df_weather_live['timestamp'].dt.tz_localize('UTC').dt.tz_convert('Europe/Helsinki')
)

# Hour bucket: timezone-naive Helsinki wall-clock time truncated to the hour
df_weather_live['formattedTimestamp'] = pd.to_datetime(
    df_weather_live['timestampFinland'].dt.strftime("%Y-%m-%dT%H:00")
)

# One stable chronological sort. Ordering by the exact timestamp also orders the
# hour buckets, and a stable sort guarantees that "keep first" below retains the
# earliest reading of each hour (ties keep their file order).
df_weather_live = (
    df_weather_live[['timestamp', 'timestampFinland', 'formattedTimestamp', 'value']]
    .sort_values(by='timestampFinland', kind='stable')
    .reset_index(drop=True)
)
df_weather_live['date'] = df_weather_live['timestampFinland'].dt.date
df_weather_live = df_weather_live.rename(columns={'value': 'liveTemp'})

# A single row per hour bucket
df_weather_live = df_weather_live.drop_duplicates(subset='formattedTimestamp', keep='first')

In [6]:
# Load the weather-forecast export
file_path = 'weather_forecast.csv'
df_weather_forecast = pd.read_csv(file_path)

# Floor-divide the raw stamps by 1e9 and read the quotient as seconds since the
# Unix epoch (presumably the CSV stores nanoseconds -- TODO confirm)
df_weather_forecast['timestamp'] = pd.to_datetime(df_weather_forecast['timestamp'] // 1e9, unit='s')

# Vectorised UTC -> Helsinki conversion (no per-row Python call)
df_weather_forecast['timestampFinland'] = (
    df_weather_forecast['timestamp'].dt.tz_localize('UTC').dt.tz_convert('Europe/Helsinki')
)

# Hour bucket: timezone-naive Helsinki wall-clock time truncated to the hour
df_weather_forecast['formattedTimestamp'] = pd.to_datetime(
    df_weather_forecast['timestampFinland'].dt.strftime("%Y-%m-%dT%H:00")
)

# Order by the exact timestamp with a stable sort before reducing to one row per
# hour, so the retained forecast is always the earliest one of that hour (ties
# keep their file order). The result has a clean 0..n-1 index and no helper columns.
df_weather_forecast = (
    df_weather_forecast
    .sort_values(by='timestampFinland', kind='stable')
    .loc[:, ['formattedTimestamp', 'value']]
    .rename(columns={'value': 'forecastTemp'})
    .drop_duplicates(subset='formattedTimestamp', keep='first')
    .reset_index(drop=True)
)

# Move every forecast value 12 rows earlier; the last 12 rows become NaN.
# NOTE(review): this is a positional shift -- it equals 12 hours only if the
# hourly rows have no gaps; confirm against the data.
df_weather_forecast['forecastTemp'] = df_weather_forecast['forecastTemp'].shift(-12)


In [7]:
# Join observations and forecasts on the hour bucket, discard every row that has
# a missing value in any column, and keep only the three columns needed later
df_weather_live = (
    pd.merge_ordered(df_weather_live, df_weather_forecast, on='formattedTimestamp')
    .dropna()
    [['formattedTimestamp', 'liveTemp', 'forecastTemp']]
)


In [8]:
# Observed temperature minus forecast temperature, per hour
df_weather_live = df_weather_live.assign(
    tempDiff=lambda weather: weather['liveTemp'] - weather['forecastTemp']
)

In [9]:
# Narrow the control-command frame to the columns used for the merge; every
# column not listed here is discarded
df = df.loc[:, ['formattedTimestamp', 'ctrlCmd', 'hour_week', 'isWeekend']]

In [10]:
# Combine control commands with the weather features on the shared hour bucket
# (merge_ordered defaults to an outer join, so unmatched hours appear with NaNs)
df = pd.merge_ordered(left=df, right=df_weather_live, on='formattedTimestamp')

In [11]:
# Discard every row that has a missing value in any column
df = df.dropna()

In [12]:
# Keep the modelling columns in a fixed order; 'hour_week' is left out from here on
df = df.loc[:, ['formattedTimestamp', 'ctrlCmd', 'liveTemp', 'forecastTemp', 'isWeekend', 'tempDiff']]


In [13]:
# For each row, the mean of the following three rows (a 3-row rolling mean moved
# up by three positions), rounded to 2 decimals; the final three rows get NaN.
# NOTE(review): "next three hours" holds only if consecutive rows are one hour
# apart -- confirm, since rows were dropped earlier.
df = df.assign(
    avg_liveTemp_next_three_hours=lambda frame: (
        frame['liveTemp'].rolling(window=3, min_periods=1).mean().shift(-3).round(2)
    ),
    avg_forecastTemp_next_three_hours=lambda frame: (
        frame['forecastTemp'].rolling(window=3, min_periods=1).mean().shift(-3).round(2)
    ),
    # Difference between the two look-ahead averages (live minus forecast)
    avg_tempDiff=lambda frame: (
        frame['avg_liveTemp_next_three_hours'] - frame['avg_forecastTemp_next_three_hours']
    ),
)

In [31]:
# Take an independent copy of the model columns so the edits below modify `x`
# only and do not raise SettingWithCopyWarning against `df`
x = df[['ctrlCmd', 'isWeekend', 'tempDiff', 'avg_tempDiff']].copy()

# Shift the weekend flag up by one (0/1 becomes 1/2)
x['isWeekend'] += 1


In [32]:
# Renumber the rows 0..n-1
x = x.reset_index(drop=True)

# Offset every control command by +4
x['ctrlCmd'] = x['ctrlCmd'] + 4

# Rows whose isWeekend equals 2 are multiplied by 5 (giving 10); others are unchanged
weekend_rows = x['isWeekend'] == 2
x.loc[weekend_rows, 'isWeekend'] = x.loc[weekend_rows, 'isWeekend'] * 5

# Round both temperature-difference columns to two decimals
x = x.assign(
    tempDiff=x['tempDiff'].round(2),
    avg_tempDiff=x['avg_tempDiff'].round(2),
)

In [33]:
# Distinct control-command values present in the frame, in order of first appearance
pd.unique(x['ctrlCmd'])

array([5., 6., 7., 4., 3., 2., 1.])

In [34]:
# Write the final feature table to disk without the row index
output_path = 'beedataML.csv'
x.to_csv(output_path, index=False)
