# Regression
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/yggdrasil-decision-forests/blob/main/documentation/public/docs/tutorial/regression.ipynb)


## Setup

In [None]:
pip install ydf -U

## What is regression?


**Regression** is the task of predicting a numerical value, such as a tally, a measure, or a quantity. For instance, predicting the age of an animal or the cost of a product are regression problems. By default, the output of a regression model is the expected value, that is, the value that minimizes the squared error.
Regression labels can be integers or float values.

## Training a regression model

The task of a model (e.g., classification, regression, ranking, uplifting) is determined by the learner argument `task`.


In [None]:
# Load libraries
import ydf  # Yggdrasil Decision Forests
import pandas as pd  # We use Pandas to load small datasets

# Fetch the Abalone dataset (used here for regression, with "Rings" as the
# label) and load it as a Pandas DataFrame.
ds_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset"
all_ds = pd.read_csv(f"{ds_path}/abalone.csv")

# Shuffle every row, then keep the first 70% of the shuffled rows for training
# and the remaining 30% for testing. The shuffle is not seeded, so the split
# differs from run to run.
all_ds = all_ds.sample(frac=1)
split_idx = (7 * len(all_ds)) // 10
train_ds, test_ds = all_ds.iloc[:split_idx], all_ds.iloc[split_idx:]

# Show the first 5 training examples
train_ds.head(5)

The label column is:

In [None]:
# Display the "Rings" column of the training split; it is the label the
# regression model is trained to predict.
train_ds["Rings"]

We can train a regression model:

In [None]:
# Configure a gradient boosted trees learner for a regression task on the
# "Rings" label, then fit it on the training split.
learner = ydf.GradientBoostedTreesLearner(
    label="Rings",
    task=ydf.Task.REGRESSION,
)
model = learner.train(train_ds)

Regression models are evaluated using RMSE (root mean square error).

In [None]:
# Evaluate the trained model on the test split (the 30% of rows not used for
# training).
evaluation = model.evaluate(test_ds)

# Print the evaluation in its plain-text form.
print(evaluation)

You can also display the evaluation object itself to get a richer report that includes plots.

In [None]:
# As the last expression of the cell, the evaluation object is rendered by the
# notebook instead of being printed as text.
evaluation

Get Started

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:

# --- Building chiller log -------------------------------------------------
df_csv = pd.read_csv("/content/drive/MyDrive/dataset/Building_X.csv")
# Timestamps in this file are day-first strings such as "31/12/2023 23:00".
df_csv["record_timestamp"] = pd.to_datetime(
    df_csv["record_timestamp"], format='%d/%m/%Y %H:%M'
)

# Column-name prefixes of the three chillers present in the log.
chiller_ids = ("CHR-01", "CHR-02", "CHR-03")

# Per chiller: CHWFWR * |CHWRWT - CHWSWT|.
# NOTE(review): presumably chilled-water flow rate times the return/supply
# temperature difference, with 4.19 being the specific heat of water —
# confirm the units against the data dictionary.
chiller_loads = [
    df_csv[f"{chiller}-CHWFWR"]
    * abs(df_csv[f"{chiller}-CHWRWT"] - df_csv[f"{chiller}-CHWSWT"])
    for chiller in chiller_ids
]
load_1, load_2, load_3 = chiller_loads
df_csv["total_cooling_load"] = 4.19 * (load_1 + load_2 + load_3)

# Sum of the three per-chiller KW readings.
chiller_kw = [df_csv[f"{chiller}-KW"] for chiller in chiller_ids]
kw_1, kw_2, kw_3 = chiller_kw
df_csv["total_power_consumption"] = kw_1 + kw_2 + kw_3

# Despite the column name, this is a bitmask rather than a count: chiller 1
# contributes 1, chiller 2 contributes 2 and chiller 3 contributes 4 whenever
# its KW reading is positive.
on_1, on_2, on_3 = [(kw > 0).astype(int) for kw in chiller_kw]
df_csv["how_many_chiller"] = on_1 + 2 * on_2 + 4 * on_3

# --- Air temperature --------------------------------------------------------
temp_csv = pd.read_csv("/content/drive/MyDrive/dataset/air_temperature_merged_sampled_10T.csv")
# Parse "date_time" into the shared join key, then discard the raw string.
temp_csv["record_timestamp"] = pd.to_datetime(
    temp_csv["date_time"], format='%Y-%m-%d %H:%M:%S'
)
temp_csv = temp_csv.drop(columns="date_time")
# NOTE(review): unlike the humidity table below, no "Unnamed: 0" column is
# dropped here — verify the temperature file does not carry one.

# --- Humidity ---------------------------------------------------------------
humi_csv = pd.read_csv('/content/drive/MyDrive/dataset/humidity_merged_sampled_linear_10T.csv')
humi_csv["record_timestamp"] = pd.to_datetime(
    humi_csv["date_time"], format='%Y-%m-%d %H:%M:%S'
)
humi_csv = humi_csv.drop(columns=["date_time", "Unnamed: 0"])
# Rename any "Temperature" in the column names to "Humidity" so the humidity
# columns do not clash with the temperature table when both are merged.
humi_csv.columns = humi_csv.columns.str.replace("Temperature", "Humidity")

In [None]:
# Hourly time axis covering all of 2023. The cooling load, temperature and
# humidity tables are left-joined onto it, so every row is one hour and hours
# without a match get missing values.
all_ds = pd.DataFrame(pd.date_range(start='2023-01-01 00:00', end='2023-12-31 23:00', freq='h'), columns=['record_timestamp'])
all_ds = pd.merge(all_ds, df_csv[['record_timestamp', 'total_cooling_load']], how='left', on='record_timestamp')
all_ds = pd.merge(all_ds, temp_csv, how='left', on='record_timestamp')
all_ds = pd.merge(all_ds, humi_csv, how='left', on='record_timestamp')

# Calendar features derived from the timestamp. The comparisons are
# vectorized over the whole column instead of applying a Python lambda per
# row; the resulting boolean columns are the same.
timestamps = all_ds["record_timestamp"]
day_of_week = timestamps.dt.weekday  # Monday=0 ... Sunday=6
all_ds["month"] = timestamps.dt.month
all_ds["hour"] = timestamps.dt.hour
all_ds["weekday"] = day_of_week <= 4  # Monday to Friday
all_ds["weekend"] = day_of_week >= 5  # Saturday and Sunday

# Dates flagged as holidays in 2023; a row is a holiday when its calendar
# date appears in this list.
holidays = pd.to_datetime(['2023-01-02', '2023-01-23','2023-01-24','2023-01-25','2023-04-05','2023-04-07','2023-04-08','2023-04-10',
                           '2023-05-01','2023-05-26','2023-06-22','2023-07-01','2023-09-30','2023-10-02','2023-10-23','2023-12-25','2023-12-26']).date
all_ds['is_holiday'] = timestamps.dt.date.isin(holidays)

# The raw timestamp is removed so that only the derived calendar columns and
# the merged measurements remain.
all_ds = all_ds.drop("record_timestamp", axis=1)

# Keep only the hours for which a cooling-load value (the label) exists.
all_ds = all_ds[all_ds['total_cooling_load'].notna()]

In [None]:
from sklearn.model_selection import train_test_split

# Random 70/30 train/test split; the fixed seed makes the partition the same
# on every run.
train_dataset, test_dataset = train_test_split(all_ds, test_size=0.3, random_state=1234)

# Random-search tuner running 50 trials over the candidate values below.
tuner = ydf.RandomSearchTuner(num_trials=50)
search_space = {
    "shrinkage": [0.2, 0.1, 0.05],
    "subsample": [1.0, 0.9, 0.8],
    "max_depth": [3, 4, 5, 6],
}
# Register each hyperparameter in the order it is listed above.
for hyperparameter, candidates in search_space.items():
    tuner.choice(hyperparameter, candidates)

# Show the first 5 training examples
train_dataset.head(5)

In [None]:
# Display the "total_cooling_load" column of the training split; it is the
# label passed to the learner.
train_dataset["total_cooling_load"]

In [None]:
# Gradient boosted trees regressor for "total_cooling_load", built with
# num_trees=1000 and the random-search tuner, then fitted on the training
# split.
learner = ydf.GradientBoostedTreesLearner(
    label="total_cooling_load",
    task=ydf.Task.REGRESSION,
    num_trees=1000,
    tuner=tuner,
)
model = learner.train(train_dataset)

In [None]:
# Analyze the model's prediction for a single example: the first row of the
# test split (sliced with [:1] so it stays a one-row DataFrame).
model.analyze_prediction(test_dataset.iloc[:1])

In [None]:
# Evaluate the tuned model on the test split and let the notebook render the
# resulting evaluation object.
evaluation = model.evaluate(test_dataset)
evaluation

In [None]:
# Hourly time axis for the forecast window (1 to 30 January 2024), left-joined
# with the temperature and humidity tables on the timestamp.
predict_ds = pd.DataFrame(pd.date_range(start='2024-01-01 00:00', end='2024-01-30 23:00', freq='h'), columns=['record_timestamp'])
predict_ds = pd.merge(predict_ds, temp_csv, how='left', on='record_timestamp')
predict_ds = pd.merge(predict_ds, humi_csv, how='left', on='record_timestamp')

# Same calendar features as the training frame. The comparisons are
# vectorized over the whole column instead of applying a Python lambda per
# row; the resulting boolean columns are the same.
forecast_times = predict_ds["record_timestamp"]
day_of_week = forecast_times.dt.weekday  # Monday=0 ... Sunday=6
predict_ds["month"] = forecast_times.dt.month
predict_ds["hour"] = forecast_times.dt.hour
predict_ds["weekday"] = day_of_week <= 4  # Monday to Friday
predict_ds["weekend"] = day_of_week >= 5  # Saturday and Sunday

# Only 1 January 2024 is flagged as a holiday within the forecast window.
holidays = pd.to_datetime(['2024-01-01']).date
predict_ds['is_holiday'] = forecast_times.dt.date.isin(holidays)

# Render the assembled frame in the notebook.
predict_ds

In [None]:
# Remove the raw timestamp and append a "total_cooling_load" column filled
# with 0.
# NOTE(review): the zero column looks like a placeholder so the frame has the
# same columns as the training data — confirm whether predict() needs it.
predict_ds = (
    predict_ds
    .drop(columns="record_timestamp")
    .assign(total_cooling_load=0)
)

# Show the first 5 rows to be predicted
predict_ds.head(5)

In [None]:
# Run the model on the prediction frame and wrap the returned predictions in
# a DataFrame. The timestamp column was dropped earlier, so rows are
# identified only by their position.
result = pd.DataFrame(model.predict(predict_ds))

In [None]:
# Write the predictions to a CSV file on the mounted Google Drive path.
result.to_csv("/content/drive/MyDrive/dataset/predict02.csv")