In [21]:
import pandas as pd
from cyclic_boosting.pipelines import pipeline_CBPoissonRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

DATA_PATH = "processed_data.csv"
TARGET_COLUMN = "Invoice Quantity"
CATEGORICAL_FEATURES = ["City", "Day", "Month", "Weekday", "Year"]

data = pd.read_csv(DATA_PATH)

X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

encoder = OrdinalEncoder()
X[CATEGORICAL_FEATURES] = encoder.fit_transform(X[CATEGORICAL_FEATURES])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = pipeline_CBPoissonRegressor()

model.fit(X_train, y_train)

predictions = model.predict(X_test)

print("First 5 predictions:", predictions[:5])

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE on test data:", rmse)

mae = mean_absolute_error(y_test, predictions)
print("MAE (forecast error) on test data:", mae)

r2 = r2_score(y_test, predictions)
print("R^2 on test data:", r2)

First 5 predictions: [ 5.13981081 16.38702779  5.77062828 50.52225002 12.40310958]
RMSE on test data: 25.17453509363759
MAE (forecast error) on test data: 13.608945878576469
R^2 on test data: 0.1779225895733586


## Now on PCA Data

In [None]:
import pandas as pd
from cyclic_boosting.pipelines import pipeline_CBLocationRegressor
from cyclic_boosting import flags
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

DATA_PATH = "pca_data.csv"
TARGET_COLUMN = "PC1"
CATEGORICAL_FEATURES = ["City"]

data = pd.read_csv(DATA_PATH)

X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

for col in CATEGORICAL_FEATURES:
    if col in X.columns:
        category_mapping = {cat: i for i, cat in enumerate(X[col].unique())}
        X[col] = X[col].map(category_mapping)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

CB_features = {}
for col in X.columns:
    if col in CATEGORICAL_FEATURES:
        CB_features[col] = flags.IS_UNORDERED
    else:  
        CB_features[col] = flags.IS_CONTINUOUS

model = pipeline_CBLocationRegressor(feature_properties=CB_features)

model.fit(X_train, y_train)

predictions = model.predict(X_test)

print("First 5 predictions:", predictions[:5])
print("First 5 actual values:", y_test.iloc[:5].values)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE on test data:", rmse)

r2 = r2_score(y_test, predictions)
print("R^2 on test data:", r2)

mae = mean_absolute_error(y_test, predictions)
print("MAE on test data:", mae)

First 5 predictions: [-2.20940304  1.18976492  2.04112583  1.31006032  3.48828839]
First 5 actual values: [-1.52394449  0.66334116  2.31102806  5.88201141  3.01592176]
RMSE on test data: 0.9978927257803214
R^2 on test data: 0.8203579713900189
MAE on test data: 0.7189452054625557


## Try on Sequential Data Monthly Aggregated

In [10]:
import numpy as np
import pandas as pd
import pickle
import os
from cyclic_boosting.pipelines import pipeline_CBLocationRegressor
from cyclic_boosting import flags
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

data_path = "../sequential_data"

X_train = np.load(os.path.join(data_path, "X_train.npy"))
y_train = np.load(os.path.join(data_path, "y_train.npy"))
X_test = np.load(os.path.join(data_path, "X_test.npy"))
y_test = np.load(os.path.join(data_path, "y_test.npy"))

with open(os.path.join(data_path, "feature_names.pkl"), "rb") as f:
    feature_names = pickle.load(f)

with open(os.path.join(data_path, "city_info.pkl"), "rb") as f:
    city_info = pickle.load(f)

X_train_2d = X_train[:, -1, :]
X_test_2d = X_test[:, -1, :]

X_train_df = pd.DataFrame(X_train_2d, columns=feature_names)
X_test_df = pd.DataFrame(X_test_2d, columns=feature_names)

CATEGORICAL_FEATURES = ["city"]

label_encoders = {}
for col in CATEGORICAL_FEATURES:
    if col in X_train_df.columns:
        le = LabelEncoder()
        X_train_df[col] = le.fit_transform(X_train_df[col].astype(str))
        X_test_df[col] = le.transform(X_test_df[col].astype(str))
        label_encoders[col] = le

CB_features = {}
for col in X_train_df.columns:
    if col in CATEGORICAL_FEATURES:
        CB_features[col] = flags.IS_UNORDERED
    else:
        CB_features[col] = flags.IS_CONTINUOUS

model = pipeline_CBLocationRegressor(feature_properties=CB_features)
model.fit(X_train_df, y_train)

predictions = model.predict(X_test_df)

print("First 5 predictions:", predictions[:5])
print("First 5 actual values:", y_test[:5])

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE on test data:", rmse)

r2 = r2_score(y_test, predictions)
print("R^2 on test data:", r2)

mae = mean_absolute_error(y_test, predictions)
print("MAE on test data:", mae)


First 5 predictions: [0.02310549 0.02603255 0.01519481 0.02578645 0.03062276]
First 5 actual values: [0.00138344 0.00345861 0.00161402 0.00069172 0.04611483]
RMSE on test data: 0.07637697872851641
R^2 on test data: -0.031038627982431954
MAE on test data: 0.047063352389125755
