# Step 1: Loading train, test


In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Dropout
from keras.callbacks import EarlyStopping
import keras_tuner as kt

# Metrics calculation
def calculate_metrics(y_true, y_pred):
    """Compute MAE, RMSE and MAPE for a set of predictions.

    Args:
        y_true: Observed values (list, NumPy array or pandas Series).
        y_pred: Predicted values with the same number of elements as
            ``y_true``.

    Returns:
        dict with the keys ``"MAE"``, ``"RMSE"`` and ``"MAPE"``. MAPE is
        expressed in percent and is averaged only over observations whose
        actual value is non-zero; it is NaN when every actual value is zero.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Flat float arrays: a pandas Series is compared by position rather than
    # by index label, and an (n, 1) prediction cannot broadcast to (n, n).
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_true and y_pred differ in length: {actual.size} != {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    errors = actual - predicted
    mae = float(np.mean(np.abs(errors)))
    # Computed directly rather than via mean_squared_error(squared=False):
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(np.mean(errors ** 2)))

    # A zero actual value would turn the whole MAPE into inf/NaN, so those
    # observations are left out of the percentage average.
    nonzero = actual != 0
    if nonzero.any():
        mape = float(np.mean(np.abs(errors[nonzero] / actual[nonzero])) * 100)
    else:
        mape = float("nan")

    return {"MAE": mae, "RMSE": rmse, "MAPE": mape}

# Plot results 
def plot_results(y_true, y_pred, model_name, dates):
    """Plot actual and predicted values against ``dates`` and show the figure.

    Args:
        y_true: Observed values, drawn with circle markers.
        y_pred: Predicted values, drawn as a dashed line.
        model_name: Text placed at the start of the figure title.
        dates: x-axis values shared by both series.
    """
    plt.figure(figsize=(10, 6))

    # Each entry is (values, keyword arguments for plt.plot).
    series_styles = (
        (y_true, {'label': 'Actual', 'marker': 'o'}),
        (y_pred, {'label': 'Predicted', 'linestyle': '--'}),
    )
    for values, style in series_styles:
        plt.plot(dates, values, **style)

    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.title(f'{model_name}: Actual vs Predicted')
    plt.xticks(rotation=45)
    plt.grid()
    plt.legend()
    plt.show()

# XGBoost Model
def train_xgboost(X_train, y_train, X_test, y_test, dates_train, dates_test):
    """Grid-search an XGBoost regressor, then score and plot it on both splits.

    Args:
        X_train, y_train: Features and target used for the grid search.
        X_test, y_test: Held-out features and target.
        dates_train, dates_test: x-axis values for the two plots.

    Returns:
        Tuple ``(metrics_train, metrics_test, best_model)`` where the metrics
        are the dicts returned by ``calculate_metrics`` and ``best_model`` is
        the search's ``best_estimator_``.
    """
    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
    }
    # NOTE(review): an integer cv gives unshuffled k-fold for regressors; if
    # the rows are time-ordered, some folds validate on data older than the
    # training rows -- confirm this is intended.
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(objective='reg:squarederror', n_jobs=-1),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=3,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    fitted = best_model.predict(X_train)
    forecast = best_model.predict(X_test)

    metrics_train = calculate_metrics(y_train, fitted)
    metrics_test = calculate_metrics(y_test, forecast)

    plot_results(y_train, fitted, "XGBoost (Train)", dates_train)
    plot_results(y_test, forecast, "XGBoost (Test)", dates_test)

    return metrics_train, metrics_test, best_model

# Random Forest Model
def train_random_forest(X_train, y_train, X_test, y_test, dates_train, dates_test):
    """Grid-search a random forest regressor, then score and plot both splits.

    Args:
        X_train, y_train: Features and target used for the grid search.
        X_test, y_test: Held-out features and target.
        dates_train, dates_test: x-axis values for the two plots.

    Returns:
        Tuple ``(metrics_train, metrics_test, best_model)`` where the metrics
        are the dicts returned by ``calculate_metrics`` and ``best_model`` is
        the search's ``best_estimator_``.
    """
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
    }
    # No random_state is set, so the selected forest can differ between runs.
    search = GridSearchCV(
        estimator=RandomForestRegressor(n_jobs=-1),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=3,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    fitted = best_model.predict(X_train)
    forecast = best_model.predict(X_test)

    metrics_train = calculate_metrics(y_train, fitted)
    metrics_test = calculate_metrics(y_test, forecast)

    plot_results(y_train, fitted, "Random Forest (Train)", dates_train)
    plot_results(y_test, forecast, "Random Forest (Test)", dates_test)

    return metrics_train, metrics_test, best_model

# LightGBM Model
def train_lightgbm(X_train, y_train, X_test, y_test, dates_train, dates_test):
    """Grid-search a LightGBM regressor, then score and plot it on both splits.

    Args:
        X_train, y_train: Features and target used for the grid search.
        X_test, y_test: Held-out features and target.
        dates_train, dates_test: x-axis values for the two plots.

    Returns:
        Tuple ``(metrics_train, metrics_test, best_model)`` where the metrics
        are the dicts returned by ``calculate_metrics`` and ``best_model`` is
        the search's ``best_estimator_``.
    """
    # 3 x 3 x 3 x 3 = 81 combinations, each fitted on 3 folds.
    search_space = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'num_leaves': [31, 50, 100],
        'min_data_in_leaf': [20, 50, 100],
    }
    search = GridSearchCV(
        estimator=lgb.LGBMRegressor(n_jobs=-1),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=3,
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    fitted = best_model.predict(X_train)
    forecast = best_model.predict(X_test)

    metrics_train = calculate_metrics(y_train, fitted)
    metrics_test = calculate_metrics(y_test, forecast)

    plot_results(y_train, fitted, "LightGBM (Train)", dates_train)
    plot_results(y_test, forecast, "LightGBM (Test)", dates_test)

    return metrics_train, metrics_test, best_model

# Deep Learning Model with Two Hidden Layers
def build_deep_learning_model(hp):
    """Build and compile a two-hidden-layer dense network for KerasTuner.

    The input width is read from the module-level ``X_train`` (this function
    receives only ``hp``), so ``X_train`` must be defined before the tuner
    calls it.

    Args:
        hp: KerasTuner hyperparameter container. Three values are drawn from
            it: ``units_1`` and ``units_2`` (32 to 128 in steps of 32) and
            ``optimizer`` (``'adam'`` or ``'rmsprop'``).

    Returns:
        A compiled ``Sequential`` model with one output unit, mean squared
        error as the loss and MAE as an extra metric.
    """
    n_features = X_train.shape[1]

    first_width = hp.Int('units_1', min_value=32, max_value=128, step=32)
    first_hidden = Dense(first_width, activation='relu', input_dim=n_features)
    first_dropout = Dropout(0.2)

    second_width = hp.Int('units_2', min_value=32, max_value=128, step=32)
    second_hidden = Dense(second_width, activation='relu')
    second_dropout = Dropout(0.2)

    output_layer = Dense(1)

    model = Sequential()
    for layer in (first_hidden, first_dropout, second_hidden, second_dropout, output_layer):
        model.add(layer)

    optimizer_name = hp.Choice('optimizer', values=['adam', 'rmsprop'])
    model.compile(optimizer=optimizer_name, loss='mean_squared_error', metrics=['mae'])
    return model

def train_deep_learning(X_train, y_train, X_test, y_test, dates_train, dates_test):
    """Tune, refine, score and plot the dense network on both splits.

    A Hyperband search over ``build_deep_learning_model`` picks the best
    model, which is then trained further with early stopping before being
    evaluated.

    Note that ``build_deep_learning_model`` takes its input width from the
    module-level ``X_train``, not from the ``X_train`` argument passed here;
    both must have the same number of columns.

    Args:
        X_train, y_train: Features and target used for tuning and training.
        X_test, y_test: Held-out features and target.
        dates_train, dates_test: x-axis values for the two plots.

    Returns:
        Tuple ``(metrics_train, metrics_test, best_model)`` where the metrics
        are the dicts returned by ``calculate_metrics``.
    """
    tuner = kt.Hyperband(
        build_deep_learning_model,
        objective='val_loss',
        max_epochs=10,
        factor=3,
        directory='kt_dir',
        project_name='deep_learning_tuning',
        # The directory/project name is the same on every call. Without
        # overwrite=True KerasTuner reloads the finished search left there by
        # an earlier call, so the "best" model would carry hyperparameters
        # and weights tuned on a different dataset.
        overwrite=True,
    )
    # NOTE(review): Hyperband appears to set the per-trial epoch count itself
    # (bounded by max_epochs), so epochs=50 may have no effect -- confirm.
    tuner.search(X_train, y_train, epochs=50, validation_split=0.2, verbose=0)

    best_model = tuner.get_best_models(num_models=1)[0]

    # Continue training the selected model; stop once val_loss has not
    # improved for 20 epochs and roll back to the best weights seen.
    early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    best_model.fit(
        X_train,
        y_train,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        verbose=0,
        callbacks=[early_stop],
    )

    # predict() returns a 2-D array; flatten it to line up with the 1-D targets.
    y_pred_train = best_model.predict(X_train).flatten()
    y_pred_test = best_model.predict(X_test).flatten()

    metrics_train = calculate_metrics(y_train, y_pred_train)
    metrics_test = calculate_metrics(y_test, y_pred_test)

    plot_results(y_train, y_pred_train, "Deep Learning (Train)", dates_train)
    plot_results(y_test, y_pred_test, "Deep Learning (Test)", dates_test)

    return metrics_train, metrics_test, best_model

def train_models(X_train, y_train, X_test, y_test, dates_train, dates_test, i):
    """Train every model on one series and tabulate the resulting metrics.

    The models are run in the order XGBoost, Random Forest, LightGBM, Deep
    Learning; a progress line is printed before each one.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target.
        dates_train, dates_test: x-axis values forwarded to the plots.
        i: Series number, written to the ``Series_Number`` column.

    Returns:
        Tuple ``(df_results_train, df_results_test)``. Each DataFrame has one
        row per model, with the model name in the ``index`` column, one
        column per metric, and ``Series_Number``.
    """
    trainers = (
        ("XGBoost", train_xgboost),
        ("Random Forest", train_random_forest),
        ("LightGBM", train_lightgbm),
        ("Deep Learning", train_deep_learning),
    )

    train_scores = {}
    test_scores = {}
    for label, trainer in trainers:
        print(f"Training {label}...")
        # The fitted model returned as the third element is not needed here.
        fit_metrics, holdout_metrics, _ = trainer(
            X_train, y_train, X_test, y_test, dates_train, dates_test
        )
        train_scores[label] = fit_metrics
        test_scores[label] = holdout_metrics

    # One row per model; reset_index() moves the model name into a column.
    tables = []
    for scores in (train_scores, test_scores):
        table = pd.DataFrame.from_dict(scores, orient='index').reset_index()
        table["Series_Number"] = i
        tables.append(table)

    df_results_train, df_results_test = tables
    return df_results_train, df_results_test



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



## MAIN

In [157]:
import pandas as pd

# Folder layout: per-series inputs live under Data_ready_for_modelling,
# metric tables are written to Results_metrics.
datasets_dir = "C:\\Users\\Siwar\\Research_project\\First approach\\Datasets"
modelling_dir = f"{datasets_dir}\\Data_ready_for_modelling"
metrics_dir = f"{datasets_dir}\\Results_metrics"

# Per-series metric tables, concatenated once the loop has finished.
all_results_train, all_results_test = [], []

for i in range(1, 129):
    # Input files for series i
    train_path = f"{modelling_dir}\\Train\\serie_{i}train.csv"
    test_path = f"{modelling_dir}\\Test\\serie_{i}test.csv"

    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Target is the 'y' column; every other column is used as a feature.
    # X_train must stay a module-level name: build_deep_learning_model reads it.
    y_train = train_data['y']
    y_test = test_data['y']
    X_train = train_data.drop(columns=['y'])
    X_test = test_data.drop(columns=['y'])

    # The row index of each file is used as the x-axis of the plots.
    results_train, results_test = train_models(
        X_train, y_train, X_test, y_test, train_data.index, test_data.index, i
    )

    # Write this series' metrics, then keep them for the combined tables.
    results_train.to_csv(f"{metrics_dir}\\results_train_series_{i}.csv", index=False)
    results_test.to_csv(f"{metrics_dir}\\results_test_series_{i}.csv", index=False)
    all_results_train.append(results_train)
    all_results_test.append(results_test)

# Stack all series into one table per split and save them.
final_results_train = pd.concat(all_results_train, ignore_index=True)
final_results_test = pd.concat(all_results_test, ignore_index=True)

final_results_train.to_csv(f"{metrics_dir}\\all_results_train.csv", index=False)
final_results_test.to_csv(f"{metrics_dir}\\all_results_test.csv", index=False)


Selected Features: ['price_trend_sma96', 'NewForSaleAct.USA_lag97', 'IndGlobalCAN_lag96', 'NewForSaleSAAR.USA_lag96', 'IndGlobalQc_lag96', 'Price', 'IndHPI_SAAR.USA_lag96', 'IndHPI_Act.USA_lag96', 'ConstSpendRes.USA_lag96', 'ConstSpendNonRes.USA_lag96', 'US_PPI_Lumber_lag96', 'IndNAHB.USA_lag90', 'MnfdHomesSAAR.USA_lag91', 'ExistForSaleInv.USA_lag96', 'HsSglSAAR.QC_lag96', 'ExistMthSply.USA_lag97', 'BdgPmtSglSAAR.USA_lag96', 'BdgPmtTotSAAR.USA_lag90', 'BD.CDN.RRB.DQ.YLD_lag96', 'CDN.AVG.3YTO5Y.AVG_lag89', 'CDN.AVG.OVER.10.AVG_lag95', 'BD.CDN.LONG.DQ.YLD_lag95', 'Dollar.US.dollar.CDN_lag96', 'TauxUSD_CAD_lag96', 'ValeurExport.CHN_lag96', 'US_Lbr_Cons_lag62', 'Taxes_exp_lag95', 'InventaireSciageRes.ON_lag96', 'US_Lbr_Exp_lag97', 'NewSoldSAAR.USA_lag96', 'HsSglSAAR.USA_lag94', 'HsTotSAAR.USA_lag96', 'NewMthSply.USA_lag94', 'HsSglSAAR.CAN_lag96', 'ExistSoldSAAR.USA_lag96', 'HsSglSAAR.AB_lag96', 'BD.CDN.5YR.DQ.YLD_lag95', 'TotShip.ON_lag96', 'BD.CDN.10YR.DQ.YLD_lag95', 'ValeurExport.PaysExp