# Step 1: Loading train, test


In [None]:
import pandas as pd
# Load the modelling-ready training data; the first CSV column becomes the index.
train= pd.read_csv(r"C:\Users\Siwar\Research_project\Second approach\Datasets\3_Data_ready_for_modelling\train.csv", index_col=0)

# Parse the index as datetimes BEFORE sorting. read_csv is called without
# parse_dates, so sorting the raw index first would only be chronological
# when the stored date strings happen to sort lexicographically (ISO style).
# Work on a copy so `train` itself keeps its original index here.
train_v = train.copy()
train_v.index = pd.to_datetime(train_v.index)
train_v = train_v.sort_index()

# Determine train-validation split size: the first 94% of rows (by position
# in the date-sorted frame) stay in train, the remaining rows are held out.
train_size = int(0.94 * len(train_v))

# Hold out the tail of the date-sorted frame as the validation set, then
# order it by series and date before saving.
validation = train_v.iloc[train_size:]
validation = validation.sort_values(by=['Series', 'Date'])

validation.to_csv(r"C:\Users\Siwar\Research_project\Second approach\Datasets\3_Data_ready_for_modelling\validation_ready.csv")
validation 

In [None]:
# Keep the rows before the split point as the training set and order them
# by series, then by date, in a single chained expression.
train = train_v.iloc[:train_size].sort_values(by=['Series', 'Date'])

# Persist the training portion that remains after carving out validation.
train.to_csv(r"C:\Users\Siwar\Research_project\Second approach\Datasets\3_Data_ready_for_modelling\train_ready_after_vald.csv")
train 

In [None]:
# Read the test split; the first CSV column is used as the index.
test = pd.read_csv(
    r"C:\Users\Siwar\Research_project\Second approach\Datasets\2_Train_test_second_approach\test.csv",
    index_col=0,
)
test

In [None]:
# Replace literal dots in column names with underscores on all three splits.
# regex=False is passed explicitly: as a regular expression '.' matches any
# character, and the default value of `regex` differs between pandas
# versions, so relying on the default is ambiguous (and warns on pandas 1.x).
train.columns = train.columns.str.replace('.', '_', regex=False)
validation.columns = validation.columns.str.replace('.', '_', regex=False)
test.columns = test.columns.str.replace('.', '_', regex=False)

In [None]:
# Copy each frame's index into a regular 'Date' column and make sure that
# column holds datetimes. The same two steps are applied to every split.
for frame in (train, validation, test):
    frame['Date'] = frame.index
    frame['Date'] = pd.to_datetime(frame['Date'])

In [None]:
# Discard the existing index of every split in favour of a fresh 0..n-1
# range index (the old index is dropped, not kept as a column).
test, train, validation = (
    frame.reset_index(drop=True) for frame in (test, train, validation)
)

# Model

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Flatten, Concatenate
from kerastuner.tuners import RandomSearch

# Column the model is trained to predict.
target_col = "y"

# Per-product descriptive columns fed to the model alongside the sequence.
static_features = ["Market", "Length", "Grade_Type", "Width", "Height"]  # Static features

# Lagged price-difference columns followed by SMA trend columns, both built
# over the same list of window sizes.
lag_windows = [4, 12, 24, 48, 72, 96]
lagged_price_features = [f'price_lag_diff{i}' for i in lag_windows]
lagged_price_features += [f'price_trend_sma{i}' for i in lag_windows]

# Every training column whose name ends in "_lagged" is treated as an external factor.
external_features = list(filter(lambda name: name.endswith("_lagged"), train.columns))  # External factors

# Columns that make up each time step of the LSTM input window.
time_features = [*lagged_price_features, "Price"]  # time-dependent features

# Number of distinct series in the training data (used to size the embedding).
num_products = train["Series"].nunique()

# create sequences for LSTM model
def create_sequences(df, time_steps=4, time_cols=None, external_cols=None,
                     static_cols=None, target=None):
    """Build per-product sliding-window samples for the LSTM model.

    Rows are grouped by the "Series" column (in the order they appear in
    ``df``). For every product, each window of ``time_steps`` consecutive
    rows produces one sample; the external factors, static features and
    target are all read from the LAST row of that window.

    Args:
        df: Frame containing a "Series" column plus every column named in
            the column arguments below. Rows of one product are used in
            their existing order, so ``df`` should already be sorted in time.
        time_steps: Number of consecutive rows in each input window.
        time_cols: Columns forming each time step of the window. Defaults
            to the module-level ``time_features``.
        external_cols: External-factor columns. Defaults to the
            module-level ``external_features``.
        static_cols: Static product feature columns. Defaults to the
            module-level ``static_features``.
        target: Name of the target column. Defaults to the module-level
            ``target_col``.

    Returns:
        Tuple ``(X_seq, X_ext, X_prod, X_prod_features, y)`` of NumPy
        arrays, one entry per generated window:
        ``X_seq`` is (samples, time_steps, len(time_cols)),
        ``X_ext`` is (samples, len(external_cols)),
        ``X_prod`` is (samples,) holding the product id,
        ``X_prod_features`` is (samples, len(static_cols)),
        ``y`` is (samples,).
        Products with no more than ``time_steps`` rows contribute nothing.
    """
    # Fall back to the module-level feature configuration when not overridden.
    time_cols = time_features if time_cols is None else time_cols
    external_cols = external_features if external_cols is None else external_cols
    static_cols = static_features if static_cols is None else static_cols
    target = target_col if target is None else target

    X_seq, X_ext, X_prod, X_prod_features, y = [], [], [], [], []

    for product_id in df["Series"].unique():
        product_df = df[df["Series"] == product_id]

        # Extract each column group ONCE per product. Selecting columns
        # first keeps their common (numeric) dtype; reading a single row of
        # a mixed-dtype frame with .loc would instead yield object-dtype
        # arrays, which Keras cannot convert to tensors. It also avoids a
        # slow label lookup for every window.
        seq_values = product_df[list(time_cols)].to_numpy()
        ext_values = product_df[list(external_cols)].to_numpy()
        static_values = product_df[list(static_cols)].to_numpy()
        target_values = product_df[target].to_numpy()

        # NOTE(review): this yields len - time_steps windows, so the final
        # row of each product is never used as a target even though a full
        # window ends there. Kept as-is to preserve the sample count;
        # confirm whether `+ 1` is intended.
        for start in range(len(product_df) - time_steps):
            last = start + time_steps - 1  # row the sample is aligned to
            X_seq.append(seq_values[start:last + 1])  # Past values seq
            X_ext.append(ext_values[last])  # External factors at t
            X_prod.append(product_id)  # Product ID
            X_prod_features.append(static_values[last])  # Static product features
            y.append(target_values[last])  # Target value

    return np.array(X_seq), np.array(X_ext), np.array(X_prod), np.array(X_prod_features), np.array(y)

# Window every split with the default settings of create_sequences. Each
# call returns, in order: sequence windows, external factors, product ids,
# static product features and targets.
(X_seq_train, X_ext_train, X_prod_train,
 X_prod_features_train, y_train) = create_sequences(train)

(X_seq_val, X_ext_val, X_prod_val,
 X_prod_features_val, y_val) = create_sequences(validation)

(X_seq_test, X_ext_test, X_prod_test,
 X_prod_features_test, y_test) = create_sequences(test)

def build_lstm_model(hp):
    """Build and compile the multi-input price model for one tuner trial.

    The network has four inputs, sized from the module-level training
    arrays: the time window (fed to an LSTM), the external factors, the
    product id (fed to an embedding) and the static product features.
    Their representations are concatenated, passed through one ReLU dense
    layer and reduced to a single linear output.

    Args:
        hp: KerasTuner hyperparameter container. It supplies "lstm_units",
            "embedding_dim", "dense_units" and "learning_rate".

    Returns:
        A Keras ``Model`` compiled with MSE loss and the Adam optimizer.
    """
    # Input tensors, shaped after the training arrays.
    window_len, n_time_features = X_seq_train.shape[1], X_seq_train.shape[2]
    time_input = Input(shape=(window_len, n_time_features), name="time_input")
    external_input = Input(shape=(X_ext_train.shape[1],), name="external_input")
    product_input = Input(shape=(1,), name="product_input")
    product_features_input = Input(
        shape=(X_prod_features_train.shape[1],), name="product_features_input"
    )

    # Tunable sizes of the recurrent layer and of the product embedding.
    n_lstm = hp.Int("lstm_units", min_value=32, max_value=128, step=32)
    n_embed = hp.Int("embedding_dim", min_value=5, max_value=20, step=5)

    # Only the final LSTM state is kept as the summary of the window.
    sequence_summary = LSTM(n_lstm, return_sequences=False)(time_input)

    # Look up a learned vector per product id and flatten it to 1-D per sample.
    embedding_layer = Embedding(input_dim=num_products, output_dim=n_embed)
    product_vector = Flatten()(embedding_layer(product_input))

    # Join all four branches into one feature vector.
    branches = [sequence_summary, external_input, product_vector, product_features_input]
    combined = Concatenate()(branches)

    # Dense head: one tunable hidden layer, then a single linear output unit.
    n_dense = hp.Int("dense_units", min_value=32, max_value=128, step=32)
    hidden = Dense(n_dense, activation="relu")(combined)
    prediction = Dense(1)(hidden)

    net = Model(
        inputs=[time_input, external_input, product_input, product_features_input],
        outputs=prediction,
    )
    step_size = hp.Choice("learning_rate", [1e-3, 1e-4])
    net.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(step_size))

    return net

# Inputs are passed to Keras in the same order as the model's input list:
# time window, external factors, product id, static product features.
train_inputs = [X_seq_train, X_ext_train, X_prod_train, X_prod_features_train]
val_data = ([X_seq_val, X_ext_val, X_prod_val, X_prod_features_val], y_val)

# Hyperparameter tuning: random search over the space declared in
# build_lstm_model, ranking trials by validation loss.
tuner = RandomSearch(
    build_lstm_model,
    objective="val_loss",
    max_trials=10,
    executions_per_trial=1,
    directory="lstm_tuning",
    project_name="price_forecasting",
)
tuner.search(train_inputs, y_train, validation_data=val_data, epochs=20, batch_size=32)

# Rebuild a fresh (untrained) model from the best hyperparameter set found.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
model = tuner.hypermodel.build(best_hps)

# Train the final model for longer than the individual tuning trials.
model.fit(train_inputs, y_train, validation_data=val_data, epochs=50, batch_size=32)

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predict on test set
y_pred = model.predict([X_seq_test, X_ext_test, X_prod_test, X_prod_features_test])

# The model ends in a single-unit Dense layer, so predictions come back as a
# column of shape (n, 1) while y_test is (n,). Flatten both before doing
# arithmetic: subtracting the raw arrays would broadcast to an (n, n) matrix
# and the MAPE would be averaged over every true/predicted pair.
y_true_flat = np.ravel(y_test)
y_pred_flat = np.ravel(y_pred)

# metrics
mse = mean_squared_error(y_true_flat, y_pred_flat)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true_flat, y_pred_flat)
# NOTE: MAPE is undefined where the true value is 0 (division by zero).
mape = np.mean(np.abs((y_true_flat - y_pred_flat) / y_true_flat)) * 100

# results
print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"Test MAPE: {mape:.2f}%")
