In [34]:
from pathlib import Path

import catboost as cb
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

In [35]:
def train_catboost(X_train, y_train, X_val, y_val, params, cat_features=None,
                   model_path="catboost_model.pkl"):
    """
    Train a CatBoostRegressor, persist it, and predict on the validation data.

    Parameters:
        X_train (array-like or DataFrame): Training features.
        y_train (array-like or Series): Training target values.
        X_val (array-like or DataFrame): Validation features.
        y_val (array-like or Series): Validation target values. Only used to
            check that its length matches X_val; it is not passed to the model.
        params (dict): Keyword arguments forwarded to CatBoostRegressor.
        cat_features (list, optional): Categorical feature names/indices
            forwarded to ``model.fit``. Defaults to None.
        model_path (str or None, optional): Where the fitted model is written
            with ``joblib.dump``. Defaults to "catboost_model.pkl" (the
            previously hard-coded location). Pass None to skip saving.

    Returns:
        np.ndarray: Predictions for X_val.

    Raises:
        ValueError: If input data dimensions do not match.
        Exception: If model training, prediction or saving fails. The error
            is printed and then re-raised unchanged.
    """
    try:
        # Basic input validation
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same number of samples.")
        if len(X_val) != len(y_val):
            raise ValueError("X_val and y_val must have the same number of samples.")

        model = cb.CatBoostRegressor(**params)
        # verbose=0 is passed at fit time, so training is silent regardless of
        # any verbosity setting carried in `params`.
        model.fit(X_train, y_train, cat_features=cat_features, verbose=0)
        preds = model.predict(X_val)

        # Persist the fitted model unless the caller opted out.
        if model_path is not None:
            joblib.dump(model, model_path)

        return preds

    except Exception as e:
        print(f" Error in train_catboost: {e}")
        raise


def train_lstm(X_train, y_train, X_val, y_val, input_shape, params):
    """
    Build, train and evaluate a two-layer stacked LSTM regressor.

    Architecture: LSTM(lstm_units) -> Dropout -> LSTM(lstm_units // 2) ->
    Dropout -> Dense(dense_units, relu) -> Dense(1). The model is compiled
    with the Adam optimizer and mean-squared-error loss.

    Parameters:
        X_train: Training inputs passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        X_val: Validation inputs; also the data predictions are made on.
        y_val: Validation targets, used as ``validation_data`` during fitting.
        input_shape (tuple): Shape of one input sample, given to the first
            LSTM layer.
        params (dict): Hyperparameters. Required keys: 'lstm_units',
            'dropout_rate', 'dense_units', 'learning_rate', 'batch_size'.
            Optional key: 'epochs' (defaults to 15, the previously
            hard-coded value).

    Returns:
        tuple: ``(predictions, model)`` where ``predictions`` is the flattened
        output of ``model.predict(X_val)`` and ``model`` is the fitted
        Sequential model.
    """
    model = Sequential([
        LSTM(params['lstm_units'], return_sequences=True, input_shape=input_shape),
        Dropout(params['dropout_rate']),
        # Second recurrent layer uses half the units and emits only its last step.
        LSTM(params['lstm_units'] // 2, return_sequences=False),
        Dropout(params['dropout_rate']),
        Dense(params['dense_units'], activation='relu'),
        Dense(1)
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=params['learning_rate']),
        loss='mse',
    )

    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        # Tunable so that search runs can use fewer epochs than the final fit.
        epochs=params.get('epochs', 15),
        batch_size=params['batch_size'],
        verbose=0,
    )

    pred = model.predict(X_val, verbose=0)

    # Reset Keras' global state after training; the fitted model is still returned.
    tf.keras.backend.clear_session()

    return pred.flatten(), model

In [25]:
# Read the CatBoost training/validation feature and target tables from the
# current working directory.
X_train, y_train, X_val, y_val = (
    pd.read_csv(csv_name)
    for csv_name in (
        'X_train_catboost.csv',
        'y_train_catboost.csv',
        'X_val_catboost.csv',
        'y_val_catboost.csv',
    )
)

In [26]:
# Preview the CatBoost training features.
X_train

Unnamed: 0,shop_id,item_id,lag_1,lag_2,lag_3,lag_6,lag_12,month,year,quarter,is_month_start,is_month_end,season,rolling_mean_3,trend_1_2,lag_1_ratio_2
0,101,100482,21432.0,10716.0,10716.0,21432.0,10716.0,10,2014,4,0,1,3,14288.000000,10716.0,1.999813
1,101,100482,10716.0,21432.0,10716.0,10716.0,10716.0,2,2015,1,0,1,0,14288.000000,-10716.0,0.499977
2,101,100482,42864.0,10716.0,21432.0,21432.0,21432.0,3,2015,1,0,1,1,25004.000000,32148.0,3.999627
3,101,100482,10716.0,42864.0,10716.0,10716.0,10716.0,7,2015,3,0,1,2,21432.000000,-32148.0,0.249994
4,101,100482,10716.0,10716.0,42864.0,10716.0,21432.0,8,2015,3,0,1,2,21432.000000,0.0,0.999907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51014,151,105821,2108.0,4216.0,6324.0,12648.0,3892.0,10,2014,4,0,1,3,4216.000000,-2108.0,0.499881
51015,151,105821,10380.0,2108.0,4216.0,8432.0,8432.0,11,2014,4,0,1,3,5568.000000,8272.0,4.921764
51016,151,105821,12975.0,10380.0,2108.0,2108.0,4216.0,12,2014,4,0,1,0,8487.666667,2595.0,1.249880
51017,151,105821,15570.0,12975.0,10380.0,6324.0,4216.0,1,2015,1,0,1,0,12975.000000,2595.0,1.199908


In [27]:
# Columns CatBoost should treat as categorical; passed to train_catboost as `cat_features`.
cat_features = ['shop_id', 'item_id', 'month', 'year', 'quarter', 'season', 'is_month_end', 'is_month_start']

In [28]:
# CatBoostRegressor keyword arguments (presumably the result of an earlier
# hyperparameter search -- confirm provenance). train_catboost additionally
# passes verbose=0 to fit().
best_params = dict(
    iterations=383,
    learning_rate=0.08225703868286412,
    depth=5,
    l2_leaf_reg=9.944359434041193,
    border_count=69,
    min_data_in_leaf=54,
    bagging_temperature=0.5419965715599584,
    random_strength=0.17570423922635958,
    verbose=False,
    random_seed=42,
)


In [29]:
def log_transform(y):
    """Map the target onto a log scale using ``log(1 + y)``.

    The ``+ 1`` offset keeps zero-valued targets finite, since ``log(0)``
    is undefined.
    """
    y_log = np.log1p(y)
    return y_log

In [30]:
def inverse_log_transform(y):
    """Undo ``log_transform`` by computing ``exp(y) - 1``.

    Brings log-scale values (e.g. model predictions) back to the original
    target scale.
    """
    y_original = np.expm1(y)
    return y_original

In [36]:
# Fit CatBoost on log1p-scaled targets, then map the validation predictions
# back to the original scale. `cb` is imported in the notebook's top import cell.
y_train_log = log_transform(y_train)
y_val_log = log_transform(y_val)
model_preds_log = train_catboost(X_train, y_train_log, X_val, y_val_log, best_params, cat_features)
preds = inverse_log_transform(model_preds_log)

In [32]:
# Inspect the CatBoost validation predictions (already mapped back from the log scale).
preds

array([7710.63308696, 7235.56450653, 5311.57453287, ..., 1239.16785576,
       1171.40664465, 1003.57445215])

In [33]:
from sklearn.metrics import mean_squared_log_error
import numpy as np

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

# Validation RMSLE for CatBoost: y_val is the target as loaded from CSV and
# preds has been inverse-transformed, so both are on the original scale.
score = rmsle(y_val, preds)
print(f"Validation RMSLE: {score:.4f}")


Validation RMSLE: 0.6040


## LSTM

In [20]:
# Load the LSTM training/validation arrays from the current working directory.
X_train_lstm, y_train_lstm, X_val_lstm, y_val_lstm = map(
    np.load,
    (
        'X_train_lstm.npy',
        'y_train_lstm.npy',
        'X_val_lstm.npy',
        'y_val_lstm.npy',
    ),
)

In [22]:
# Check the dimensions of the LSTM training input array.
X_train_lstm.shape

(89765, 12, 6)

In [10]:
# Hyperparameters consumed by train_lstm.
# NOTE(review): this rebinds `best_params`, which earlier held the CatBoost
# settings -- re-running the CatBoost cell after this one would pick up these
# values instead. Consider distinct names.
best_params = dict(
    lstm_units=116,
    dense_units=59,
    dropout_rate=0.1472,
    learning_rate=0.0048,
    batch_size=16,
)
# Per-sample input shape for the first LSTM layer: axes 1 and 2 of the training array.
input_shape = (X_train_lstm.shape[1], X_train_lstm.shape[2])
input_shape

(12, 6)

In [15]:
# Train the LSTM and collect its validation predictions together with the
# fitted model. Arguments follow train_lstm's positional order:
# X_train, y_train, X_val, y_val, input_shape, params.
pred_log, model = train_lstm(
    X_train_lstm,
    y_train_lstm,
    X_val_lstm,
    y_val_lstm,
    input_shape,
    best_params,
)

In [16]:
# Map the LSTM predictions back to the original scale with the notebook's shared
# helper (assumes the LSTM targets were log1p-transformed -- confirm upstream).
pred_original = inverse_log_transform(pred_log)

In [19]:
# Save under a project-relative directory instead of a user-specific absolute
# path, so the notebook runs on any machine and does not expose a local home
# directory. `Path` is imported in the notebook's top import cell.
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
model.save(str(MODEL_DIR / "lstm_model.h5"))



In [17]:
# RMSE of the LSTM on the original scale. The stored validation targets are
# inverse-transformed here, which assumes y_val_lstm is on the log1p scale -- confirm.
# NOTE(review): CatBoost above is scored with RMSLE, so the two numbers are not
# directly comparable.
y_val_lstm_original = inverse_log_transform(y_val_lstm)
rmse = np.sqrt(mean_squared_error(y_val_lstm_original, pred_original))
print("RMSE on validation set:", rmse)


RMSE on validation set: 4.130954848532821
