In [862]:
import pandas as pd
import plotly.graph_objects as go
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import optuna
import numpy as np

In [863]:
def create_window(df,values_col,timeax,window_size=3):
    features=[]
    values=df[values_col].values
    
    for i in range(len(values)):
        
        features.append(values[i:i+window_size+1])
    dataset=pd.DataFrame(features)
    dataset=dataset.dropna()
    
    
    dataset=dataset.rename(columns={i:f'X_{i}' for i in dataset.columns[:-1]})
    dataset=dataset.rename(columns={dataset.columns[-1]:'y'})
    X=dataset[[i for i in dataset.columns if 'X' in i]]
    y=dataset['y']
    xaxis=df.loc[df.index[window_size:],timeax]
    return X,y,xaxis

In [864]:
data=pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv")
#data=pd.read_csv("https://raw.githubusercontent.com/Subikshaa/Time-Series-Forecasting-on-Web-Traffic-Dataset/master/Preprocessed_data/final_data.csv")

In [865]:
timeax='Month'
values_col='Passengers'
fig=go.Figure()
fig.add_trace(go.Scatter(x=data[timeax],y=data[values_col],mode='lines'))
fig.update_xaxes(title='date')
fig.update_yaxes(title='Passengers')
fig.show()

In [866]:
#data['Passengers']=(data['Passengers']-data['Passengers'].mean())/data['Passengers'].std()
#minimum=data[values_col].min()
#maximum=data[values_col].max()
#range_data=maximum-minimum
#data[values_col]=(data[values_col]-minimum)/(range_data)


In [867]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(data.index.values.reshape(-1, 1), data[values_col].values)
m=reg.coef_
b=reg.intercept_

In [868]:
trend=[m[0]*i+b for i in data.index.values]

In [869]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=data[timeax],y=data[values_col],mode='lines',name='data'))
fig.add_trace(go.Scatter(x=data[timeax],y=trend,mode='lines',name='Trend'))
fig.update_xaxes(title='date')
fig.update_yaxes(title='Passengers')
fig.show()

In [870]:
data[values_col]=data[values_col]-trend

In [871]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=data[timeax],y=data[values_col],mode='lines',name='data'))
fig.update_xaxes(title='date')
fig.update_yaxes(title='Passengers')
fig.update_layout(title='Estacionaria')
fig.show()

In [872]:
window_size=4

In [873]:

train=data[:int(0.8*data.shape[0])]
test=data[int(0.8*data.shape[0])-window_size:]

In [874]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=train[timeax],y=train[values_col],mode='lines',name='Train'))
fig.add_trace(go.Scatter(x=test[timeax],y=test[values_col],mode='lines',name='Test'))
fig.update_xaxes(title='date')
fig.update_yaxes(title='Passengers')
fig.update_layout(title='stationary')
fig.show()

In [875]:
X_train,y_train,train_xaxis=create_window(train,values_col,timeax,window_size=window_size)
X_test,y_test,test_xaxis=create_window(test,values_col,timeax,window_size=window_size)

In [876]:
model=XGBRegressor()

In [877]:
model.fit(X_train,y_train.values)

In [878]:
preds=model.predict(X_test)

In [880]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=train_xaxis,y=y_train,mode='lines',name='Train'))
fig.add_trace(go.Scatter(x=test_xaxis,y=y_test,mode='lines',name='Test'))
fig.add_trace(go.Scatter(x=test_xaxis,y=preds,mode='lines',name='Pred'))
fig.update_xaxes(title='date')
fig.update_yaxes(title='Passengers')
fig.show()

In [812]:
np.sqrt(mean_squared_error(y_test.values, preds))

91.43421855242369

In [881]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    model = XGBRegressor(**params)
    model.fit(X_train, y_train.values, verbose=False)
    predictions = model.predict(X_test)
    rmse =np.sqrt( mean_squared_error(y_test.values, predictions))
    return rmse

In [882]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)

[I 2024-08-02 20:17:16,447] A new study created in memory with name: no-name-d658fc16-709d-4b96-9336-ad869eec707d
[I 2024-08-02 20:17:16,580] Trial 0 finished with value: 68.28605691264924 and parameters: {'n_estimators': 146, 'learning_rate': 0.0015213388597225459, 'max_depth': 5, 'subsample': 0.8125076924968035, 'colsample_bytree': 0.2869422699643352, 'min_child_weight': 9}. Best is trial 0 with value: 68.28605691264924.
[I 2024-08-02 20:17:16,672] Trial 1 finished with value: 57.941698834684715 and parameters: {'n_estimators': 269, 'learning_rate': 0.004731438436933944, 'max_depth': 7, 'subsample': 0.8402732765040626, 'colsample_bytree': 0.5280103744829445, 'min_child_weight': 20}. Best is trial 1 with value: 57.941698834684715.
[I 2024-08-02 20:17:16,745] Trial 2 finished with value: 53.61723806972625 and parameters: {'n_estimators': 253, 'learning_rate': 0.02035017225233822, 'max_depth': 9, 'subsample': 0.29486676133063877, 'colsample_bytree': 0.10746892518552788, 'min_child_weigh

[I 2024-08-02 20:17:22,112] Trial 26 finished with value: 36.86766952225442 and parameters: {'n_estimators': 487, 'learning_rate': 0.03547586437119162, 'max_depth': 2, 'subsample': 0.45553100289305726, 'colsample_bytree': 0.5000646307047825, 'min_child_weight': 1}. Best is trial 26 with value: 36.86766952225442.
[I 2024-08-02 20:17:22,245] Trial 27 finished with value: 38.966744481632524 and parameters: {'n_estimators': 436, 'learning_rate': 0.030525441907376827, 'max_depth': 2, 'subsample': 0.4771882392064973, 'colsample_bytree': 0.48483115471379806, 'min_child_weight': 1}. Best is trial 26 with value: 36.86766952225442.
[I 2024-08-02 20:17:22,378] Trial 28 finished with value: 35.52194059230062 and parameters: {'n_estimators': 471, 'learning_rate': 0.034442022196633, 'max_depth': 2, 'subsample': 0.3900921443939027, 'colsample_bytree': 0.6101856018557161, 'min_child_weight': 1}. Best is trial 28 with value: 35.52194059230062.
[I 2024-08-02 20:17:22,514] Trial 29 finished with value: 6

[I 2024-08-02 20:17:25,843] Trial 53 finished with value: 38.18255708881059 and parameters: {'n_estimators': 261, 'learning_rate': 0.06664426249361032, 'max_depth': 2, 'subsample': 0.42702914889522897, 'colsample_bytree': 0.3728445975091924, 'min_child_weight': 3}. Best is trial 28 with value: 35.52194059230062.
[I 2024-08-02 20:17:26,008] Trial 54 finished with value: 41.259614197420795 and parameters: {'n_estimators': 346, 'learning_rate': 0.028584294036851594, 'max_depth': 4, 'subsample': 0.3859073103325707, 'colsample_bytree': 0.31583573452608954, 'min_child_weight': 4}. Best is trial 28 with value: 35.52194059230062.
[I 2024-08-02 20:17:26,153] Trial 55 finished with value: 36.62311001473269 and parameters: {'n_estimators': 484, 'learning_rate': 0.04280956403959693, 'max_depth': 1, 'subsample': 0.33883529821531044, 'colsample_bytree': 0.06467145614078523, 'min_child_weight': 1}. Best is trial 28 with value: 35.52194059230062.
[I 2024-08-02 20:17:26,258] Trial 56 finished with valu

[I 2024-08-02 20:17:31,126] Trial 80 finished with value: 39.049648064528604 and parameters: {'n_estimators': 625, 'learning_rate': 0.06241855109741182, 'max_depth': 6, 'subsample': 0.4095609153989469, 'colsample_bytree': 0.5800455536493776, 'min_child_weight': 4}. Best is trial 28 with value: 35.52194059230062.
[I 2024-08-02 20:17:31,298] Trial 81 finished with value: 34.57619496799906 and parameters: {'n_estimators': 543, 'learning_rate': 0.05298245329017633, 'max_depth': 1, 'subsample': 0.3557370924839977, 'colsample_bytree': 0.5384095085909312, 'min_child_weight': 1}. Best is trial 81 with value: 34.57619496799906.
[I 2024-08-02 20:17:31,673] Trial 82 finished with value: 36.918625979960716 and parameters: {'n_estimators': 548, 'learning_rate': 0.049975610749799865, 'max_depth': 2, 'subsample': 0.4316119699961935, 'colsample_bytree': 0.4981649546707929, 'min_child_weight': 2}. Best is trial 81 with value: 34.57619496799906.
[I 2024-08-02 20:17:31,894] Trial 83 finished with value: 

[I 2024-08-02 20:17:37,952] Trial 107 finished with value: 34.317726861774034 and parameters: {'n_estimators': 864, 'learning_rate': 0.031937404823304774, 'max_depth': 1, 'subsample': 0.33831870422704424, 'colsample_bytree': 0.6474524978153358, 'min_child_weight': 1}. Best is trial 103 with value: 33.909045668383065.
[I 2024-08-02 20:17:38,706] Trial 108 finished with value: 34.30996479425016 and parameters: {'n_estimators': 860, 'learning_rate': 0.04071534491672725, 'max_depth': 1, 'subsample': 0.3335699197948138, 'colsample_bytree': 0.6592905037206657, 'min_child_weight': 1}. Best is trial 103 with value: 33.909045668383065.
[I 2024-08-02 20:17:38,892] Trial 109 finished with value: 34.445949316891365 and parameters: {'n_estimators': 859, 'learning_rate': 0.03107058673303487, 'max_depth': 1, 'subsample': 0.3390861091269543, 'colsample_bytree': 0.6983893756616544, 'min_child_weight': 1}. Best is trial 103 with value: 33.909045668383065.
[I 2024-08-02 20:17:39,093] Trial 110 finished w

[I 2024-08-02 20:17:46,514] Trial 133 finished with value: 34.211366952015666 and parameters: {'n_estimators': 936, 'learning_rate': 0.049293163783790866, 'max_depth': 1, 'subsample': 0.3530448366991825, 'colsample_bytree': 0.6670445671252726, 'min_child_weight': 1}. Best is trial 116 with value: 33.475430349158145.
[I 2024-08-02 20:17:46,718] Trial 134 finished with value: 60.161400404014366 and parameters: {'n_estimators': 939, 'learning_rate': 0.04684308945964265, 'max_depth': 1, 'subsample': 0.26807198284812034, 'colsample_bytree': 0.6632984086136532, 'min_child_weight': 19}. Best is trial 116 with value: 33.475430349158145.
[I 2024-08-02 20:17:46,949] Trial 135 finished with value: 34.67588635875412 and parameters: {'n_estimators': 913, 'learning_rate': 0.03359816582343233, 'max_depth': 1, 'subsample': 0.3704034528222835, 'colsample_bytree': 0.6263089234895176, 'min_child_weight': 2}. Best is trial 116 with value: 33.475430349158145.
[I 2024-08-02 20:17:47,805] Trial 136 finished 

[I 2024-08-02 20:17:53,895] Trial 159 finished with value: 34.349941865198126 and parameters: {'n_estimators': 962, 'learning_rate': 0.07970501639816878, 'max_depth': 2, 'subsample': 0.41880044075094963, 'colsample_bytree': 0.5643440195168457, 'min_child_weight': 2}. Best is trial 116 with value: 33.475430349158145.
[I 2024-08-02 20:17:54,136] Trial 160 finished with value: 34.399818924266384 and parameters: {'n_estimators': 921, 'learning_rate': 0.07211777379326804, 'max_depth': 2, 'subsample': 0.39928920013157815, 'colsample_bytree': 0.5887913872596154, 'min_child_weight': 1}. Best is trial 116 with value: 33.475430349158145.
[I 2024-08-02 20:17:54,330] Trial 161 finished with value: 34.67741364492685 and parameters: {'n_estimators': 888, 'learning_rate': 0.0646464938514283, 'max_depth': 1, 'subsample': 0.37336309160994596, 'colsample_bytree': 0.6232318563194492, 'min_child_weight': 1}. Best is trial 116 with value: 33.475430349158145.
[I 2024-08-02 20:17:54,534] Trial 162 finished w

[I 2024-08-02 20:18:00,281] Trial 185 finished with value: 33.979692817203535 and parameters: {'n_estimators': 1000, 'learning_rate': 0.03649637543349554, 'max_depth': 1, 'subsample': 0.5121030066012181, 'colsample_bytree': 0.5504484551728208, 'min_child_weight': 1}. Best is trial 167 with value: 33.21172930109081.
[I 2024-08-02 20:18:00,572] Trial 186 finished with value: 38.399277187046295 and parameters: {'n_estimators': 997, 'learning_rate': 0.008044836740226725, 'max_depth': 1, 'subsample': 0.4987424449446483, 'colsample_bytree': 0.5543958848027477, 'min_child_weight': 1}. Best is trial 167 with value: 33.21172930109081.
[I 2024-08-02 20:18:01,623] Trial 187 finished with value: 40.89090567466429 and parameters: {'n_estimators': 1000, 'learning_rate': 0.037301179809199246, 'max_depth': 6, 'subsample': 0.5098676147973837, 'colsample_bytree': 0.5944896065791606, 'min_child_weight': 2}. Best is trial 167 with value: 33.21172930109081.
[I 2024-08-02 20:18:01,889] Trial 188 finished wi

In [883]:
final_model=XGBRegressor(**study.best_params)

In [884]:
final_model.fit(X_train, y_train.values, verbose=False)
tuned_preds = final_model.predict(X_test)

In [885]:
np.sqrt(mean_squared_error(y_test.values, tuned_preds))

33.21172930109081

In [886]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=train_xaxis,y=y_train,mode='lines',name='Train'))
fig.add_trace(go.Scatter(x=test_xaxis,y=y_test,mode='lines',name='Test'))
fig.add_trace(go.Scatter(x=test_xaxis,y=tuned_preds,mode='lines',name='Pred'))
fig.update_xaxes(title='date')
fig.update_yaxes(title='Passengers')

fig.show()

In [887]:
g1=np.array(list(y_train)+list(y_test))+np.array(trend[window_size:])
g2=np.array(list(y_train)+list(tuned_preds))+np.array(trend[window_size:])
xaxistime=train_xaxis.to_list()+test_xaxis.to_list()
fig=go.Figure()
fig.add_trace(go.Scatter(x=xaxistime,y=g2,mode='lines',name='Preds'))
fig.add_trace(go.Scatter(x=xaxistime,y=g1,mode='lines',name='Real'))
fig.update_xaxes(title='date')
fig.update_yaxes(title='Passengers')
fig.show()