In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

path="/content/drive/MyDrive/Dataset/train.csv"
df_train = pd.read_csv(path)
df_train["date"] = pd.to_datetime(df_train["date"])

# Apply log transformation to 'sales' column with offset for zero and negative values
df_train['sales'] = np.log(np.where(df_train['sales'] > 0, df_train['sales'], 1))

df_train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,2.564949
1,2013-01-02,1,1,2.397895
2,2013-01-03,1,1,2.639057
3,2013-01-04,1,1,2.564949
4,2013-01-05,1,1,2.302585


In [None]:
#load test set
path2 = "/content/drive/MyDrive/Dataset/test.csv"
df_test = pd.read_csv(path2)
df_test["date"] = pd.to_datetime(df_test["date"])
df_test.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [None]:
# Concatenate the training and testing dataframes
df_combined = pd.concat([df_train, df_test]).reset_index(drop=True)

In [None]:
def smape(y_true, y_pred):
    return 100.0/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
def lgbm_smape(preds, train_data):
    #labels = train_data.get_label()
    smape_val = smape(np.expm1(preds), np.expm1(train_data))
    return 'SMAPE', smape_val

In [None]:
# create feature from datetime columns
def create_date_features(dataframe):
    dataframe['month'] = dataframe.date.dt.month
    dataframe['day_of_month'] = dataframe.date.dt.day
    dataframe['day_of_year'] = dataframe.date.dt.dayofyear
    dataframe['week_of_year'] = dataframe.date.dt.weekofyear
    dataframe['day_of_week'] = dataframe.date.dt.dayofweek + 1
    dataframe['year'] = dataframe.date.dt.year
    dataframe['is_wknd'] = dataframe.date.dt.weekday // 4
    dataframe['is_month_start'] = dataframe.date.dt.is_month_start.astype(int)
    dataframe['is_month_end'] = dataframe.date.dt.is_month_end.astype(int)
    dataframe['quarter'] = dataframe.date.dt.quarter
    dataframe['week_block_num'] = [int(x) for x in np.floor((dataframe.date - pd.to_datetime('2012-12-31')).dt.days / 7) + 1]
    dataframe['quarter_block_num'] = (dataframe['year'] - 2013) * 4 + dataframe['quarter']
    dataframe['week_of_month'] = dataframe['week_of_year'].values // 4.35
    return dataframe


df_combined = create_date_features(df_combined)

# day labeling features
df_combined['is_Mon'] = np.where(df_combined['day_of_week'] == 1, 1, 0)
df_combined['is_Tue'] = np.where(df_combined['day_of_week'] == 2, 1, 0)
df_combined['is_Wed'] = np.where(df_combined['day_of_week'] == 3, 1, 0)
df_combined['is_Thu'] = np.where(df_combined['day_of_week'] == 4, 1, 0)
df_combined['is_Fri'] = np.where(df_combined['day_of_week'] == 5, 1, 0)
df_combined['is_Sat'] = np.where(df_combined['day_of_week'] == 6, 1, 0)
df_combined['is_Sun'] = np.where(df_combined['day_of_week'] == 7, 1, 0)

  dataframe['week_of_year'] = dataframe.date.dt.weekofyear


In [None]:
# One-Hot Encoding
df_dum = pd.get_dummies(df_combined[['store', 'item', 'day_of_week', 'month', ]], columns=['store', 'item', 'day_of_week', 'month', ], dummy_na=True)
df_combined = pd.concat([df_combined, df_dum], axis=1)

In [None]:
 #Create lagged features
for lag in [90, 180, 270, 360]:  # approximate days for 3, 6, 9 and 12 months
    df_combined[f'sales_lag_{lag}'] = df_combined.groupby(['store', 'item'])['sales'].shift(lag)

In [None]:
df_combined

Unnamed: 0,date,store,item,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,...,month_8.0,month_9.0,month_10.0,month_11.0,month_12.0,month_nan,sales_lag_90,sales_lag_180,sales_lag_270,sales_lag_360
0,2013-01-01,1,1,2.564949,,1,1,1,1,2,...,0,0,0,0,0,0,,,,
1,2013-01-02,1,1,2.397895,,1,2,2,1,3,...,0,0,0,0,0,0,,,,
2,2013-01-03,1,1,2.639057,,1,3,3,1,4,...,0,0,0,0,0,0,,,,
3,2013-01-04,1,1,2.564949,,1,4,4,1,5,...,0,0,0,0,0,0,,,,
4,2013-01-05,1,1,2.302585,,1,5,5,1,6,...,0,0,0,0,0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957995,2018-03-27,10,50,,44995.0,3,27,86,13,2,...,0,0,0,0,0,0,4.143135,4.406719,4.718499,4.615121
957996,2018-03-28,10,50,,44996.0,3,28,87,13,3,...,0,0,0,0,0,0,4.077537,4.499810,4.779123,4.700480
957997,2018-03-29,10,50,,44997.0,3,29,88,13,4,...,0,0,0,0,0,0,4.304065,4.634729,4.787492,4.158883
957998,2018-03-30,10,50,,44998.0,3,30,89,13,5,...,0,0,0,0,0,0,4.127134,4.595120,4.595120,4.663439


In [None]:
df_train = df_combined.loc[~df_combined.sales.isna()]

df_test = df_combined.loc[df_combined.id.notnull()]


In [None]:
df_train = df_train.drop("id",axis = 1)

In [None]:
df_train = df_train.dropna()
df_train

Unnamed: 0,date,store,item,sales,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,month_8.0,month_9.0,month_10.0,month_11.0,month_12.0,month_nan,sales_lag_90,sales_lag_180,sales_lag_270,sales_lag_360
360,2013-12-27,1,1,2.890372,12,27,361,52,5,2013,...,0,0,0,0,1,0,2.564949,3.178054,2.397895,2.564949
361,2013-12-28,1,1,2.833213,12,28,362,52,6,2013,...,0,0,0,0,1,0,2.995732,3.135494,2.944439,2.397895
362,2013-12-29,1,1,2.639057,12,29,363,52,7,2013,...,0,0,0,0,1,0,2.995732,2.833213,3.178054,2.639057
363,2013-12-30,1,1,1.609438,12,30,364,1,1,2013,...,0,0,0,0,1,0,2.484907,2.484907,2.890372,2.564949
364,2013-12-31,1,1,2.708050,12,31,365,1,2,2013,...,0,0,0,0,1,0,2.397895,3.178054,2.944439,2.302585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912995,2017-12-27,10,50,4.143135,12,27,361,52,3,2017,...,0,0,0,0,1,0,4.406719,4.718499,4.615121,4.234107
912996,2017-12-28,10,50,4.077537,12,28,362,52,4,2017,...,0,0,0,0,1,0,4.499810,4.779123,4.700480,3.688879
912997,2017-12-29,10,50,4.304065,12,29,363,52,5,2017,...,0,0,0,0,1,0,4.634729,4.787492,4.158883,4.143135
912998,2017-12-30,10,50,4.127134,12,30,364,52,6,2017,...,0,0,0,0,1,0,4.595120,4.595120,4.663439,4.143135


In [None]:
df_test

Unnamed: 0,date,store,item,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,...,month_8.0,month_9.0,month_10.0,month_11.0,month_12.0,month_nan,sales_lag_90,sales_lag_180,sales_lag_270,sales_lag_360
913000,2018-01-01,1,1,,0.0,1,1,1,1,1,...,0,0,0,0,0,0,2.890372,3.135494,3.218876,3.178054
913001,2018-01-02,1,1,,1.0,1,2,2,1,2,...,0,0,0,0,0,0,2.708050,3.433987,3.218876,2.639057
913002,2018-01-03,1,1,,2.0,1,3,3,1,3,...,0,0,0,0,0,0,2.995732,3.465736,3.135494,2.995732
913003,2018-01-04,1,1,,3.0,1,4,4,1,4,...,0,0,0,0,0,0,2.944439,3.295837,2.995732,2.890372
913004,2018-01-05,1,1,,4.0,1,5,5,1,5,...,0,0,0,0,0,0,3.091042,3.465736,2.944439,2.397895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957995,2018-03-27,10,50,,44995.0,3,27,86,13,2,...,0,0,0,0,0,0,4.143135,4.406719,4.718499,4.615121
957996,2018-03-28,10,50,,44996.0,3,28,87,13,3,...,0,0,0,0,0,0,4.077537,4.499810,4.779123,4.700480
957997,2018-03-29,10,50,,44997.0,3,29,88,13,4,...,0,0,0,0,0,0,4.304065,4.634729,4.787492,4.158883
957998,2018-03-30,10,50,,44998.0,3,30,89,13,5,...,0,0,0,0,0,0,4.127134,4.595120,4.595120,4.663439


In [None]:
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor

# Number of splits
n_splits = 5
# Initialize TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=n_splits)

model = LGBMRegressor(learning_rate = 0.1)
df_fc = df_train.copy()
smape_values = []

# Perform cross-validation
for train_index, test_index in tscv.split(df_train):
    CV_train, CV_test = df_train.iloc[train_index], df_train.iloc[test_index]

    # Fit the model on the training data
    model.fit(CV_train.drop(["sales", "date"], axis=1), CV_train["sales"])

    # Predict on the test data
    predictions = model.predict(CV_test.drop(["sales", "date"], axis=1))
    df_fc.loc[df_train.iloc[test_index].index,"predictions"] = predictions[0]
    # Calculate SMAPE and add it to the list of SMAPE values
    smape_value = lgbm_smape(CV_test["sales"].values, predictions)
    smape_values.append(smape_value)

# Print the average SMAPE value across all folds
smape_values

[('SMAPE', 15.612612092211954),
 ('SMAPE', 14.145399423968303),
 ('SMAPE', 14.287412070655652),
 ('SMAPE', 15.582493811200143),
 ('SMAPE', 14.913974323237452)]

In [None]:
df_fc

Unnamed: 0,date,store,item,sales,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,month_9.0,month_10.0,month_11.0,month_12.0,month_nan,sales_lag_90,sales_lag_180,sales_lag_270,sales_lag_360,predictions
360,2013-12-27,1,1,2.890372,12,27,361,52,5,2013,...,0,0,0,1,0,2.564949,3.178054,2.397895,2.564949,
361,2013-12-28,1,1,2.833213,12,28,362,52,6,2013,...,0,0,0,1,0,2.995732,3.135494,2.944439,2.397895,
362,2013-12-29,1,1,2.639057,12,29,363,52,7,2013,...,0,0,0,1,0,2.995732,2.833213,3.178054,2.639057,
363,2013-12-30,1,1,1.609438,12,30,364,1,1,2013,...,0,0,0,1,0,2.484907,2.484907,2.890372,2.564949,
364,2013-12-31,1,1,2.708050,12,31,365,1,2,2013,...,0,0,0,1,0,2.397895,3.178054,2.944439,2.302585,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912995,2017-12-27,10,50,4.143135,12,27,361,52,3,2017,...,0,0,0,1,0,4.406719,4.718499,4.615121,4.234107,3.286291
912996,2017-12-28,10,50,4.077537,12,28,362,52,4,2017,...,0,0,0,1,0,4.499810,4.779123,4.700480,3.688879,3.286291
912997,2017-12-29,10,50,4.304065,12,29,363,52,5,2017,...,0,0,0,1,0,4.634729,4.787492,4.158883,4.143135,3.286291
912998,2017-12-30,10,50,4.127134,12,30,364,52,6,2017,...,0,0,0,1,0,4.595120,4.595120,4.663439,4.143135,3.286291


In [None]:
df_train

Unnamed: 0,date,store,item,sales,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,month_8.0,month_9.0,month_10.0,month_11.0,month_12.0,month_nan,sales_lag_90,sales_lag_180,sales_lag_270,sales_lag_360
360,2013-12-27,1,1,2.890372,12,27,361,52,5,2013,...,0,0,0,0,1,0,2.564949,3.178054,2.397895,2.564949
361,2013-12-28,1,1,2.833213,12,28,362,52,6,2013,...,0,0,0,0,1,0,2.995732,3.135494,2.944439,2.397895
362,2013-12-29,1,1,2.639057,12,29,363,52,7,2013,...,0,0,0,0,1,0,2.995732,2.833213,3.178054,2.639057
363,2013-12-30,1,1,1.609438,12,30,364,1,1,2013,...,0,0,0,0,1,0,2.484907,2.484907,2.890372,2.564949
364,2013-12-31,1,1,2.708050,12,31,365,1,2,2013,...,0,0,0,0,1,0,2.397895,3.178054,2.944439,2.302585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912995,2017-12-27,10,50,4.143135,12,27,361,52,3,2017,...,0,0,0,0,1,0,4.406719,4.718499,4.615121,4.234107
912996,2017-12-28,10,50,4.077537,12,28,362,52,4,2017,...,0,0,0,0,1,0,4.499810,4.779123,4.700480,3.688879
912997,2017-12-29,10,50,4.304065,12,29,363,52,5,2017,...,0,0,0,0,1,0,4.634729,4.787492,4.158883,4.143135
912998,2017-12-30,10,50,4.127134,12,30,364,52,6,2017,...,0,0,0,0,1,0,4.595120,4.595120,4.663439,4.143135


In [None]:
df_test

Unnamed: 0,date,store,item,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,...,month_8.0,month_9.0,month_10.0,month_11.0,month_12.0,month_nan,sales_lag_90,sales_lag_180,sales_lag_270,sales_lag_360
913000,2018-01-01,1,1,,0.0,1,1,1,1,1,...,0,0,0,0,0,0,2.890372,3.135494,3.218876,3.178054
913001,2018-01-02,1,1,,1.0,1,2,2,1,2,...,0,0,0,0,0,0,2.708050,3.433987,3.218876,2.639057
913002,2018-01-03,1,1,,2.0,1,3,3,1,3,...,0,0,0,0,0,0,2.995732,3.465736,3.135494,2.995732
913003,2018-01-04,1,1,,3.0,1,4,4,1,4,...,0,0,0,0,0,0,2.944439,3.295837,2.995732,2.890372
913004,2018-01-05,1,1,,4.0,1,5,5,1,5,...,0,0,0,0,0,0,3.091042,3.465736,2.944439,2.397895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957995,2018-03-27,10,50,,44995.0,3,27,86,13,2,...,0,0,0,0,0,0,4.143135,4.406719,4.718499,4.615121
957996,2018-03-28,10,50,,44996.0,3,28,87,13,3,...,0,0,0,0,0,0,4.077537,4.499810,4.779123,4.700480
957997,2018-03-29,10,50,,44997.0,3,29,88,13,4,...,0,0,0,0,0,0,4.304065,4.634729,4.787492,4.158883
957998,2018-03-30,10,50,,44998.0,3,30,89,13,5,...,0,0,0,0,0,0,4.127134,4.595120,4.595120,4.663439


In [None]:
df_test = df_test.drop("id", axis = 1)

In [None]:
df_test_copy = df_test.drop(["sales","date"], axis = 1)

In [None]:
predictions = []

# Create a separate DataFrame to store the lagged predictions
#lagged_predictions = df_test_copy.copy()

# Iterate over the test set
for i in range(len(df_test_copy)):
    # Prepare the data for the current day, including lagged features
    data = lagged_predictions.iloc[i:i+1].copy()

    # Make a prediction for the current day
    prediction = model.predict(data)

    # Store the prediction
    predictions.append(prediction[0])

predictions = pd.Series(predictions)

In [None]:
predictions

0        2.588074
1        2.637271
2        2.666420
3        2.680126
4        2.730621
           ...   
44995    4.314907
44996    4.319088
44997    4.325453
44998    4.546025
44999    4.439305
Length: 45000, dtype: float64

In [None]:
np.exp(predictions)

0        13.304118
1        13.975017
2        14.388361
3        14.586929
4        15.342406
           ...    
44995    74.806660
44996    75.120069
44997    75.599773
44998    94.257006
44999    84.716016
Length: 45000, dtype: float64

In [None]:
#load test set
path2 = "/content/drive/MyDrive/Dataset/test.csv"
df_test = pd.read_csv(path2)
df_test["date"] = pd.to_datetime(df_test["date"])
df_test.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [None]:
# Add predictions to the test dataframe
df_test['predictions'] = np.exp(predictions.values)
df_test



Unnamed: 0,id,date,store,item,predictions
0,0,2018-01-01,1,1,13.304118
1,1,2018-01-02,1,1,13.975017
2,2,2018-01-03,1,1,14.388361
3,3,2018-01-04,1,1,14.586929
4,4,2018-01-05,1,1,15.342406
...,...,...,...,...,...
44995,44995,2018-03-27,10,50,74.806660
44996,44996,2018-03-28,10,50,75.120069
44997,44997,2018-03-29,10,50,75.599773
44998,44998,2018-03-30,10,50,94.257006


In [None]:
# Save the predicted values along with other attributes to a CSV file
df_test[['date', 'store', 'item', 'predictions']].to_csv('predicted_values.csv', index=False)
