In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')
transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')
submission = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv')

train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

oil['date'] = pd.to_datetime(oil['date'])
transactions['date'] = pd.to_datetime(transactions['date'])

train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['weekday'] = train['date'].dt.weekday

test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['weekday'] = test['date'].dt.weekday

train = pd.merge(train, stores, on='store_nbr', how='left')
test = pd.merge(test, stores, on='store_nbr', how='left')

In [2]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,year,month,day,weekday,city,state,type,cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0,2013,1,1,1,Quito,Pichincha,D,13
1,1,2013-01-01,1,BABY CARE,0.000,0,2013,1,1,1,Quito,Pichincha,D,13
2,2,2013-01-01,1,BEAUTY,0.000,0,2013,1,1,1,Quito,Pichincha,D,13
3,3,2013-01-01,1,BEVERAGES,0.000,0,2013,1,1,1,Quito,Pichincha,D,13
4,4,2013-01-01,1,BOOKS,0.000,0,2013,1,1,1,Quito,Pichincha,D,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0,2017,8,15,1,Quito,Pichincha,B,6
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1,2017,8,15,1,Quito,Pichincha,B,6
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148,2017,8,15,1,Quito,Pichincha,B,6
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,2017,8,15,1,Quito,Pichincha,B,6


In [3]:
test_size = test.shape[0]
test_size

28512

In [4]:
df = pd.concat([train, test])

In [5]:
oil['ma7'] = oil['dcoilwtico'].rolling(window=7).mean()
oil['ma28'] = oil['dcoilwtico'].rolling(window=28).mean()
oil['ma7'] = oil['ma7'].fillna(oil['dcoilwtico'])
oil['ma28'] = oil['ma28'].fillna(oil['ma7'])

In [6]:
df = pd.merge(df, oil.drop(columns=['dcoilwtico']), on='date', how='left')

In [7]:
holidays = holidays.drop(columns='description')
holidays

Unnamed: 0,date,type,locale,locale_name,transferred
0,2012-03-02,Holiday,Local,Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,False
...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,False
346,2017-12-23,Additional,National,Ecuador,False
347,2017-12-24,Additional,National,Ecuador,False
348,2017-12-25,Holiday,National,Ecuador,False


In [8]:
holidays = pd.get_dummies(holidays, columns=['type','locale','locale_name'], drop_first=True)

In [9]:
holidays['date'] = pd.to_datetime(holidays['date'])
df = pd.merge(df, holidays, on='date', how='left')

In [10]:
transactions

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [11]:
df = df.drop(columns=['date'])

In [12]:
object_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [13]:
train = df[:-test_size]
test = df[-test_size:]

In [14]:
train = train.drop(columns=['id'])
train.head()

Unnamed: 0,store_nbr,sales,onpromotion,year,month,day,weekday,cluster,ma7,ma28,...,locale_name_Machala_True,locale_name_Manta_True,locale_name_Puyo_True,locale_name_Quevedo_True,locale_name_Quito_True,locale_name_Riobamba_True,locale_name_Salinas_True,locale_name_Santa Elena_True,locale_name_Santo Domingo_True,locale_name_Santo Domingo de los Tsachilas_True
0,1,0.0,0,2013,1,1,1,13,,,...,False,False,False,False,False,False,False,False,False,False
1,1,0.0,0,2013,1,1,1,13,,,...,False,False,False,False,False,False,False,False,False,False
2,1,0.0,0,2013,1,1,1,13,,,...,False,False,False,False,False,False,False,False,False,False
3,1,0.0,0,2013,1,1,1,13,,,...,False,False,False,False,False,False,False,False,False,False
4,1,0.0,0,2013,1,1,1,13,,,...,False,False,False,False,False,False,False,False,False,False


In [15]:
test = test.drop(columns=['id', 'sales'])
test.head()

Unnamed: 0,store_nbr,onpromotion,year,month,day,weekday,cluster,ma7,ma28,family_BABY CARE,...,locale_name_Machala_True,locale_name_Manta_True,locale_name_Puyo_True,locale_name_Quevedo_True,locale_name_Quito_True,locale_name_Riobamba_True,locale_name_Salinas_True,locale_name_Santa Elena_True,locale_name_Santo Domingo_True,locale_name_Santo Domingo de los Tsachilas_True
3054348,1,0,2017,8,16,2,13,48.281429,47.708214,False,...,False,False,False,False,False,False,False,False,False,False
3054349,1,0,2017,8,16,2,13,48.281429,47.708214,True,...,False,False,False,False,False,False,False,False,False,False
3054350,1,2,2017,8,16,2,13,48.281429,47.708214,False,...,False,False,False,False,False,False,False,False,False,False
3054351,1,20,2017,8,16,2,13,48.281429,47.708214,False,...,False,False,False,False,False,False,False,False,False,False
3054352,1,0,2017,8,16,2,13,48.281429,47.708214,False,...,False,False,False,False,False,False,False,False,False,False


In [16]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [17]:
# x = train.drop(columns=['sales'])
# y = train['sales']

In [18]:
# x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2,shuffle=False, random_state=42)

In [19]:
# test = x_valid

In [20]:
X = train.drop(columns=['sales'])
y = train['sales']

y_log = np.log1p(y)

X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.2, random_state=42)

X_train.columns = X_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_val.columns = X_val.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
test.columns = test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

X_train, test = X_train.align(test, join='left', axis=1, fill_value=0)
X_val, test = X_val.align(test, join='left', axis=1, fill_value=0)

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_val, y_val)

In [21]:
iterations = 500
lr = 0.1

In [22]:
from catboost import CatBoostRegressor

catboost_model = CatBoostRegressor(
    iterations=iterations, 
    learning_rate=lr, 
    depth=8, 
    random_seed=42, 
    loss_function='RMSE',
    verbose=100
)

In [23]:
catboost_model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=50)

0:	learn: 2.5583322	test: 2.5573069	best: 2.5573069 (0)	total: 356ms	remaining: 2m 57s
50:	learn: 1.2692629	test: 1.2694886	best: 1.2694886 (50)	total: 15.1s	remaining: 2m 12s
100:	learn: 1.0320341	test: 1.0337973	best: 1.0337973 (100)	total: 29.8s	remaining: 1m 57s
150:	learn: 0.9169199	test: 0.9191389	best: 0.9191389 (150)	total: 45.2s	remaining: 1m 44s
200:	learn: 0.8469435	test: 0.8496671	best: 0.8496671 (200)	total: 1m	remaining: 1m 29s
300:	learn: 0.7566499	test: 0.7596787	best: 0.7596787 (300)	total: 1m 31s	remaining: 1m
350:	learn: 0.7249481	test: 0.7281173	best: 0.7281173 (350)	total: 1m 46s	remaining: 45.4s
400:	learn: 0.7008451	test: 0.7040945	best: 0.7040945 (400)	total: 2m 2s	remaining: 30.1s
450:	learn: 0.6809186	test: 0.6842862	best: 0.6842862 (450)	total: 2m 17s	remaining: 14.9s
499:	learn: 0.6628084	test: 0.6664430	best: 0.6664430 (499)	total: 2m 32s	remaining: 0us

bestTest = 0.6664429624
bestIteration = 499



<catboost.core.CatBoostRegressor at 0x7bf9efbbe6e0>

In [24]:
y_pred = catboost_model.predict(test)

In [25]:
y_pred

array([ 2.18570523, -0.01696998,  2.03305387, ...,  6.94053189,
        3.66603535,  2.65188611])

In [26]:
y_pred = np.expm1(y_pred)
y_pred = np.where(y_pred < 0, 0, y_pred)

In [27]:
# from sklearn.metrics import mean_squared_log_error

# msle = mean_squared_log_error(y_valid, y_pred)
# rmsle = np.sqrt(msle)

# print(rmsle)

In [28]:
# y_pred = catboost_model.predict(test)

In [29]:
# y_pred = np.where(y_pred < 0, 0, y_pred)

In [30]:
submission['sales'] = y_pred

In [31]:
submission

Unnamed: 0,id,sales
0,3000888,7.896921
1,3000889,0.000000
2,3000890,6.637374
3,3000891,2211.899355
4,3000892,0.063615
...,...,...
28507,3029395,390.821962
28508,3029396,78.798471
28509,3029397,1032.319677
28510,3029398,38.096594


In [32]:
submission.to_csv('submission.csv', index=False)