In [1]:
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
data = pd.read_excel('train_last.xlsx',skiprows=5)
dat = data.drop(columns=['неделя'])
dat['Начало нед'] = pd.to_datetime(dat['Начало нед'].values)
dat = data.fillna(0)
val = dat.iloc[244:]
dat = dat.iloc[:244]

In [3]:
from statsmodels.tsa.stattools import adfuller
def ad_test(dataset):
    
    dftest = adfuller(dataset, autolag = 'AIC')
    
    print("1. ADF : ",dftest[0])
    print("2. P-Value : ", dftest[1])
    print("3. Num Of Lags : ", dftest[2])
    print("4. Num Of Observations Used For ADF Regression:",dftest[3])
    print("5. Critical Values :")
    for key, val in dftest[4].items():
         print("\t",key, ": ", val)
         
         
ad_test(dat['Продажи, рубли'])

1. ADF :  -4.188774186049275
2. P-Value :  0.000688330888402383
3. Num Of Lags :  3
4. Num Of Observations Used For ADF Regression: 240
5. Critical Values :
	 1% :  -3.4578942529658563
	 5% :  -2.8736593200231484
	 10% :  -2.573228767361111


In [5]:
for i in dat.columns:
    print(i)
    ad_test(dat[i])
    print()

неделя
1. ADF :  -3.7548194116863844
2. P-Value :  0.003404913548672678
3. Num Of Lags :  0
4. Num Of Observations Used For ADF Regression: 243
5. Critical Values :
	 1% :  -3.4575505077947746
	 5% :  -2.8735087323013526
	 10% :  -2.573148434859185

Начало нед
1. ADF :  0.4385313777194915
2. P-Value :  0.9828800781137217
3. Num Of Lags :  9
4. Num Of Observations Used For ADF Regression: 234
5. Critical Values :
	 1% :  -3.4586084859607156
	 5% :  -2.873972159235721
	 10% :  -2.57339565928848

Продажи, рубли
1. ADF :  -4.188774186049275
2. P-Value :  0.000688330888402383
3. Num Of Lags :  3
4. Num Of Observations Used For ADF Regression: 240
5. Critical Values :
	 1% :  -3.4578942529658563
	 5% :  -2.8736593200231484
	 10% :  -2.573228767361111

Дистрибуция Мирамистин
1. ADF :  -1.5814126886448314
2. P-Value :  0.4929234462380593
3. Num Of Lags :  5
4. Num Of Observations Used For ADF Regression: 238
5. Critical Values :
	 1% :  -3.458128284586202
	 5% :  -2.873761835239286
	 10% :  -2

  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2


In [6]:
train, test = train_test_split(dat,
                test_size=0.2,
                train_size=0.8,
                shuffle=True)
train = train.drop(columns=['Начало нед'])
test = test.drop(columns=['Начало нед'])
tr = Pool(train.drop(columns='Продажи, рубли'),label=train['Продажи, рубли'].values)
ts = Pool(test.drop(columns='Продажи, рубли'),label=test['Продажи, рубли'].values)

In [7]:
model = CatBoostRegressor(iterations=50_000,depth=12,random_seed=5,learning_rate=0.001,loss_function='RMSE',eval_metric='R2',grow_policy='Lossguide',)
mode = model.fit(tr,eval_set=ts,use_best_model=True,verbose_eval=True)

0:	learn: 0.0009679	test: -0.0597723	best: -0.0597723 (0)	total: 53.7ms	remaining: 44m 44s
1:	learn: 0.0017776	test: -0.0589082	best: -0.0589082 (1)	total: 57.5ms	remaining: 23m 56s
2:	learn: 0.0025674	test: -0.0580065	best: -0.0580065 (2)	total: 62.1ms	remaining: 17m 14s
3:	learn: 0.0035578	test: -0.0565502	best: -0.0565502 (3)	total: 67ms	remaining: 13m 57s
4:	learn: 0.0044034	test: -0.0552690	best: -0.0552690 (4)	total: 71.7ms	remaining: 11m 56s
5:	learn: 0.0053709	test: -0.0539066	best: -0.0539066 (5)	total: 77.4ms	remaining: 10m 45s
6:	learn: 0.0064029	test: -0.0527672	best: -0.0527672 (6)	total: 82.7ms	remaining: 9m 50s
7:	learn: 0.0072539	test: -0.0515140	best: -0.0515140 (7)	total: 87ms	remaining: 9m 3s
8:	learn: 0.0080870	test: -0.0502800	best: -0.0502800 (8)	total: 91ms	remaining: 8m 25s
9:	learn: 0.0089799	test: -0.0492166	best: -0.0492166 (9)	total: 93.6ms	remaining: 7m 47s
10:	learn: 0.0097719	test: -0.0481993	best: -0.0481993 (10)	total: 96.3ms	remaining: 7m 17s
11:	learn

In [9]:
dg = pd.read_csv('train/sample_submission.csv')
dg['revenue'] = mode.predict(val)
dg.to_csv('sub_91.csv',index=False)

In [10]:
mode.save_model('model_91',pool=tr)

In [22]:
mode.calc_feature_statistics(ts,plot=True)

{'неделя': {'borders': array([ 1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5,  8.5,  9.5, 10.5, 11.5,
         12.5, 13.5, 14.5, 15.5, 16.5, 17.5, 18.5, 19.5, 20.5, 21.5, 22.5,
         23.5, 24.5, 25.5, 26.5, 27.5, 28.5, 29.5, 30.5, 31.5, 32.5, 33.5,
         34.5, 35.5, 36.5, 37.5, 38.5, 39.5, 40.5, 41.5, 42.5, 43.5, 44.5,
         45.5, 46.5, 47.5, 48.5, 49.5, 50.5, 51.5, 52.5], dtype=float32),
  'binarized_feature': array([39,  6, 29,  3,  4, 41, 15, 22, 31, 16, 23, 20, 38,  8, 25,  3, 28,
         17, 35, 46, 28, 45, 32,  2, 31, 19, 50, 32, 24, 44, 40,  3, 13, 33,
         21, 11, 40, 48, 39,  5, 26, 11, 37, 11, 14, 15, 49, 25, 23]),
  'mean_target': array([       0.,        0., 57145180., 66434652., 71959912., 73662288.,
         67977752.,        0., 59969796.,        0.,        0., 61175180.,
                0., 60087984., 54465672., 60058552., 53522516., 54232472.,
                0., 45385736., 55115296., 56133392., 47373924., 52549712.,
         54604544., 49837560., 45910320.,   