In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [2]:
train_data = pd.read_csv('../data/train_data_modified.csv')
train_data.drop(['CategoryCode', 'Week'], axis = 1, inplace = True)
train_data.head()

Unnamed: 0,ItemCode,id,WeeklySales
0,174436,1,35
1,118033,1,7
2,20824,1,53
3,1061341,1,18
4,76399,1,21


In [3]:
valid_data = pd.read_csv('../data/validation_data.csv')
valid_data.head()

Unnamed: 0,CategoryCode,ItemCode,Week,WeeklySales
0,category_2,1044502,w1,11
1,category_2,1105009,w1,11
2,category_2,913561,w4,5
3,category_1,1048975,w4,30
4,category_1,17287,w2,60


In [4]:
def week_id_valid(week):
    if week=="w1":
        return 1
    elif week=="w2":
        return 2
    elif week=="w3":
        return 3
    else: return 4

valid_data['id'] = [week_id_valid(week)+20 for week in valid_data['Week']]
valid_data

Unnamed: 0,CategoryCode,ItemCode,Week,WeeklySales,id
0,category_2,1044502,w1,11,21
1,category_2,1105009,w1,11,21
2,category_2,913561,w4,5,24
3,category_1,1048975,w4,30,24
4,category_1,17287,w2,60,22
...,...,...,...,...,...
365,category_2,124954,w2,43,22
366,category_2,40759,w1,48,21
367,category_1,1090303,w1,19,21
368,category_2,1090276,w3,6,23


In [5]:
valid_data.sort_values(by = 'id', inplace = True)
valid_data.head()

Unnamed: 0,CategoryCode,ItemCode,Week,WeeklySales,id
0,category_2,1044502,w1,11,21
142,category_1,48940,w1,22,21
269,category_1,35584,w1,15,21
138,category_2,59749,w1,33,21
137,category_2,132028,w1,112,21


In [6]:
valid_data.drop(['CategoryCode', 'Week'], axis = 1, inplace = True)

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   ItemCode     3772 non-null   int64
 1   id           3772 non-null   int64
 2   WeeklySales  3772 non-null   int64
dtypes: int64(3)
memory usage: 88.5 KB


In [8]:
valid_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 370 entries, 0 to 369
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   ItemCode     370 non-null    int64
 1   WeeklySales  370 non-null    int64
 2   id           370 non-null    int64
dtypes: int64(3)
memory usage: 11.6 KB


In [9]:
train_val_data = pd.concat([train_data, valid_data])
train_val_data.tail()

Unnamed: 0,ItemCode,id,WeeklySales
139,118033,24,12
140,1098502,24,4
268,1010068,24,19
146,211309,24,23
369,3418,24,69


In [10]:
train_val_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4142 entries, 0 to 369
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   ItemCode     4142 non-null   int64
 1   id           4142 non-null   int64
 2   WeeklySales  4142 non-null   int64
dtypes: int64(3)
memory usage: 129.4 KB


In [19]:
melt2 = train_val_data.copy()

In [20]:
melt2['Last_Week_Sales'] = melt2.groupby(['ItemCode'])['WeeklySales'].shift(periods = 1)

In [23]:
melt2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4142 entries, 0 to 369
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ItemCode         4142 non-null   int64  
 1   id               4142 non-null   int64  
 2   WeeklySales      4142 non-null   int64  
 3   Last_Week_Sales  3948 non-null   float64
 4   Last_Week_Diff   3754 non-null   float64
dtypes: float64(2), int64(3)
memory usage: 194.2 KB


In [22]:
melt2['Last_Week_Diff'] = melt2.groupby(['ItemCode'])['WeeklySales'].shift(periods = 1) - melt2.groupby(['ItemCode'])['WeeklySales'].shift(periods = 2)

In [24]:
melt2.tail()

Unnamed: 0,ItemCode,id,WeeklySales,Last_Week_Sales,Last_Week_Diff
139,118033,24,12,9.0,-4.0
140,1098502,24,4,7.0,1.0
268,1010068,24,19,30.0,12.0
146,211309,24,23,18.0,5.0
369,3418,24,69,120.0,51.0


In [25]:
melt2 = melt2.dropna()
melt2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3754 entries, 367 to 369
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ItemCode         3754 non-null   int64  
 1   id               3754 non-null   int64  
 2   WeeklySales      3754 non-null   int64  
 3   Last_Week_Sales  3754 non-null   float64
 4   Last_Week_Diff   3754 non-null   float64
dtypes: float64(2), int64(3)
memory usage: 176.0 KB


In [26]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [27]:
mean_error = []
for week in range(21,25):
    train = melt2[melt2['id'] < week]
    val = melt2[melt2['id'] == week]
    p = val['Last_Week_Sales'].values

    error = rmsle(val['WeeklySales'].values, p)
    print('Week %d - Error %.5f' % (week, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 21 - Error 0.63727
Week 22 - Error 0.62979
Week 23 - Error 0.69828
Week 24 - Error 0.67142
Mean Error = 0.65919


In [54]:
melt2['ItemCode'].values

array([ 371239,   88450, 1024810, ..., 1010068,  211309,    3418])

In [60]:
new_data = {}
mean_error = []
for week in range(21,22):
    train = melt2[melt2['id'] < week]
    val = melt2[melt2['id'] == week]
    val_item_code = val['ItemCode'].values
    val_id = val['id'].values
    print(val_item_code)

    xtr, xts = train.drop(['WeeklySales'], axis=1), val.drop(['WeeklySales'], axis=1)
    ytr, yts = train['WeeklySales'].values, val['WeeklySales'].values
    print(len(yts))

    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)

    p = mdl.predict(xts)
    print(len(p))
    
    new_data = {
        'ItemCode': val_item_code,
        'id': val_id,
        'Sales': p
    }
    
    

    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (week, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

[1044502   48940   35584   59749  132028  397213 1084498  210868 1032586
  371239  267496 1076920 1044682   99079 1048975 1076938 1015621  124954
  142756  172033   20824    3427 1047130  836125   86974  172582  173617
 1044619 1071106   30877 1026871 1098502 1032550 1090240 1097143  399220
 1050046 1090276 1101661  118033  913561  124774  110320  745945 1067092
   43657 1060909  131983 1013335 1081087    3418  211309 1090105 1066570
 1071124 1070836 1090294   23569  245581 1058713 1064473    7666  138742
   17287   59047 1030948   50785  865933   17296 1064572 1006009  906586
  248272 1006108   64978  877624 1090114 1074823 1010068 1105009 1090303
   40759  836152 1098493  755584 1032541   75886  416212   24136  839356
  379249]
91
91
Week 21 - Error 0.65597
Mean Error = 0.65597


In [61]:
pd.DataFrame(new_data)

Unnamed: 0,ItemCode,id,Sales
0,1044502,21,16.877
1,48940,21,21.520
2,35584,21,11.745
3,59749,21,27.658
4,132028,21,136.628
...,...,...,...
86,75886,21,11.191
87,416212,21,11.272
88,24136,21,26.140
89,839356,21,74.896


In [None]:
def add_week_sales(dataframe, new_values, id_num):
    

### Submission dataset

In [76]:
test_data = pd.read_csv('../data/test_data.csv')
test_data.head()

Unnamed: 0,CategoryCode,ItemCode,Week,PredictedSales
0,category_1,43738,w4,
1,category_2,1006090,w1,
2,category_2,1076929,w4,
3,category_1,1081321,w3,
4,category_2,216151,w4,


In [77]:
test_data.rename(columns = {'PredictedSales': 'WeeklySales'}, inplace = True)

In [78]:
def week_id_test(week):
    if week=="w1":
        return 1
    elif week=="w2":
        return 2
    elif week=="w3":
        return 3
    else: return 4

test_data['id'] = [week_id_valid(week)+20 for week in test_data['Week']]
test_data

Unnamed: 0,CategoryCode,ItemCode,Week,WeeklySales,id
0,category_1,43738,w4,,24
1,category_2,1006090,w1,,21
2,category_2,1076929,w4,,24
3,category_1,1081321,w3,,23
4,category_2,216151,w4,,24
...,...,...,...,...,...
372,category_2,1101571,w1,,21
373,category_2,1090258,w4,,24
374,category_2,906595,w1,,21
375,category_2,32245,w1,,21


In [79]:
test_data.drop(['CategoryCode', 'Week'], axis = 1, inplace = True)

In [80]:
test_data.head()

Unnamed: 0,ItemCode,WeeklySales,id
0,43738,,24
1,1006090,,21
2,1076929,,24
3,1081321,,23
4,216151,,24


In [81]:
test_data.sort_values(by = ['id','ItemCode'], inplace = True)

In [82]:
test_data.head(30)

Unnamed: 0,ItemCode,WeeklySales,id
343,9925,,21
360,16936,,21
7,23200,,21
375,32245,,21
277,35449,,21
117,35530,,21
270,36898,,21
179,37510,,21
58,37861,,21
318,38518,,21


In [87]:
test_data_21 = test_data[test_data['id'] == 21]

In [92]:
train_data.sort_values(by = ['id', 'ItemCode'], inplace = True)

In [88]:
train_data.head()

Unnamed: 0,ItemCode,id,WeeklySales
0,174436,1,35
1,118033,1,7
2,20824,1,53
3,1061341,1,18
4,76399,1,21


In [89]:
train_data_21 = pd.concat([train_data, test_data_21])

In [93]:
train_data_21.tail(100)

Unnamed: 0,ItemCode,id,WeeklySales
3767,1044619,20,45.0
3768,23200,20,22.0
3769,1084498,20,7.0
3770,872260,20,12.0
3771,1024810,20,21.0
...,...,...,...
372,1101571,21,
88,1101769,21,
340,1103056,21,
251,1105018,21,


In [None]:
new_data = {}
mean_error = []
for week in range(21,22):
    train = melt2[melt2['id'] < week]
    val = melt2[melt2['id'] == week]
    val_item_code = val['ItemCode'].values
    val_id = val['id'].values
    print(val_item_code)

    xtr, xts = train.drop(['WeeklySales'], axis=1), val.drop(['WeeklySales'], axis=1)
    ytr, yts = train['WeeklySales'].values, val['WeeklySales'].values
    print(len(yts))

    mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)

    p = mdl.predict(xts)
    print(len(p))
    
    new_data = {
        'ItemCode': val_item_code,
        'id': val_id,
        'Sales': p
    }
    
    

    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (week, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

In [36]:
test_data.drop(['PredictedSales'], axis = 1, inplace = True)

In [43]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 377 entries, 0 to 376
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   ItemCode  377 non-null    int64
 1   id        377 non-null    int64
dtypes: int64(2)
memory usage: 6.0 KB


### Training full model

In [38]:
train_val_data.head()

Unnamed: 0,ItemCode,id,WeeklySales
0,174436,1,35
1,118033,1,7
2,20824,1,53
3,1061341,1,18
4,76399,1,21


In [44]:
xtr= melt2.drop(['WeeklySales'], axis=1)

In [45]:
ytr = melt2['WeeklySales'].values

In [46]:
mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)

In [47]:
mdl.fit(xtr, ytr)

RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)

In [48]:
pred = mdl.predict(test_data)

In [51]:
pred

array([  8.658,   3.941,   5.834,  24.292,  17.273,   7.506,  10.571,
        11.538,   3.293,  12.429,   7.69 ,  16.367, 343.149,  52.769,
        24.801,  26.747,   8.695,   9.714,  12.718,   8.837,   4.605,
        10.068,  22.859,  18.335,  12.917,  47.739,  29.569,  20.353,
        12.1  ,  10.238,  15.013,  15.495,  14.218,  12.242,  24.212,
        89.039,  12.836,  30.342,   6.106, 725.297,  30.911,   4.103,
        30.895,   6.403,   6.005,   5.571,   3.789,   9.318,   9.601,
         8.345,  10.826,  17.564,  29.109,   6.533,   4.565,  88.238,
         5.97 ,  30.327,  96.599,  12.161,  16.609, 153.662,  21.011,
         4.854,  14.747,   6.11 ,   3.385,  31.053,  39.212,  35.614,
        33.59 ,   3.292,   5.174,   9.938,  48.621,  29.173,   6.778,
        46.957,  55.478,  65.954,  14.047,   3.032,  43.411,   6.123,
         5.951, 136.039,  55.569,  41.086,   6.844,  22.168,  12.34 ,
         6.012,  16.01 ,   7.865,  12.75 ,  11.037,  20.53 ,  14.343,
        12.155,  11.

In [52]:
pd.DataFrame(pred).to_csv('predictions_test.csv')