<a href="https://colab.research.google.com/github/varalakshmiarcot/Demand-Forecasting/blob/master/Demand_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#Importing libraries
import pandas as pd
import numpy as np


### Importing dataset

In [4]:
train = pd.read_csv('https://raw.githubusercontent.com/varalakshmiarcot/Demand-Forecasting/master/train_data.csv')
test = pd.read_csv('https://raw.githubusercontent.com/varalakshmiarcot/Demand-Forecasting/master/test_data.csv')

In [64]:
train.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
0,1,17/01/11,8091,216418,99.0375,111.8625,0,0,20
1,2,17/01/11,8091,216419,99.0375,99.0375,0,0,28
2,3,17/01/11,8091,216425,133.95,133.95,0,0,19
3,4,17/01/11,8091,216233,133.95,133.95,0,0,44
4,5,17/01/11,8091,217390,141.075,141.075,0,0,52


In [65]:
test.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku
0,212645,16/07/13,8091,216418,108.3,108.3,0,0
1,212646,16/07/13,8091,216419,109.0125,109.0125,0,0
2,212647,16/07/13,8091,216425,133.95,133.95,0,0
3,212648,16/07/13,8091,216233,133.95,133.95,0,0
4,212649,16/07/13,8091,217390,176.7,176.7,0,0


In [5]:
#Check datatypes
train.dtypes

record_ID            int64
week                object
store_id             int64
sku_id               int64
total_price        float64
base_price         float64
is_featured_sku      int64
is_display_sku       int64
units_sold           int64
dtype: object

In [6]:
# Change week into correct format
train['week'] = pd.to_datetime(train['week'])
test['week'] = pd.to_datetime(test['week'])

In [68]:
# Check null values
train.isnull().sum()

record_ID          0
week               0
store_id           0
sku_id             0
total_price        1
base_price         0
is_featured_sku    0
is_display_sku     0
units_sold         0
dtype: int64

### Drop null value


In [7]:
train.dropna(inplace=True)

### Extracting daycount from week



In [8]:
train["DayCount"] = train["week"].apply(lambda m: m.toordinal()/730000) 
test["DayCount"] = test["week"].apply(lambda m: m.toordinal()/730000)

In [73]:
train.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,DayCount
0,1,2011-01-17,8091,216418,99.0375,111.8625,0,0,20,1.00569
1,2,2011-01-17,8091,216419,99.0375,99.0375,0,0,28,1.00569
2,3,2011-01-17,8091,216425,133.95,133.95,0,0,19,1.00569
3,4,2011-01-17,8091,216233,133.95,133.95,0,0,44,1.00569
4,5,2011-01-17,8091,217390,141.075,141.075,0,0,52,1.00569


In [74]:
train.describe()

Unnamed: 0,record_ID,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,DayCount
count,150149.0,150149.0,150149.0,150149.0,150149.0,150149.0,150149.0,150149.0,150149.0
mean,106270.971795,9199.420935,254761.195226,206.626751,219.424262,0.095612,0.133201,51.674543,1.006317
std,61385.82558,615.593192,85547.587866,103.308516,110.960204,0.294059,0.339793,60.207962,0.000382
min,1.0,8023.0,216233.0,41.325,61.275,0.0,0.0,1.0,1.005678
25%,53111.0,8562.0,217217.0,130.3875,133.2375,0.0,0.0,20.0,1.005988
50%,106226.0,9371.0,222087.0,198.075,205.9125,0.0,0.0,35.0,1.006304
75%,159452.0,9731.0,245338.0,233.7,234.4125,0.0,0.0,62.0,1.006612
max,212644.0,9984.0,679023.0,562.1625,562.1625,1.0,1.0,2876.0,1.00713


### Outlier removal

In [9]:
train = train.drop(train[train['units_sold']>2500].index)

### Feature Engineering

In [10]:
train['diff'] = (train['base_price'] - train['total_price'])/train['base_price']

In [11]:
test['diff'] = (test['base_price'] - test['total_price'])/test['base_price']

In [12]:
features = [ 'store_id', 'sku_id','total_price','base_price',
       'is_featured_sku', 'is_display_sku','diff',
       'DayCount'] 

### Log Conversion of baseprice and total_price due to skewness

In [13]:
train['total_price'] = np.log(train['total_price'])
test['total_price'] = np.log(test['total_price'])

In [14]:
train.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,DayCount,diff
0,1,2011-01-17,8091,216418,4.595499,111.8625,0,0,20,1.00569,0.11465
1,2,2011-01-17,8091,216419,4.595499,99.0375,0,0,28,1.00569,0.0
2,3,2011-01-17,8091,216425,4.897467,133.95,0,0,19,1.00569,0.0
3,4,2011-01-17,8091,216233,4.897467,133.95,0,0,44,1.00569,0.0
4,5,2011-01-17,8091,217390,4.949292,141.075,0,0,52,1.00569,0.0


### Prepare dataset to train

In [15]:
from sklearn.model_selection import train_test_split,KFold
train.reset_index(inplace=True)
X = train[features]
y = train['units_sold']
y = np.log(y)

### Importing models (Xgboost and lightgbm)

In [16]:
import lightgbm as lgb

hyper_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l2', 'auc'],
    'learning_rate': 0.14,
    "max_depth": 10,
    "n_estimators": 1260,
    "subsample":0.8,
    "min_child_weight":5,
    "colsample_bytree":0.8,
    "eta":0.14,
    "alpha":0.05,
    
}

gbm = lgb.LGBMRegressor(**hyper_params)

In [18]:
from xgboost import XGBRegressor
xgb = XGBRegressor(
    max_depth=8,
    booster = "gbtree",
    n_estimators=1300,
    alpha=0.1,
    colsample_bytree=0.8,
    subsample=0.8,
    eta='0.3',
    min_child_weight=5, 
    metric=['l2','l1'],
    seed=42,tree_method='gpu_hist', gpu_id=0,
lambda_l2=0.01,early_stopping_rounds = 100)


#### BaggingRegressor used to boost the score

In [21]:
from sklearn.ensemble import BaggingRegressor
bag_xgb = BaggingRegressor(xgb, random_state=0,n_estimators=4)
bag_gbm = BaggingRegressor(gbm, random_state=0)

### Training the data

In [22]:
bag_gbm.fit(X,y)
bag_xgb.fit(X,y)



BaggingRegressor(base_estimator=XGBRegressor(alpha=0.1, base_score=0.5,
                                             booster='gbtree',
                                             colsample_bylevel=1,
                                             colsample_bynode=1,
                                             colsample_bytree=0.8,
                                             early_stopping_rounds=100,
                                             eta='0.3', gamma=0, gpu_id=0,
                                             importance_type='gain',
                                             lambda_l2=0.01, learning_rate=0.1,
                                             max_delta_step=0, max_depth=8,
                                             metric=['l2', 'l1'],
                                             min_child_weight=5, missing=None,
                                             n_estimators=1300, n_jobs=1,
                                             nthread=None,
                 

### Submission of file (Took ceil values of each output for both xgboost and lightgbm)

In [23]:
sample = pd.read_csv('https://raw.githubusercontent.com/varalakshmiarcot/Demand-Forecasting/master/sample_submission.csv')
sample['units_sold'] = np.ceil((np.ceil(np.exp((bag_xgb.predict(test[features]))))+np.ceil(np.exp(bag_gbm.predict(test[features]))))/2)
sample.loc[sample['units_sold']<0,'units_sold'] = abs(sample.loc[sample['units_sold']<0,'units_sold'])
sample.to_csv('submit.csv',index=False)