# Predict Future Sales
-  In this competition you will work with a challenging time-series dataset consisting of daily sales data,  
kindly provided by one of the largest Russian software firms. We are asking you to **predict total sales for every product and store in the next month.**  
- Data Exploration, Preprocessing, Xgb regressor
- Predict Future Sales, https://www.kaggle.com/c/competitive-data-science-predict-future-sales/overview  

### XGBregressor RMSE error   
- 1.43613

### Reference
- Simple Predict with Xgboost, https://www.kaggle.com/doukanberkberber/simple-predict-with-xgboost

## 1. Import Packages

In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [35]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

## 2. Load Dataset

In [5]:
train = pd.read_csv('sales_train.csv')
test = pd.read_csv('test.csv')
categories = pd.read_csv('item_categories.csv')
item = pd.read_csv('items.csv')
shop = pd.read_csv('shops.csv')

## 3. Make Dataset

In [6]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [11]:
train['date'] = pd.to_datetime(train.date,format="%d.%m.%Y")
train['ID'] = test.ID

In [12]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,ID
0,2013-01-02,0,59,22154,999.0,1.0,0.0
1,2013-01-03,0,25,2552,899.0,1.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0,2.0
3,2013-01-06,0,25,2554,1709.05,1.0,3.0
4,2013-01-15,0,25,2555,1099.0,1.0,4.0


In [14]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [15]:
train_id = train.ID
test_id = test.ID
y_sales = train.item_cnt_day

In [16]:
try:
    train.drop(labels=['ID','date','item_cnt_day'], axis=1, inplace=True)
except Exception as e:
    pass

In [17]:
train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price
0,0,59,22154,999.0
1,0,25,2552,899.0
2,0,25,2552,899.0
3,0,25,2554,1709.05
4,0,25,2555,1099.0


In [18]:
try:
    test.drop(labels=['ID'], axis=1, inplace=True)
except Exception as e:
    pass

In [19]:
test.head()

Unnamed: 0,shop_id,item_id
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [20]:
con_data = pd.concat([train,test], ignore_index=True)
con_data.sample(5)

Unnamed: 0,date_block_num,shop_id,item_id,item_price
2337213,24.0,12,5490,1449.0
1678040,16.0,57,8754,399.0
643300,6.0,27,93,249.0
2072737,21.0,58,20949,5.0
1838351,18.0,58,4551,299.0


In [21]:
con_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price'], dtype='object')

- fill NAN

In [22]:
con_data["item_price"] = con_data["item_price"].fillna((con_data["item_price"].mode()[0] ))
con_data["date_block_num"] = con_data["date_block_num"].fillna((con_data["date_block_num"].mode()[0] ))
con_data.isna()
con_data.isnull().sum()

date_block_num    0
shop_id           0
item_id           0
item_price        0
dtype: int64

In [24]:
x_train = con_data[:len(train)]
x_test = con_data[len(train):]
train_x, test_x, train_y, test_y = train_test_split(x_train, y_sales,test_size = 0.2, random_state = 0)

## 4. Data Preprocessing with StandardScaler

In [25]:
from  sklearn.preprocessing  import StandardScaler
slc= StandardScaler()
train_x = slc.fit_transform(train_x)
x_test = slc.transform(x_test)
test_x = slc.transform(test_x)

In [51]:
print("Shape of train_x :", train_x.shape)
print("Shape of train_y :", train_y.shape)
print("Shape of test_x :", test_x.shape)
print("Shape of test_y :", test_y.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of test_id :", test_id.shape)

Shape of train_x : (2348679, 4)
Shape of train_y : (2348679,)
Shape of test_x : (587170, 4)
Shape of test_y : (587170,)
Shape of x_test : (214200, 4)
Shape of test_id : (214200,)


## 5. XGBRegressor

In [49]:
model = XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, 
                     learning_rate = 0.1, max_depth = 10, alpha = 10, n_estimators = 70)

In [52]:
model.fit(train_x, train_y, verbose=1)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=10,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=70, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=10,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

## 6. Prediction and Submission

In [55]:
predictions = model.predict(x_test)
print("Shape of pred :", predictions.shape)

Shape of pred : (214200,)


In [56]:
submission = pd.DataFrame({
    "ID": test_id.index, 
    "item_cnt_month": predictions
})
submission.to_csv('simple_xgb_submission.csv', index=False)