# Hey Kagglers!! Let's handle a time series prediction problem using an ~~ARIMA~~ ML tree-based model 🌳

In [None]:
from IPython import display

# Banner picture shown at the top of the notebook.
display.Image(
    "../input/salesforcast/Sales-Forecast.png",
    width=1000,
    height=500,
)

***

**First, let's get our tools ready**

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# ⚙️Data Preparation 

In [None]:
# Every competition table is read from the same input folder.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

sales_train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
items_category = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')

In [None]:
# Report (rows, columns) for each loaded table.
for template, frame in (
    ("shape info of sales_train: {}", sales_train),
    ("shape info of test: {}", test),
    ("shape info of sample submission {}", sample_submission),
    ("shape info of items {}", items),
    ("shape info of items category {}", items_category),
    ("shape info of shops {}", shops),
):
    print(template.format(frame.shape))

In [None]:
# Structural summary of the raw sales table (columns and dtypes).
sales_train.info()

In [None]:
# First five rows of the raw daily sales records.
sales_train.head()

**`date_block_num` is a consecutive month index that starts at 0 (January 2013 = 0, February 2013 = 1, …), i.e. the running month number minus 1**

In [None]:
# Show the first rows of each table, each followed by a separator line.
separator = "==============================================================="
for frame in (sales_train, test, items, items_category, shops):
    print(frame.head())
    print(separator)

In [None]:
# Attach the item metadata. `join(..., on=...)` matches the `item_id` column
# of the sales table against the *row index* of `items`, so this presumably
# relies on `items` keeping its default 0..N-1 index equal to item_id — verify.
# Overlapping column names receive the `_caller` / `_other` suffixes.
sales_train = sales_train.join(
    items,
    on='item_id',
    lsuffix='_caller',
    rsuffix='_other',
)

In [None]:
# The joined copy of the key is redundant; keep only the caller's item id.
sales_train = sales_train.drop(columns=["item_id_other"])

In [None]:
# Restore the original key name; errors='raise' fails loudly if the suffixed
# column is missing.
sales_train = sales_train.rename(
    columns={'item_id_caller': 'item_id'},
    errors='raise',
)

In [None]:
# Revenue of each record: unit price times the number of units sold that day.
sales_train['tot_item'] = sales_train['item_price'].mul(sales_train['item_cnt_day'])

**Let's add a column for the `region name`**

In [None]:
# Split each shop name at the first space only: the leading token becomes
# `region`, the remainder becomes `name`.
# `n` must be passed by keyword: the positional form was deprecated and is
# rejected by pandas 2.x.
shops[['region', 'name']] = shops['shop_name'].str.split(" ", n=1, expand=True)

***

# 🔎EDA

In [None]:
# Count the shops per region and draw the counts as a bar chart.
shops_per_region = shops.groupby('region', as_index=False)['region'].size()
shops_per_region.plot(x='region', kind='bar');
plt.title('Number of shops in each region')


**We can see that "Москва" (Moscow) has the most stores ... maybe that means it also has the most sales?**

In [None]:
# Sales table before the shop/region columns are merged in.
sales_train.head()

In [None]:
# Left-merge the shop attributes (including the derived region) on shop_id,
# keeping every sales row.
sales_train = pd.merge(sales_train, shops, how='left', on='shop_id')

In [None]:
# Sales table after the shop columns were merged in.
sales_train.head()


In [None]:
# Columns available after the joins, listed before selecting a subset.
sales_train.columns

In [None]:
# Keep only the columns used from here on, in a fixed order.
kept_columns = [
    'date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
    'item_cnt_day', 'item_category_id', 'tot_item', 'region',
]
sales_train = sales_train[kept_columns]

In [None]:
# The competition stores dates day-first (dd.mm.yyyy). Without `dayfirst=True`
# pandas reads ambiguous values such as 02.01.2013 as February 1st (or raises
# on days > 12 in recent versions), which corrupts the monthly/quarterly
# resampling done later.
sales_train['date'] = pd.to_datetime(sales_train['date'], dayfirst=True)

In [None]:
# Index the frame by date so that time-based resampling works on it.
sales_train = sales_train.set_index('date')


In [None]:
# Sales table now indexed by date.
sales_train.head()

In [None]:
# Total number of items sold per calendar month.
# DataFrame.plot opens its own figure unless an Axes is passed in, so a bare
# plt.figure(figsize=...) beforehand only produces an extra empty figure and
# its size is never used. Create the Axes explicitly and draw onto it.
fig, ax = plt.subplots(figsize=(15, 6))

monthly_sales = sales_train['item_cnt_day'].resample('M').agg(['sum'])
monthly_sales.plot(color='red', ax=ax)
ax.set_title('Total Items sold each month')
ax.set_xlabel('month')
ax.set_ylabel('item count')
plt.show()

In [None]:
# Total number of items sold per calendar quarter.
# DataFrame.plot opens its own figure unless an Axes is passed in, so a bare
# plt.figure(figsize=...) beforehand only produces an extra empty figure and
# its size is never used. Create the Axes explicitly and draw onto it.
fig, ax = plt.subplots(figsize=(15, 6))

quarterly_sales = sales_train['item_cnt_day'].resample('Q').agg(['sum'])
quarterly_sales.plot(color='red', ax=ax)
ax.set_title('Total Items sold each quarter')
ax.set_xlabel('quarter')  # the data is quarterly, not monthly
ax.set_ylabel('item count')
plt.show()

**We can clearly see that the sales are peaking at the end of each year ... or in the 4th quarter**

**Let's check if there are any missing values**

In [None]:
# Per-column count of missing values for each table, separated by blank lines.
null_reports = [
    frame.isnull().sum()
    for frame in (sales_train, test, items, items_category)
]
print(*null_reports, sep='\n\n')

In [None]:
# Columns available for the per-shop / per-category charts below.
sales_train.columns

In [None]:
# Total units sold per shop.
# Compute the aggregation once and put the real shop_id on the x axis; the
# positional row index only equals the id when the ids happen to be the
# contiguous range 0..N-1.
sales_per_shop = sales_train.groupby('shop_id', as_index=False).agg({"item_cnt_day": "sum"})

plt.figure(figsize=(25,10))
plt.bar(sales_per_shop['shop_id'], sales_per_shop['item_cnt_day'])
plt.xlabel("Store", size = 15)
plt.ylabel("Sum of sales", size = 15)
plt.title("Sum of sales per store" , size = 20)

**The sales distribution is generally healthy; we can see that store 31 and store 25 have the highest sales**

In [None]:
# Total units sold per item category.
# Compute the aggregation once and put the real item_category_id on the x
# axis; the positional row index would mislabel the bars whenever a category
# id is absent from the sales data.
sales_per_category = sales_train.groupby('item_category_id', as_index=False).agg({"item_cnt_day": "sum"})

plt.figure(figsize=(25,10))
plt.bar(sales_per_category['item_category_id'], sales_per_category['item_cnt_day'])
plt.xlabel("Item category", size = 15)
plt.ylabel("Sum of sales", size = 15)
plt.title("Sum of sales per item category" , size = 20)

**Some categories like "40" have way more sales than categories like "18"**

In [None]:
# Units sold per (month block, region) pair.
region_month_groups = sales_train.groupby(['date_block_num', 'region'], as_index=False)
df = region_month_groups[['item_cnt_day']].sum()

**Now let's see the monthly sales for all regions across the 3 years to see if there is any correlation between region and sales**

In [None]:
# For every month block, print the five regions with the most units sold.
for month in df['date_block_num'].unique():
    month_rows = df.loc[df['date_block_num'] == month]
    print(month_rows.nlargest(5, 'item_cnt_day'))

**We can clearly see that "Москва", which had the highest number of stores, is very dominant in terms of sales.            
Overall, we can say for now that there is a relation between the region and the amount of sales**

***

### Now let's try getting the data in the shape and granularity we need for modeling

In [None]:
# Columns available as input to the monthly aggregation below.
sales_train.columns

In [None]:
# Daily-granularity rows before they are aggregated per month.
sales_train.head()

In [None]:
# Collapse the daily records to one row per (month block, shop, item):
# mean price and total units sold. The total keeps the name `item_cnt_day`.
monthly_keys = ['date_block_num', 'shop_id', 'item_id']
monthly_stats = {"item_price": "mean", "item_cnt_day": "sum"}
agg = sales_train.groupby(monthly_keys, as_index=False).agg(monthly_stats)
agg

In [None]:
# Attach shop attributes. `on='shop_id'` is matched against the row index of
# `shops` (presumably identical to shop_id — verify); clashing column names
# get the `_caller` / `_other` suffixes.
agg = agg.join(
    shops,
    on='shop_id',
    lsuffix='_caller',
    rsuffix='_other',
)

In [None]:
# Discard the shop columns that are not used as features and restore the
# original key name (errors='raise' fails if the suffixed column is missing).
agg = (
    agg
    .drop(columns=['shop_name', 'shop_id_other', 'name'])
    .rename(columns={'shop_id_caller': 'shop_id'}, errors='raise')
)

In [None]:
# Attach item metadata. `on='item_id'` is matched against the row index of
# `items` (presumably identical to item_id — verify); clashing column names
# get the `_caller` / `_other` suffixes.
agg = agg.join(
    items,
    on='item_id',
    lsuffix='_caller',
    rsuffix='_other',
)

In [None]:
# Discard the item name and the duplicated key, then restore the original key
# name (errors='raise' fails if the suffixed column is missing).
agg = (
    agg
    .drop(columns=['item_name', 'item_id_other'])
    .rename(columns={'item_id_caller': 'item_id'}, errors='raise')
)

In [None]:
# Monthly table after the shop and item attributes were joined in.
agg.head()

In [None]:
# Fix the column order of the monthly training table. `region` is left with
# its existing dtype here (no categorical cast at this stage).
ordered_columns = [
    'date_block_num', 'shop_id', 'item_id', 'region',
    'item_category_id', 'item_cnt_day', 'item_price',
]
agg = agg[ordered_columns]
agg

In [None]:
# Identifier/category columns and the monthly sales target, kept as separate
# objects.
id_columns = ['shop_id', 'item_id', 'region', 'item_category_id']
aggx = agg[id_columns]
aggy = agg['item_cnt_day']

**Let's prepare the test data as well!**

In [None]:
# Raw test table as loaded from test.csv.
test

In [None]:
# Work on an independent copy so the raw `test` frame stays untouched.
testing = test.copy(deep=True)

In [None]:
# Tag every test row with month block 34, the value later used to separate
# test rows from training rows.
testing = testing.assign(date_block_num=34)

In [None]:
# Test rows with the month-block column added.
testing.head()

In [None]:
# Attach shop attributes. `on='shop_id'` is matched against the row index of
# `shops` (presumably identical to shop_id — verify); clashing column names
# get the `_caller` / `_other` suffixes.
testing = testing.join(
    shops,
    on='shop_id',
    lsuffix='_caller',
    rsuffix='_other',
)

In [None]:
# Discard the shop columns that are not used as features and restore the
# original key name (errors='raise' fails if the suffixed column is missing).
testing = (
    testing
    .drop(columns=['shop_name', 'shop_id_other', 'name'])
    .rename(columns={'shop_id_caller': 'shop_id'}, errors='raise')
)

In [None]:
# Attach item metadata. `on='item_id'` is matched against the row index of
# `items` (presumably identical to item_id — verify); clashing column names
# get the `_caller` / `_other` suffixes.
testing = testing.join(
    items,
    on='item_id',
    lsuffix='_caller',
    rsuffix='_other',
)

In [None]:
# Discard the item name and the duplicated key, then restore the original key
# name (errors='raise' fails if the suffixed column is missing).
testing = (
    testing
    .drop(columns=['item_name', 'item_id_other'])
    .rename(columns={'item_id_caller': 'item_id'}, errors='raise')
)

In [None]:
# Test rows after the shop and item attributes were joined in.
testing.head()

In [None]:
# Keep the columns shared with the training table (the test rows have no
# target or price) and store `region` as a categorical.
shared_columns = ['date_block_num', 'shop_id', 'item_id', 'region', 'item_category_id']
testing = testing[shared_columns].astype({"region": 'category'})

In [None]:
# Final layout of the test rows before they are stacked with the training rows.
testing.head()

**We want to predict the monthly total of "item_cnt_day" for the coming month**

In [None]:
# Monthly training rows, shown for comparison with the test layout.
agg.head()

In [None]:
# Stack the training rows on top of the test rows; columns that exist only in
# `agg` are filled with NaN for the test rows.
frames_to_stack = [agg, testing]
data = pd.concat(frames_to_stack, axis=0)

In [None]:
# Combined training + test table.
data

**Note: normally I would do this preprocessing in an organized pipeline or using functions, but I decided to leave this code disjoint like this to make the process of forming the data frame we will use clear and understandable**

***

## Now let's prepare our lagged features that we are going to use in modeling

In [None]:
import sqlite3

# Copy the combined frame into an in-memory SQLite database as table `datas`.
# Using the connection as a context manager only wraps the load in a
# transaction; it does not close the connection, so `conn` remains usable by
# the query cell that follows.
conn = sqlite3.connect(":memory:")
with conn:
    data.to_sql(name="datas", con=conn, index=False)

In [None]:
# Column names that the SQL query below can reference.
data.columns

In [None]:
# Build history features for every (shop_id, item_id) pair with SQL window
# functions, ordered by date_block_num inside each pair:
#   * lag1..lag4 - item_cnt_day of the 1st..4th previous ROW of the pair.
#                  These are row offsets, not calendar-month offsets, so for
#                  a pair with no row in some month the "previous" value can
#                  be older than the lag number suggests.
#   * lag_price  - item_price of the previous row.
#   * avg_lag / min_lag / max_lag - mean / min / max of item_cnt_day over all
#                  earlier rows of the pair, excluding the current row.
# The current item_price is not selected, only its lagged value.
# The result is sorted by shop_id, item_id, date_block_num, so its row order
# differs from the row order of `data`.
ex1_sql_query = """
SELECT
    date_block_num,shop_id,item_id,region,item_category_id,item_cnt_day,
    lag(item_cnt_day) over ( partition by shop_id,item_id order by date_block_num) as lag1,
    lag(item_cnt_day,2) over ( partition by shop_id,item_id order by date_block_num) as lag2,
    lag(item_cnt_day,3) over ( partition by shop_id,item_id order by date_block_num) as lag3,
    lag(item_cnt_day,4) over ( partition by shop_id,item_id order by date_block_num) as lag4,
    lag(item_price) over ( partition by shop_id,item_id order by date_block_num) as lag_price,
    avg(item_cnt_day) over ( partition by shop_id,item_id order by date_block_num rows between unbounded preceding and 1 preceding ) as avg_lag,
    min(item_cnt_day) over ( partition by shop_id,item_id order by date_block_num rows between unbounded preceding and 1 preceding ) as min_lag,
    max(item_cnt_day) over ( partition by shop_id,item_id order by date_block_num rows between unbounded preceding and 1 preceding ) as max_lag
    --lead(item_cnt_day) over ( partition by shop_id,item_id order by date_block_num  ) as labels

FROM
    datas
order by shop_id,item_id,date_block_num
"""
# `conn` is the in-memory SQLite connection opened in the cell that loaded
# `data` into the `datas` table.
ex1_sql = pd.read_sql(ex1_sql_query, con=conn)
ex1_sql

In [None]:
# Training rows are every month block except 34 (the block assigned to the
# test rows). Take an explicit copy: the frame is modified afterwards, and
# mutating a bare slice of `ex1_sql` triggers SettingWithCopyWarning.
TRAIN = ex1_sql.loc[ex1_sql.date_block_num != 34].copy()

In [None]:
# Rows to predict are the ones tagged with month block 34. Take an explicit
# copy: the frame is modified afterwards, and mutating a bare slice of
# `ex1_sql` triggers SettingWithCopyWarning.
Test = ex1_sql.loc[ex1_sql.date_block_num == 34].copy()

In [None]:
# Remove the month block and the target column from the prediction features.
Test = Test.drop(columns=['date_block_num', 'item_cnt_day'])

In [None]:
# The month block is not used as a model feature.
TRAIN = TRAIN.drop(columns=['date_block_num'])

In [None]:
# Target: units sold per (shop, item) row.
trainy = TRAIN.loc[:, 'item_cnt_day']

In [None]:
# Features: every remaining column except the target.
trainx = TRAIN.drop(columns=['item_cnt_day'])

In [None]:
# Store the identifier-like columns of the prediction features as pandas
# categoricals.
Test = Test.astype({name: 'category' for name in ('region', 'shop_id', 'item_id')})

In [None]:
# Store the identifier-like columns of the training features as pandas
# categoricals.
trainx = trainx.astype({name: 'category' for name in ('region', 'shop_id', 'item_id')})

**Data is ready for modeling**

***

# 🛠Modeling

**Disclaimer: we are just testing out the approach of solving TS problems using tree-based models, so don't expect any fancy results since I will not be tuning the model**

In [None]:
# LightGBM logo.
display.Image(
    "../input/lightgbm1/lightgbm.png",
    width=350,
    height=350,
)

**LightGBM is a gradient boosting framework that uses tree based learning algorithms.                     
The main difference between LightGBM and XGBoost is that in XGBoost, trees grow depth-wise while in LightGBM, trees grow leaf-wise**

In [None]:
# Illustration of leaf-wise (LightGBM) versus depth-wise (XGBoost) tree growth.
display.Image(
    "../input/lgbmvsxg/LightGBM vs XGboost.png",
    width=850,
    height=400,
)

In [None]:
# `iterations` is CatBoost's name for the number of boosting rounds; LightGBM
# does not recognise it (it only warns about an unknown parameter), so the
# model was silently trained with LightGBM's default of 100 trees. Use the
# real parameter, `n_estimators`, and keep the value that was effectively in
# use. Raise it deliberately (ideally with a validation set and early
# stopping) if a longer training run is wanted.
lgbm = LGBMRegressor(verbose=1, n_estimators=100, device="GPU")
lgbm.fit(trainx, trainy, categorical_feature=['region', 'shop_id'])

In [None]:
# Re-read test.csv to obtain the submission IDs in their original order.
submission_source = '../input/competitive-data-science-predict-future-sales/test.csv'
sub = pd.read_csv(submission_source)

In [None]:
# The submission file has no shop/item columns, so drop them.
sub = sub.drop(columns=['shop_id', 'item_id'])


In [None]:
# `Test` comes out of the SQL query sorted by (shop_id, item_id), whereas
# `sub` follows the ID order of test.csv. Assigning the raw prediction array
# positionally would attach each prediction to the wrong ID, so key the
# predictions by (shop_id, item_id) and look them up through the original
# `test` table, which still holds ID, shop_id and item_id.
predictions = pd.DataFrame({
    'shop_id': Test['shop_id'].astype('int64').to_numpy(),
    'item_id': Test['item_id'].astype('int64').to_numpy(),
    'item_cnt_month': lgbm.predict(Test),
})
# validate='one_to_one' fails loudly if a (shop_id, item_id) pair is duplicated.
id_lookup = test[['ID', 'shop_id', 'item_id']].merge(
    predictions,
    on=['shop_id', 'item_id'],
    how='left',
    validate='one_to_one',
)
sub['item_cnt_month'] = sub['ID'].map(id_lookup.set_index('ID')['item_cnt_month'])


In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)

**If you found this notebook helpful, [check out my previous notebooks!](https://www.kaggle.com/abdelrahmanohassan/code)**

In [None]:
# Closing "thank you" image.
display.Image("../input/thank-you/thanks.jpg")