In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import the packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation

In [None]:
from xgboost import XGBRegressor
from string import punctuation
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

shops df

In [None]:
# Load the shops table; the trailing expression displays its (rows, columns) shape.
shops = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/shops.csv"
)
shops.shape

item_category

In [None]:
# Load the item-category table; the trailing expression displays its shape.
items_category = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv"
)
items_category.shape

items df

In [None]:
# Load the items table; the trailing expression displays its shape.
items = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/items.csv"
)
items.shape

In [None]:
# Let's see the top 10 and bottom 10 item categories
# Number of items per category, kept as a one-column frame indexed by item_category_id.
items_per_category = items.groupby("item_category_id").size()
items_gb = items_per_category.to_frame()

In [None]:
# Display every item that belongs to category 60.
items.loc[items["item_category_id"].eq(60)]

sales df

In [None]:
# Load the daily sales records; the trailing expression displays the shape.
sales = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv"
)
sales.shape

In [None]:
# let's correct the shops df and also generate a few more features
def fix_shops(shops):
    '''
    Clean the shops df in place (nothing is returned).

    Steps:
    1. Remap the 4 shop ids treated as duplicates (0->57, 1->58, 10->11, 23->24).
    2. Drop a remapped row when the shop it now points to already has its own
       row, so that shop_id stays unique. Without this, a later merge on
       shop_id matches every sale of such a shop twice.
    3. Add three columns:
       - shop_name_cleaned: shop_name with ASCII punctuation removed
       - city: first whitespace-separated token of shop_name_cleaned
       - city_id: LabelEncoder code of city

    Parameters
    ----------
    shops : pd.DataFrame
        Must contain the columns "shop_id" and "shop_name". Modified in place.
    '''

    duplicate_ids = {0: 57, 1: 58, 10: 11, 23: 24}

    # Remember which rows carry an alias id before the ids are rewritten.
    was_alias = shops["shop_id"].isin(list(duplicate_ids))

    # dict.get(x, x) leaves ids that are not in the dictionary untouched;
    # a plain .map(dict) would turn them into NaN.
    shops["shop_id"] = shops["shop_id"].map(
        lambda shop_id: duplicate_ids.get(shop_id, shop_id)
    )

    # An alias row is redundant only if the canonical shop has a row of its own;
    # otherwise the renumbered alias row is the only record of that shop and is kept.
    canonical_ids = set(shops.loc[~was_alias, "shop_id"])
    redundant = was_alias & shops["shop_id"].isin(canonical_ids)
    shops.drop(index=shops.index[redundant], inplace=True)

    # remove all the punctuation from the shop_name column
    strip_punctuation = str.maketrans("", "", punctuation)
    shops["shop_name_cleaned"] = shops["shop_name"].map(
        lambda name: name.translate(strip_punctuation)
    )

    # the city name is the first word of the cleaned shop name
    shops["city"] = shops["shop_name_cleaned"].map(lambda name: name.split()[0])
    # encode it using a simple LabelEncoder
    shops["city_id"] = LabelEncoder().fit_transform(shops["city"])

In [None]:
# a simple function that creates a global df with all joins and also shops corrections
def create_df():
    '''
    Build the train df by joining sales with items, item categories and shops.

    The four csv files are read from the Kaggle input directory, the duplicated
    shop ids are remapped in both shops and sales, and the result is sorted by
    shop_id and then chronologically by date.

    Returns
    -------
    pd.DataFrame
        One row per sales record with the item, category and shop columns
        attached. The "date" column is returned as read from the csv
        (dd.mm.YYYY strings); it is only parsed internally for sorting.
    '''
    data_dir = "/kaggle/input/competitive-data-science-predict-future-sales/"

    # import all df
    shops = pd.read_csv(data_dir + "shops.csv")
    fix_shops(shops)  # fix the shops as we have seen before
    # Remapping ids can leave several rows sharing one shop_id; the merge below
    # would then duplicate every sale of that shop. Keep one row per id.
    # NOTE(review): keep="last" assumes the canonical row comes after its alias,
    # as it does when shops.csv is ordered by shop_id -- confirm.
    shops = shops.drop_duplicates(subset="shop_id", keep="last")

    items_category = pd.read_csv(data_dir + "item_categories.csv")
    items = pd.read_csv(data_dir + "items.csv")
    sales = pd.read_csv(data_dir + "sales_train.csv")

    # fix shop_id in sales so that we can later merge the df
    duplicate_ids = {0: 57, 1: 58, 10: 11, 23: 24}
    sales["shop_id"] = sales["shop_id"].map(
        lambda shop_id: duplicate_ids.get(shop_id, shop_id)
    )

    # create df by merging the previous dataframes
    df = pd.merge(items, items_category, on="item_category_id")
    df = pd.merge(sales, df, on="item_id")
    df = pd.merge(df, shops, on="shop_id")

    # "date" is still a dd.mm.YYYY string here, so sorting on it directly would
    # order rows by day-of-month first. Sort on a parsed helper column instead
    # and drop it again so the returned columns are unchanged.
    parsed_dates = pd.to_datetime(df["date"], format="%d.%m.%Y")
    df = (
        df.assign(_date_key=parsed_dates)
        .sort_values(by=["shop_id", "_date_key"], ascending=True)
        .drop(columns="_date_key")
    )

    return df

In [None]:
# Build the merged training frame; the trailing expression displays its (rows, columns) shape.
df = create_df()
df.shape

In [None]:
# Parse the dd.mm.YYYY date strings and derive the calendar year and month of every sale.
parsed_dates = pd.to_datetime(df["date"], format = "%d.%m.%Y")
df["date"] = parsed_dates

df["Year"] = parsed_dates.dt.year
df["Month"] = parsed_dates.dt.month

In [None]:
import plotly.express as px # library for interactive plots

from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
# Total items sold per calendar month across all shops, drawn as a line chart.
daily_counts = df[["date", "item_cnt_day"]].set_index("date")
x = daily_counts.resample("M").sum()

fig = px.line(
    x,
    x=x.index,
    y=x["item_cnt_day"],
    title="Monthly sales",
    labels={"item_cnt_day": ""},
)
fig.show()

In [None]:
# Total items sold per week across all shops, drawn as a line chart.
daily_counts = df[["date", "item_cnt_day"]].set_index("date")
x = daily_counts.resample("W").sum()

fig = px.line(
    x,
    x=x.index,
    y=x["item_cnt_day"],
    title="Weekly sales",
    labels={"item_cnt_day": ""},
)
fig.show()

Represent the monthly sales (left plot) and weekly sales (right plot) for each shop

In [None]:
import datetime

# Mark the national holidays in Russia and see if there is any connection with sales spikes.
# The same holiday periods are listed for each of these years.
_HOLIDAY_YEARS = (2013, 2014, 2015)

# (month, first day, last day) of each holiday period.
# Only the January period spans several days (1st to 8th); the rest are single days.
_HOLIDAY_PERIODS = (
    (1, 1, 8),
    (2, 23, 23),
    (3, 8, 8),
    (5, 1, 1),
    (5, 9, 9),
    (6, 12, 12),
    (11, 4, 4),
)

# First day of every holiday period, year by year.
russian_holidays_start = [
    datetime.datetime(year, month, first_day)
    for year in _HOLIDAY_YEARS
    for month, first_day, _last_day in _HOLIDAY_PERIODS
]

# Last day of every holiday period, aligned index-by-index with russian_holidays_start.
russian_holidays_end = [
    datetime.datetime(year, month, last_day)
    for year in _HOLIDAY_YEARS
    for month, _first_day, last_day in _HOLIDAY_PERIODS
]

In [None]:
def _holiday_shapes(starts, ends, xref):
    """Return one shaded plotly rectangle per (start, end) holiday period.

    The rectangles span the full plot height (yref "paper") and are attached
    to the x axis named by `xref` ("x" for the first subplot, "x2" for the second).
    """
    return [
        {
            "x0": start_date,
            "x1": end_date,
            "y0": 0,
            "y1": 1,
            "type": "rect",
            "xref": xref,
            "yref": "paper",
            "opacity": 0.8,
            "fillcolor": "#d3d3d3",
            "line_width": 0.1,
        }
        for start_date, end_date in zip(starts, ends)
    ]


def _resampled_shop_sales(shop_sales, rule):
    """Sum item_cnt_day per `rule` period and add a 3-period moving average (column "MA3").

    The series is resampled directly rather than per calendar year: grouping by
    year first splits the week that spans New Year into two rows with the same date.
    """
    totals = (
        shop_sales.set_index("date")["item_cnt_day"]
        .resample(rule)
        .sum()
        .reset_index()
    )
    totals["MA3"] = totals["item_cnt_day"].rolling(window=3).mean()
    return totals


def _add_sales_traces(fig, totals, col, sales_label, average_label):
    """Add the sales line and its moving average to subplot column `col` of `fig`."""
    fig.add_trace(
        go.Scatter(x=totals["date"], y=totals["item_cnt_day"], mode='lines', name=sales_label),
        row=1, col=col,
    )
    fig.add_trace(
        go.Scatter(x=totals["date"], y=totals["MA3"], mode='lines', name=average_label),
        row=1, col=col,
    )


# The holiday shading is the same for every shop, so build it once:
# one set of rectangles for the left (monthly) plot and one for the right (weekly) plot.
holiday_shapes = (
    _holiday_shapes(russian_holidays_start, russian_holidays_end, "x")
    + _holiday_shapes(russian_holidays_start, russian_holidays_end, "x2")
)

for shop_name in sorted(list(df["shop_name"].unique())[:5]):

    # Daily sales of this shop. .assign() works on a copy, so the global df is
    # untouched and no chained-assignment warning is raised.
    shop_sales = df.loc[df["shop_name"] == shop_name, ["date", "item_cnt_day"]].assign(
        date=lambda frame: pd.to_datetime(frame["date"], format="%d.%m.%Y")
    )

    monthly_totals = _resampled_shop_sales(shop_sales, "M")
    weekly_totals = _resampled_shop_sales(shop_sales, "W")

    # Monthly sales on the left, weekly sales on the right.
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Monthly sales", "Weekly sales"))
    _add_sales_traces(
        fig, monthly_totals, 1,
        "Monthly sales", "Average sales of the last 3 months",
    )
    _add_sales_traces(
        fig, weekly_totals, 2,
        "Weekly sales", "Average sales of the last 3 weeks",
    )

    fig.update_layout(
        title='Monthly and weekly sales of shop {}'.format(shop_name),
        shapes=holiday_shapes,
        height=600,
        width=1000,
    )
    fig.show()

Represent the monthly sales (left plot) and weekly sales (right plot) for each item category

Total sales and the variation on secondary axis

In [None]:
# Variation of intraday sales

**Calendar Heatmap**

see the overall activity for a certain period of time per day and per month

In [None]:
# Keep only the daily item counts, indexed by the sale date.
calendar_columns = ["date", "item_cnt_day"]
df_calendar = df[calendar_columns].set_index("date")


In [None]:
# df_calendar holds one row per sale record, so it cannot be drawn as an image
# directly. Aggregate it into a month x day-of-month matrix of total items sold;
# that matrix is the calendar heatmap.
daily_sales = df_calendar["item_cnt_day"].groupby(level=0).sum()
calendar_matrix = (
    daily_sales
    .groupby([daily_sales.index.strftime("%Y-%m"), daily_sales.index.day])
    .sum()
    .unstack()  # rows: year-month, columns: day of month; NaN where a day does not exist
)

fig = px.imshow(
    calendar_matrix.to_numpy(),
    x=list(calendar_matrix.columns),
    y=list(calendar_matrix.index),
    labels={"x": "Day of month", "y": "Month", "color": "Items sold"},
    aspect="auto",
)
fig.update_layout(title="Items sold per day")
fig.show()

**Time Series Autocorrelation and Partial Autocorrelation plots: daily sales**

These plots are fundamental in time series analysis.

Feature Engineering

![](http://)![](http://)![](http://)