<a href="https://www.kaggle.com/adamgraneto/app-dev-sales-forecasting?scriptVersionId=89538559" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Based on: https://www.kaggle.com/allunia/e-commerce-sales-forecast/notebook
# And: https://github.com/tobyloki/EarthquakePredicteur
# Data Sets Included: 
#    ecommercedata: https://www.kaggle.com/adamgraneto/ecommercedata (from based on kaggle notebook above)
#    fashion-products-on-amazoncom: https://www.kaggle.com/hkhamnakhalid/fashion-products-on-amazoncom
#    Marketing: Electronic Products and Pricing Data: https://www.kaggle.com/arashnic/e-product-pricing
#    Sales Product Data: https://www.kaggle.com/knightbearr/sales-product-data

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

from catboost import CatBoostRegressor, Pool, cv
from catboost import MetricVisualizer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sales-product-data/Sales_August_2019.csv
/kaggle/input/sales-product-data/Sales_May_2019.csv
/kaggle/input/sales-product-data/Sales_February_2019.csv
/kaggle/input/sales-product-data/Sales_November_2019.csv
/kaggle/input/sales-product-data/Sales_January_2019.csv
/kaggle/input/sales-product-data/Sales_March_2019.csv
/kaggle/input/sales-product-data/Sales_September_2019.csv
/kaggle/input/sales-product-data/Sales_April_2019.csv
/kaggle/input/sales-product-data/Sales_July_2019.csv
/kaggle/input/sales-product-data/Sales_October_2019.csv
/kaggle/input/sales-product-data/Sales_June_2019.csv
/kaggle/input/sales-product-data/Sales_December_2019.csv
/kaggle/input/ecommercedata/data.csv
/kaggle/input/ecommercetestdata/Test Data - Sheet1.csv
/kaggle/input/fashion-products-on-amazoncom/amazon_co-ecommerce_sample.csv
/kaggle/input/e-product-pricing/ElectronicsProductsPricingData.csv


# Load in Ecommerce Data

In [2]:
data = pd.read_csv("../input/ecommercedata/data.csv", encoding="ISO-8859-1", dtype={'CustomerID': str})

# Remove Nulls:
data.loc[data.Description.isnull()==False, "lowercase_descriptions"] = data.loc[
    data.Description.isnull()==False,"Description"
].apply(lambda l: l.lower())
data.lowercase_descriptions.dropna().apply(
    lambda l: np.where("nan" in l, True, False)
).value_counts()
data = data.loc[(data.CustomerID.isnull()==False) & (data.lowercase_descriptions.isnull()==False)].copy() 

data.head

<bound method NDFrame.head of        InvoiceNo StockCode                          Description  Quantity  \
0         536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1         536365     71053                  WHITE METAL LANTERN         6   
2         536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3         536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4         536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   
...          ...       ...                                  ...       ...   
541904    581587     22613          PACK OF 20 SPACEBOY NAPKINS        12   
541905    581587     22899         CHILDREN'S APRON DOLLY GIRL          6   
541906    581587     23254        CHILDRENS CUTLERY DOLLY GIRL          4   
541907    581587     23255      CHILDRENS CUTLERY CIRCUS PARADE         4   
541908    581587     22138        BAKING SET 9 PIECE RETROSPOT          3   

            InvoiceDate  UnitPrice CustomerID

In [3]:
data['InvoiceDate']= pd.to_datetime(data['InvoiceDate'])
print(data.dtypes)


data["Revenue"] = data.Quantity * data.UnitPrice

data["Year"] = data.InvoiceDate.dt.year
data["Quarter"] = data.InvoiceDate.dt.quarter
data["Month"] = data.InvoiceDate.dt.month
data["Week"] = data.InvoiceDate.dt.week
data["Weekday"] = data.InvoiceDate.dt.weekday
data["Day"] = data.InvoiceDate.dt.day
data["Dayofyear"] = data.InvoiceDate.dt.dayofyear
data["Date"] = pd.to_datetime(data[['Year', 'Month', 'Day']])

grouped_features = ["Date", "Year", "Quarter","Month", "Week", "Weekday", "Dayofyear", "Day",
                    "StockCode", "Description", "Country"]

InvoiceNo                         object
StockCode                         object
Description                       object
Quantity                           int64
InvoiceDate               datetime64[ns]
UnitPrice                        float64
CustomerID                        object
Country                           object
lowercase_descriptions            object
dtype: object


  # Remove the CWD from sys.path while we load stuff.


In [4]:
daily_data = pd.DataFrame(data.groupby(grouped_features).Quantity.sum(),
                          columns=["Quantity"])
daily_data["Revenue"] = data.groupby(grouped_features).Revenue.sum()
daily_data = daily_data.reset_index()

low_quantity = daily_data.Quantity.quantile(0.01)
high_quantity = daily_data.Quantity.quantile(0.99)

low_revenue = daily_data.Revenue.quantile(0.01)
high_revenue = daily_data.Revenue.quantile(0.99)

daily_data = daily_data.loc[
    (daily_data.Quantity >= low_quantity) & (daily_data.Quantity <= high_quantity)]
daily_data = daily_data.loc[
    (daily_data.Revenue >= low_revenue) & (daily_data.Revenue <= high_revenue)]

week = daily_data.Week.max() - 2

# Feature Engineering

In [5]:
products = pd.DataFrame(index=data.loc[data.Week < week].StockCode.unique(), columns = ["MedianPrice"])

products["MedianPrice"] = data.loc[data.Week < week].groupby("StockCode").UnitPrice.median()
products["MedianQuantities"] = data.loc[data.Week < week].groupby("StockCode").Quantity.median()
products["Customers"] = data.loc[data.Week < week].groupby("StockCode").CustomerID.nunique()
# products["DescriptionLength"] = data.loc[data.Week < week].groupby("StockCode").DescriptionLength.median()
#products["StockCode"] = products.index.values
org_cols = np.copy(products.columns.values)
products.head()

Unnamed: 0,MedianPrice,MedianQuantities,Customers
85123A,2.95,6.0,827
71053,3.75,4.0,138
84406B,4.15,4.0,126
84029G,4.25,4.0,172
84029E,4.25,4.0,177


# Train Model

In [6]:
class Catmodel:
    
    def __init__(self, name, params):
        self.name = name
        self.params = params
    
    def set_data_pool(self, train_pool, val_pool):
        self.train_pool = train_pool
        self.val_pool = val_pool
    
    def set_data(self, X, y, week):
        cat_features_idx = np.where(X.dtypes != np.float)[0]
        x_train, self.x_val = X.loc[X.Week < week], X.loc[X.Week >= week]
        y_train, self.y_val = y.loc[X.Week < week], y.loc[X.Week >= week]
        self.train_pool = Pool(x_train, y_train, cat_features=cat_features_idx)
        self.val_pool = Pool(self.x_val, self.y_val, cat_features=cat_features_idx)
    
    def prepare_model(self):
        self.model = CatBoostRegressor(
                loss_function = self.params.loss[0],
                random_seed = self.params.seed,
                logging_level = 'Silent',
                iterations = self.params.iterations,
                max_depth = self.params.max_depth[0],
                #learning_rate = self.params.learning_rate[0],
                l2_leaf_reg = self.params.l2_leaf_reg[0],
                od_type='Iter',
                od_wait=40,
                train_dir=self.name,
                has_time=True
            )
    
    def learn(self, plot=False):
        self.prepare_model()
        self.model.fit(self.train_pool, eval_set=self.val_pool, plot=plot);
        print("{}, early-stopped model tree count {}".format(
            self.name, self.model.tree_count_
        ))
    
    def score(self):
        return self.model.score(self.val_pool)
    
    def show_importances(self, kind="bar"):
        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(self.val_pool)
        if kind=="bar":
            return shap.summary_plot(shap_values, self.x_val, plot_type="bar")
        return shap.summary_plot(shap_values, self.x_val)
    
    def get_val_results(self):
        self.results = pd.DataFrame(self.y_val)
        self.results["prediction"] = self.predict(self.x_val)
        self.results["error"] = np.abs(
            self.results[self.results.columns.values[0]].values - self.results.prediction)
        self.results["Month"] = self.x_val.Month
        self.results["SquaredError"] = self.results.error.apply(lambda l: np.power(l, 2))
    
    def show_val_results(self):
        self.get_val_results()
        fig, ax = plt.subplots(1,2,figsize=(20,5))
        sns.distplot(self.results.error, ax=ax[0])
        ax[0].set_xlabel("Single absolute error")
        ax[0].set_ylabel("Density")
        self.median_absolute_error = np.median(self.results.error)
        print("Median absolute error: {}".format(self.median_absolute_error))
        ax[0].axvline(self.median_absolute_error, c="black")
        ax[1].scatter(self.results.prediction.values,
                      self.results[self.results.columns[0]].values,
                      c=self.results.error, cmap="RdYlBu_r", s=1)
        ax[1].set_xlabel("Prediction")
        ax[1].set_ylabel("Target")
        return ax
    
    def get_monthly_RMSE(self):
        return self.results.groupby("Month").SquaredError.mean().apply(lambda l: np.sqrt(l))
        
    def predict(self, x):
        return self.model.predict(x)
    
    def get_dependence_plot(self, feature1, feature2=None):
        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(self.val_pool)
        if feature2 is None:
            return shap.dependence_plot(
                feature1,
                shap_values,
                self.x_val,
            )
        else:
            return shap.dependence_plot(
                feature1,
                shap_values,
                self.x_val,
                interaction_index=feature2
            )
        
    def save_model(self, fname):
        return self.model.save_model(fname)

In [7]:
class CatHyperparameter:
    
    def __init__(self,
                 loss="RMSE",
                 metric="RMSE",
                 iterations=1000,
                 max_depth=4,
                 l2_leaf_reg=3,
                 #learning_rate=0.5,
                 seed=0):
        self.loss = loss,
        self.metric = metric,
        self.max_depth = max_depth,
        self.l2_leaf_reg = l2_leaf_reg,
        #self.learning_rate = learning_rate,
        self.iterations=iterations
        self.seed = seed

In [8]:
X = daily_data.drop(["Quantity", "Revenue", "Date"], axis=1)
# X = daily_data.drop(["Revenue", "Date"], axis=1)
y = daily_data.Revenue
params = CatHyperparameter()

model = Catmodel("baseline", params)
model.set_data(X,y, week)
model.learn(plot=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if sys.path[0] == '':


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

baseline, early-stopped model tree count 173


In [9]:
print(X)

        Year  Quarter  Month  Week  Weekday  Dayofyear  Day StockCode  \
0       2010        4     12    48        2        335    1     10002   
1       2010        4     12    48        2        335    1     10002   
2       2010        4     12    48        2        335    1     10125   
3       2010        4     12    48        2        335    1     10133   
4       2010        4     12    48        2        335    1    15044B   
...      ...      ...    ...   ...      ...        ...  ...       ...   
251293  2011        4     12    49        4        343    9    85123A   
251294  2011        4     12    49        4        343    9     85152   
251295  2011        4     12    49        4        343    9     85173   
251297  2011        4     12    49        4        343    9      POST   
251298  2011        4     12    49        4        343    9      POST   

                               Description         Country  
0              INFLATABLE POLITICAL GLOBE           France  
1

In [10]:
model.score()

0.1678060831657744

### Save Model

In [11]:
# model.save_model('EcommerceModel.cbm')

### Load a single test file and predict

In [12]:
test = pd.read_csv('../input/ecommercetestdata/Test Data - Sheet1.csv')

X_test = pd.DataFrame()
test['InvoiceDate']= pd.to_datetime(test['InvoiceDate'])
# X_test = X_test.append(test, ignore_index=True)

# print(test.loc[0].InvoiceDate)

X_test["Year"] = test.InvoiceDate.dt.year
X_test["Quarter"] = test.InvoiceDate.dt.quarter
X_test["Month"] = test.InvoiceDate.dt.month
X_test["Week"] = test.InvoiceDate.dt.week
X_test["Weekday"] = test.InvoiceDate.dt.weekday
X_test["Dayofyear"] = test.InvoiceDate.dt.dayofyear
X_test["Day"] = test.InvoiceDate.dt.day
X_test["StockCode"] = test.StockCode
X_test["Description"] = test.Description
X_test["Country"] = test.Country


# X_test = X_test.drop(["Quantity", "InvoiceDate"], axis=1)
# X_test = X_test.astype({"UnitPrice": str})
print(X_test)
# y_test = test['time_to_failure'].mean()

   Year  Quarter  Month  Week  Weekday  Dayofyear  Day StockCode  \
0  2022        1      2     9        0         59   28    AAAAAA   
1  2022        1      3     9        1         60    1    123456   
2  2022        1      3     9        2         61    2    987654   

                    Description        Country  
0                 Deck of cards  United States  
1  App Development Club T-shirt  United States  
2                           Pen  United States  


  if sys.path[0] == '':


In [13]:
preds = model.predict(X_test) # m.predict(showcase_x)
print(preds) 

[24.64953339 24.73966528 26.22732573]


In [14]:
# Making prediction 
preds = model.predict(X_test) # m.predict(showcase_x)

print(preds) 
# print(y_test) # print(showcase_y.average)

[24.64953339 24.73966528 26.22732573]
