In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
filename = '../inputs/Lacteos.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Date,Account_id,Product_id,Category,Quantity
0,2022-01-03,34685910,34254,Lacteos,1
1,2022-01-03,33223287,34244,Lacteos,2
2,2022-01-03,38095128,34244,Lacteos,3
3,2022-01-03,38095128,34374,Lacteos,1
4,2022-01-03,33229395,34244,Lacteos,25


In [3]:
train, test = train_test_split(df, test_size=0.30)

In [4]:
train.shape

(1154484, 5)

In [5]:
test.shape

(494780, 5)

In [6]:
def date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Import the DataFrame to be able to extract the data characteristics and transform

    Arguments:
        df: data frame

    Return:
        Return a DataFrame with its date characteristics
    """

    df['date'] = pd.to_datetime(df['Date'])
    df['year'] = df.date.dt.year
    df['month'] = df.date.dt.month
    df['day'] = df.date.dt.day
    df['dayofyear'] = df.date.dt.dayofyear
    df['dayofweek'] = df.date.dt.dayofweek
    df['weekofyear'] = df.date.dt.weekofyear
    
    # Additionnal Data Features
    df['day^year'] = np.log((np.log(df['dayofyear'] + 1)) ** (df['year'] - 2000))
    
    # Drop date
    df.drop('Date', axis=1, inplace=True)
    
    return df

# Dates Features for Train, Test
train = date_features(train)
test = date_features(test)

  df['weekofyear'] = df.date.dt.weekofyear
  df['weekofyear'] = df.date.dt.weekofyear


In [7]:
train.head()

Unnamed: 0,Account_id,Product_id,Category,Quantity,date,year,month,day,dayofyear,dayofweek,weekofyear,day^year
321637,33235134,34244,Lacteos,5,2022-03-29,2022,3,29,88,1,13,33.034077
317369,33751140,34244,Lacteos,1,2022-03-28,2022,3,28,87,0,13,32.978625
736183,33227130,34244,Lacteos,2,2022-05-10,2022,5,10,130,1,19,34.851533
426941,38924232,34244,Lacteos,1,2022-04-08,2022,4,8,98,4,14,33.549886
687298,34399230,34246,Lacteos,1,2022-05-04,2022,5,4,124,2,18,34.63894


In [8]:
test.head()

Unnamed: 0,Account_id,Product_id,Category,Quantity,date,year,month,day,dayofyear,dayofweek,weekofyear,day^year
1003423,33255837,34244,Lacteos,1,2022-05-31,2022,5,31,151,1,22,35.512458
1291372,33250254,34250,Lacteos,1,2022-06-21,2022,6,21,172,1,25,36.071984
210893,37309275,34246,Lacteos,1,2022-03-04,2022,3,4,63,4,9,31.355424
1401277,39893418,34244,Lacteos,2,2022-06-29,2022,6,29,180,2,26,36.26413
1426566,37599954,34244,Lacteos,1,2022-07-02,2022,7,2,183,5,26,36.333589


In [9]:
# Daily Average, Monthly Average for train
train['daymonth_avg']  = train.groupby(['Product_id','Account_id', 'dayofweek'])['Quantity'].transform('mean')
train['monthly_avg'] = train.groupby(['Product_id','Account_id','month'])['Quantity'].transform('mean')
train = train.dropna()

# Average sales for Day_of_week = d per 'Category', 'Account_id'
daymonth_avg = train.groupby(['Product_id','Account_id','dayofweek'])['Quantity'].mean().reset_index()

# Average sales for Month = m per 'Category','Account_id' 
monthly_avg = train.groupby(['Product_id','Account_id','month'])['Quantity'].mean().reset_index()



# Merge Test with Daily Avg, Monthly Avg
def merge(df1: pd.DataFrame, df2: pd.DataFrame, col: list, col_name: str) -> pd.DataFrame:
    """This is function to do merge whit DataFrames and strings what representation a new DataFrame

    Args: 
        df1: pd.DataFrame is a df of test
        df2: pd.DataFrame is a df of average sales for day of week or month
        col: list with features of test
        col_name: str with name of column per feature of week or month 
    Return: 
          The return value is a pd.DataFrame with features news
    """
    
    df1 =pd.merge(df1, 
                  df2, 
                  how='left', 
                  on=None, 
                  left_on=col, 
                  right_on=col,
                  left_index=False, 
                  right_index=False, 
                  sort=True,
                  copy=True, 
                  indicator=False)
    
    df1 = df1.rename(columns={'sales':col_name})
    return df1


In [11]:

# Add Daily_avg and Monthly_avg features to test 
test = merge(test, daymonth_avg, ['Product_id','Account_id','dayofweek'],'daymonth_avg')
test = merge(test, monthly_avg, ['Product_id','Account_id','month'],'monthly_avg')

# Sales Rolling mean sequence per item 
rolling_10 = train.groupby(['Product_id'])['Quantity'].rolling(10).mean().reset_index().drop('level_1', axis=1)
train['rolling_mean'] = rolling_10['Quantity'] 

# 90 last days of training rolling mean sequence added to test data
rolling_last90 = train.groupby(['Product_id','Quantity'])['rolling_mean'].tail(90).copy()
test['rolling_mean'] = rolling_last90.reset_index().drop('index', axis=1)

# Shifting rolling mean 3 months
train['rolling_mean'] = train.groupby(['Product_id'])['rolling_mean'].shift(90) # Create a feature with rolling mean of day - 90
train.head()

Unnamed: 0,Account_id,Product_id,Category,Quantity,date,year,month,day,dayofyear,dayofweek,weekofyear,day^year,daymonth_avg,monthly_avg,rolling_mean
321637,33235134,34244,Lacteos,5,2022-03-29,2022,3,29,88,1,13,33.034077,2.936508,5.0,
317369,33751140,34244,Lacteos,1,2022-03-28,2022,3,28,87,0,13,32.978625,1.0,1.0,
736183,33227130,34244,Lacteos,2,2022-05-10,2022,5,10,130,1,19,34.851533,1.677966,2.0,
426941,38924232,34244,Lacteos,1,2022-04-08,2022,4,8,98,4,14,33.549886,21.532646,1.0,
687298,34399230,34246,Lacteos,1,2022-05-04,2022,5,4,124,2,18,34.63894,1.0,1.0,


In [20]:
test.head()

Unnamed: 0,Product_id,Quantity_x,date,year,dayofweek,day^year,Quantity_y,Quantity,rolling_mean
0,34244,1,2022-07-21,2022,3,36.744299,1.0,1.0,1.0
1,34244,1,2022-07-21,2022,3,36.744299,1.0,1.0,1.0
2,34244,1,2022-07-21,2022,3,36.744299,1.0,1.0,1.9
3,34244,1,2022-07-21,2022,3,36.744299,1.0,1.0,1.4
4,34244,1,2022-07-21,2022,3,36.744299,1.0,1.0,6.1


In [13]:
# Clean features highly correlated to each others
for df in [train, test]:
    df.drop(['dayofyear', 
             'weekofyear',
             'day',
             'month',
             'Category',
             'Account_id'],
                axis=1, 
                inplace=True)

In [23]:
# Features Scaling (except sales)
sales_series, id_series = train['Quantity'], test['Product_id']

# Features Scaling
train = ((train - train.mean()) / train.std())
test = ((test - test.mean()) / test.std())

# Retrieve actual Sales values and ID
train['Quantity'] = sales_series
test['Product_id'] = id_series

# Training Data
X_train = train.drop('Quantity', axis=1).dropna()
y_train = train['Quantity']

# Test Data
test.sort_values(by=['Product_id'], inplace=True)
X_test = test.drop('Product_id', axis=1)

#df = train
df_train = train.copy()

# Train Test Split
X_train , X_test ,y_train, y_test = train_test_split(df_train.drop('Quantity',axis=1),df_train.pop('Quantity'), random_state=123, test_size=0.2)

# XGB Model
matrix_train = xgb.DMatrix(X_train, label = y_train)
matrix_test = xgb.DMatrix(X_test, label = y_test)

# Run XGB 
model = xgb.train(params={'objective':'reg:linear','eval_metric':'mae'}
                ,dtrain = matrix_train, num_boost_round = 500, 
                early_stopping_rounds = 20, evals = [(matrix_test,'test')],)

  train = ((train - train.mean()) / train.std())


UFuncTypeError: ufunc 'subtract' cannot use operands with types dtype('<M8[ns]') and dtype('float64')