In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
import datetime

In [53]:
filename = '../inputs/Maltas.csv'
df = pd.read_csv(filename)
df =df[df['Quantity'] > 0]

new_matrix_test = df.drop_duplicates(['Account_id',	'Product_id', 'Category'])



In [64]:
new_matrix_test['Date'] = (pd.to_datetime(df.Date.max()) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_matrix_test['Date'] = (pd.to_datetime(df.Date.max()) + datetime.timedelta(days=1)).strftime('%Y-%m-%d')


In [69]:
new_matrix_test.head()

Unnamed: 0,Date,Account_id,Product_id,Category,Quantity
0,2022-07-31,33237477,15354,Maltas,5
1,2022-07-31,33236337,15354,Maltas,2
2,2022-07-31,38169687,11910,Maltas,11
3,2022-07-31,38169687,15354,Maltas,1
4,2022-07-31,38169666,8270,Maltas,20


In [73]:
new_matrix_test = new_matrix_test.drop(columns='Quantity')

In [4]:
train, test = train_test_split(df, test_size=0.30)
train = train.sort_values('Date')
test = test.sort_values('Date')

In [5]:
train.shape

(14117331, 5)

In [6]:
test.shape

(6050285, 5)

In [7]:
def date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Import the DataFrame to be able to extract the data characteristics and transform

    Arguments:
        df: data frame

    Return:
        Return a DataFrame with its date characteristics
    """

    df['date'] = pd.to_datetime(df['Date'])
    df['year'] = df.date.dt.year
    df['month'] = df.date.dt.month
    df['day'] = df.date.dt.day
    df['dayofyear'] = df.date.dt.dayofyear
    df['dayofweek'] = df.date.dt.dayofweek
    df['weekofyear'] = df.date.dt.isocalendar().week
    
    # Additionnal Data Features
    df['day^year'] = np.log((np.log(df['dayofyear'] + 1)) ** (df['year'] - 2000))
    
    # Drop date
    df.drop('Date', axis=1, inplace=True)
    
    return df

# Dates Features for Train, Test
train = date_features(train)
test = date_features(test)

In [74]:
new_matrix_test = date_features(new_matrix_test)

In [8]:
train.head()

Unnamed: 0,Account_id,Product_id,Category,Quantity,date,year,month,day,dayofyear,dayofweek,weekofyear,day^year
24973,34254285,8260,Maltas,1,2022-01-03,2022,1,3,3,0,1,7.185954
26024,34259529,15354,Maltas,20,2022-01-03,2022,1,3,3,0,1,7.185954
26326,33226350,8270,Maltas,1,2022-01-03,2022,1,3,3,0,1,7.185954
12407,33253035,15354,Maltas,50,2022-01-03,2022,1,3,3,0,1,7.185954
19083,33251433,11910,Maltas,4,2022-01-03,2022,1,3,3,0,1,7.185954


In [9]:
test.head()

Unnamed: 0,Account_id,Product_id,Category,Quantity,date,year,month,day,dayofyear,dayofweek,weekofyear,day^year
22716,33221916,15354,Maltas,1,2022-01-03,2022,1,3,3,0,1,7.185954
7691,33224859,11910,Maltas,3,2022-01-03,2022,1,3,3,0,1,7.185954
11286,36839568,11910,Maltas,4,2022-01-03,2022,1,3,3,0,1,7.185954
25101,33231072,8262,Maltas,1,2022-01-03,2022,1,3,3,0,1,7.185954
30034,33238242,8262,Maltas,1,2022-01-03,2022,1,3,3,0,1,7.185954


In [10]:
# Daily Average, Monthly Average for train
train['daymonth_avg']  = train.groupby(['Product_id','Account_id', 'dayofweek'])['Quantity'].transform('mean')
train['monthly_avg'] = train.groupby(['Product_id','Account_id','month'])['Quantity'].transform('mean')
train = train.dropna()

# Average sales for Day_of_week = d per 'Category', 'Account_id'
daymonth_avg = train.groupby(['Product_id','Account_id','dayofweek'])['Quantity'].mean().reset_index()

# Average sales for Month = m per 'Category','Account_id' 
monthly_avg = train.groupby(['Product_id','Account_id','month'])['Quantity'].mean().reset_index()



# Merge Test with Daily Avg, Monthly Avg
def merge(df1: pd.DataFrame, df2: pd.DataFrame, col: list, col_name: str) -> pd.DataFrame:
    """This is function to do merge whit DataFrames and strings what representation a new DataFrame

    Args: 
        df1: pd.DataFrame is a df of test
        df2: pd.DataFrame is a df of average sales for day of week or month
        col: list with features of test
        col_name: str with name of column per feature of week or month 
    Return: 
          The return value is a pd.DataFrame with features news
    """
    
    df1 =pd.merge(df1, 
                  df2, 
                  how='left', 
                  on=None, 
                  left_on=col, 
                  right_on=col,
                  left_index=False, 
                  right_index=False, 
                  sort=True,
                  copy=True, 
                  indicator=False)
    
    df1 = df1.rename(columns={'sales':col_name})
    return df1


In [11]:
# Add Daily_avg and Monthly_avg features to test 
test = merge(test, daymonth_avg, ['Product_id','Account_id','dayofweek'],'daymonth_avg')
test = merge(test, monthly_avg, ['Product_id','Account_id','month'],'monthly_avg')

In [12]:
# Sales Rolling mean sequence per item 

rolling_10 = train.groupby(['Product_id'])['Quantity'].rolling(3).mean().reset_index().drop('level_1', axis=1)
train['rolling_mean'] = rolling_10['Quantity'] 

In [13]:

rolling_10.head()

Unnamed: 0,Product_id,Quantity
0,8260,
1,8260,
2,8260,1.333333
3,8260,6.0
4,8260,7.333333


In [14]:
# 90 last days of training rolling mean sequence added to test data
rolling_last90 = train.groupby(['Product_id','Quantity'])['rolling_mean'].tail(90).copy()
test['rolling_mean'] = rolling_last90.reset_index().drop('index', axis=1)

In [15]:
rolling_last90.head()

13467    1.666667
33546    2.000000
26972    7.333333
33164    2.000000
34061    7.666667
Name: rolling_mean, dtype: float64

In [16]:

# Shifting rolling mean 3 months
train['rolling_mean'] = train.groupby(['Product_id'])['rolling_mean'].shift(90) # Create a feature with rolling mean of day - 90

In [17]:
test.head()

Unnamed: 0,Account_id,Product_id,Category,Quantity_x,date,year,month,day,dayofyear,dayofweek,weekofyear,day^year,Quantity_y,Quantity,rolling_mean
0,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,1.666667
1,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,2.0
2,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,7.333333
3,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,2.0
4,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,7.666667


In [18]:
test = test.fillna(0)
test.head()

Unnamed: 0,Account_id,Product_id,Category,Quantity_x,date,year,month,day,dayofyear,dayofweek,weekofyear,day^year,Quantity_y,Quantity,rolling_mean
0,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,1.666667
1,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,2.0
2,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,7.333333
3,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,2.0
4,33217773,8260,Maltas,20,2022-01-04,2022,1,4,4,1,1,10.46947,21.123552,18.240223,7.666667


In [19]:
# Clean features highly correlated to each others
for df in [train, test]:
    df.drop(['dayofyear', 
             'weekofyear',
             'day',
             'month',
             'Category',
             'Account_id'],
                axis=1, 
                inplace=True)

In [20]:
# Features Scaling (except sales)
sales_series, id_series = train['Quantity'], test['Product_id']

In [23]:
# Retrieve actual Sales values and ID
train['Quantity'] = sales_series
test['Product_id'] = id_series

In [24]:
# Training Data
X_train = train.drop('Quantity', axis=1).dropna()
y_train = train['Quantity']

In [25]:
# Test Data
test.sort_values(by=['Product_id'], inplace=True)
X_test = test.drop('Product_id', axis=1)

In [36]:
#df = train
df_train = train.copy()


In [37]:
df_train = df_train.drop(columns='date').fillna(0)

In [38]:
# Train Test Split
X_train , X_test ,y_train, y_test = train_test_split(df_train.drop('Quantity',axis=1),df_train.pop('Quantity'), random_state=123, test_size=0.2)

In [46]:
X_test

Unnamed: 0,Product_id,year,dayofweek,day^year,daymonth_avg,monthly_avg,rolling_mean
1470026,11910,2022,4,21.917036,2.802469,5.000000,8.666667
19134057,8270,2022,2,36.577186,3.039013,2.535211,0.000000
2405376,11910,2022,5,25.141310,3.000000,3.000000,1.000000
10958205,11910,2022,3,33.829808,4.367816,3.878125,10.000000
17114772,8260,2022,4,35.971890,6.062136,10.000000,0.000000
...,...,...,...,...,...,...,...
1406214,15354,2022,4,21.917036,3.615385,2.562500,10.000000
340294,15354,2022,3,14.646056,2.403329,3.975610,2.000000
16457261,8262,2022,3,35.762929,3.944118,5.000000,0.000000
18432854,15354,2022,5,36.333589,7.766180,6.363636,0.000000


In [47]:
y_test

1470026      5
19134057     2
2405376      3
10958205     2
17114772    10
            ..
1406214      3
340294       3
16457261     5
18432854    10
17654271    60
Name: Quantity, Length: 2823467, dtype: int64

In [39]:
# XGB Model
matrix_train = xgb.DMatrix(X_train, label = y_train)
matrix_test = xgb.DMatrix(X_test, label = y_test)

In [40]:

# Run XGB 
model = xgb.train(params={'objective':'reg:linear','eval_metric':'mae'}
                ,dtrain = matrix_train, num_boost_round = 500, 
                early_stopping_rounds = 20, evals = [(matrix_test,'test')],)

[0]	test-mae:6.78741
[1]	test-mae:5.38855
[2]	test-mae:4.60710
[3]	test-mae:4.18428
[4]	test-mae:3.93411
[5]	test-mae:3.77558
[6]	test-mae:3.69255
[7]	test-mae:3.63781
[8]	test-mae:3.60706
[9]	test-mae:3.57620
[10]	test-mae:3.55713
[11]	test-mae:3.54388
[12]	test-mae:3.53632
[13]	test-mae:3.52197
[14]	test-mae:3.52070
[15]	test-mae:3.51887
[16]	test-mae:3.51681
[17]	test-mae:3.51077
[18]	test-mae:3.50775
[19]	test-mae:3.50658
[20]	test-mae:3.49163
[21]	test-mae:3.48372
[22]	test-mae:3.47817
[23]	test-mae:3.46637
[24]	test-mae:3.45843
[25]	test-mae:3.44600
[26]	test-mae:3.44380
[27]	test-mae:3.43406
[28]	test-mae:3.42451
[29]	test-mae:3.41058
[30]	test-mae:3.40052
[31]	test-mae:3.39484
[32]	test-mae:3.38824
[33]	test-mae:3.38247
[34]	test-mae:3.37569
[35]	test-mae:3.37134
[36]	test-mae:3.37190
[37]	test-mae:3.36611
[38]	test-mae:3.36300
[39]	test-mae:3.35617
[40]	test-mae:3.35111
[41]	test-mae:3.34605
[42]	test-mae:3.33872
[43]	test-mae:3.33151
[44]	test-mae:3.32704
[45]	test-mae:3.3220

In [44]:
model.predict(matrix_test)

array([ 4.45919  ,  2.5277567,  3.1219506, ...,  4.8153048,  7.0646076,
       35.452053 ], dtype=float32)

In [87]:
model.predict()

TypeError: Booster.predict() missing 1 required positional argument: 'data'