In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib.dates as mdates
from matplotlib import colors

from datetime import datetime, timedelta
from time import time
from uuid import uuid4
from scipy.ndimage import convolve1d

import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the rest of the session, including deprecation warnings raised
# by the libraries imported above.
warnings.filterwarnings("ignore")

In [2]:
# Load the training observations from the working directory.
df = pd.read_csv("training_dataset.csv")
# Preview the first two rows to check that the columns parsed as expected.
df.head(2)

Unnamed: 0.1,Unnamed: 0,observation_id,observation_timestamp,hour_of_day,register__sales_dollar_amt_this_hour,register__payment_types_accepted,register__peak_sales_dollar_amt_per_hour,register__sales_dollar_amt_last_hour,register__sales_quantity_last_hour,register__sales_quantity_rescanned_frac,...,region__sales_dollar_amt_last_hour,region__returns_dollar_amt_last_hour,region__nighttime_open_registers,region__nighttime_service_time_per_customer,region__nighttime_sales_amt_per_hour,region__nighttime_returns_amt_per_hour,region__peak_sales_dollar_amt_per_hour,region__peak_sales_dollar_amt_per_hour_v2,region__peak_returns_dollar_amt_per_hour,region__peak_returns_dollar_amt_per_hour_v2
0,0,704d2a80-d52e-11ec-90ff-c7e6292284b3,2022-05-16 15:39:57,15,347.29,Cash+Credit,-0.7383,-0.127,-0.1993,-0.8299,...,-0.692,-0.4605,-0.518,-1.0062,-0.6462,-0.603,-0.4773,0.1748,-1.7951,-0.8284
1,1,1cacc1d0-e6ac-11ec-b65d-156af70ce36b,2022-06-07 21:52:23,21,361.59,Cash+Credit,0.6483,-0.0362,-0.0777,-0.7395,...,-0.6531,-0.4434,-0.6498,0.9031,-0.6493,-0.6106,0.4998,-0.9816,0.8939,-0.8614


In [3]:
# Summarise column names, non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18512 entries, 0 to 18511
Data columns (total 63 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Unnamed: 0                                   18512 non-null  int64  
 1   observation_id                               18512 non-null  object 
 2   observation_timestamp                        18512 non-null  object 
 3   hour_of_day                                  18512 non-null  int64  
 4   register__sales_dollar_amt_this_hour         18512 non-null  float64
 5   register__payment_types_accepted             18512 non-null  object 
 6   register__peak_sales_dollar_amt_per_hour     18512 non-null  float64
 7   register__sales_dollar_amt_last_hour         18512 non-null  float64
 8   register__sales_quantity_last_hour           18512 non-null  float64
 9   register__sales_quantity_rescanned_frac      18512 non-null  float64
 10

In [4]:
# Per-column count of missing entries in the training frame.
missing_values = df.isna().sum()

In [5]:
# Show only the columns that actually contain missing values.
print(missing_values.loc[lambda counts: counts > 0])

region__peak_sales_dollar_amt_per_hour_v2       198
region__peak_returns_dollar_amt_per_hour_v2    1900
dtype: int64


In [6]:
# Impute the two partially-missing "_v2" columns with their own column means.
# The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on a df[col] selection is chained assignment, which
# recent pandas deprecates and which does not update df under Copy-on-Write.
train_v2_cols = [
    'region__peak_sales_dollar_amt_per_hour_v2',
    'region__peak_returns_dollar_amt_per_hour_v2',
]
# DataFrame.fillna with a Series fills each column with the value at its label.
df[train_v2_cols] = df[train_v2_cols].fillna(df[train_v2_cols].mean())

In [7]:
# Sanity check: list the columns that still contain any missing value
# (an empty Index means the imputation covered everything).
df.columns[df.isna().any()]

Index([], dtype='object')

In [8]:
# List the columns stored with the generic object dtype.
df.dtypes.loc[lambda kinds: kinds == 'object']

observation_id                      object
observation_timestamp               object
register__payment_types_accepted    object
store__type_code                    object
dtype: object

In [9]:
# Frequency of each distinct observation timestamp.
df["observation_timestamp"].value_counts()

2022-05-19 20:42:58    2
2022-05-03 18:37:18    2
2022-05-14 04:33:50    2
2022-05-17 23:49:34    2
2022-05-13 16:04:43    2
                      ..
2022-06-06 16:45:27    1
2022-05-14 23:55:36    1
2022-06-01 21:37:47    1
2022-06-08 01:36:11    1
2022-06-22 20:02:32    1
Name: observation_timestamp, Length: 18460, dtype: int64

In [10]:
# Frequency of each accepted-payment-types category.
df["register__payment_types_accepted"].value_counts()

Cash+Credit          16310
Cash+Credit+Check     2133
Credit                  69
Name: register__payment_types_accepted, dtype: int64

In [11]:
# Frequency of each store type code.
df["store__type_code"].value_counts()

A    11691
C     5637
B      861
D      316
F        5
E        2
Name: store__type_code, dtype: int64

In [12]:
# Convert the timestamp strings to a nanosecond-resolution datetime column.
df["observation_timestamp"] = df["observation_timestamp"].astype('datetime64[ns]')

In [13]:
# Re-list the object-dtype columns now that the timestamp has been converted.
df.dtypes.loc[lambda kinds: kinds == 'object']

observation_id                      object
register__payment_types_accepted    object
store__type_code                    object
dtype: object

In [14]:
# Categorical columns to one-hot encode in both the training and test frames.
cols = [
    'register__payment_types_accepted',
    'store__type_code',
]

In [15]:
# One-hot encode the categorical columns of the training frame.
df_n = pd.get_dummies(data=df, columns=cols)

In [16]:
# Remove the row identifier and the leftover unnamed column from the
# encoded training frame (modifies df_n in place).
df_n.drop(columns=['observation_id', 'Unnamed: 0'], inplace=True)

In [17]:
# Confirm that no object-dtype columns remain after encoding and dropping.
df_n.dtypes.loc[lambda kinds: kinds == 'object']

Series([], dtype: object)

In [18]:
# (rows, columns) of the encoded training frame; the target column is still included.
df_n.shape

(18512, 68)

In [19]:
# Load the test observations from the working directory.
df_t = pd.read_csv("test_dataset.csv")
# Preview the first two rows to check that the columns parsed as expected.
df_t.head(2)

Unnamed: 0,observation_id,observation_timestamp,hour_of_day,register__payment_types_accepted,register__peak_sales_dollar_amt_per_hour,register__sales_dollar_amt_last_hour,register__sales_quantity_last_hour,register__sales_quantity_rescanned_frac,register__sales_payments_declined_frac,register__peak_returns_dollar_amt_per_hour,...,region__sales_dollar_amt_last_hour,region__returns_dollar_amt_last_hour,region__nighttime_open_registers,region__nighttime_service_time_per_customer,region__nighttime_sales_amt_per_hour,region__nighttime_returns_amt_per_hour,region__peak_sales_dollar_amt_per_hour,region__peak_sales_dollar_amt_per_hour_v2,region__peak_returns_dollar_amt_per_hour,region__peak_returns_dollar_amt_per_hour_v2
0,3f483640-bc52-11ec-b736-8544dc068949,15/04/2022 00:23,0,Cash+Credit,0.5693,-0.1253,-0.1489,-0.8176,-0.1066,0.9103,...,-0.6373,-0.3791,-0.5581,-0.7148,-0.6487,-0.6091,1.002,-0.5479,0.8761,
1,a88b0bb0-d2ae-11ec-bd0e-f5a7c7895456,13/05/2022 11:20,11,Cash+Credit,0.3084,-0.127,-0.1986,1.6562,-0.1247,-0.5251,...,-0.6483,-0.4419,-0.6498,0.8867,-0.6493,-0.6106,-1.9779,-0.9816,-0.1311,-0.8614


In [20]:
# Per-column count of missing entries in the test frame.
missing_values_t = df_t.isna().sum()

In [21]:
# Show only the test columns that actually contain missing values.
print(missing_values_t.loc[lambda counts: counts > 0])

region__peak_sales_dollar_amt_per_hour_v2      100
region__peak_returns_dollar_amt_per_hour_v2    568
dtype: int64


In [22]:
# Impute the test set's missing "_v2" values with the TRAINING column means,
# so the test rows receive the same fill values as the data the model is
# fitted on rather than statistics computed from the test set itself.
# The result is assigned back instead of using fillna(inplace=True) on a
# df_t[col] selection, which is chained assignment: deprecated in recent
# pandas and ineffective under Copy-on-Write.
test_v2_cols = [
    'region__peak_sales_dollar_amt_per_hour_v2',
    'region__peak_returns_dollar_amt_per_hour_v2',
]
# mean() skips NaN, so this yields the training mean of the observed values.
df_t[test_v2_cols] = df_t[test_v2_cols].fillna(df[test_v2_cols].mean())

In [23]:
# Sanity check: list the test columns that still contain any missing value
# (an empty Index means the imputation covered everything).
df_t.columns[df_t.isna().any()]

Index([], dtype='object')

In [24]:
# List the test columns stored with the generic object dtype.
df_t.dtypes.loc[lambda kinds: kinds == 'object']

observation_id                      object
observation_timestamp               object
register__payment_types_accepted    object
store__type_code                    object
dtype: object

In [25]:
# The test file stores timestamps day-first without seconds (for example
# "13/05/2022 11:20"), unlike the ISO-style strings in the training file.
# Parse with an explicit format so ambiguous dates such as 05/04/2022 cannot
# be read month-first; a value that does not match raises instead of being
# silently misparsed.
df_t["observation_timestamp"] = pd.to_datetime(
    df_t["observation_timestamp"], format="%d/%m/%Y %H:%M"
)

In [26]:
# One-hot encode the same categorical columns in the test frame.
df_nt = pd.get_dummies(data=df_t, columns=cols)

In [27]:
# Remove the row identifier from the encoded test frame (modifies df_nt in place).
# NOTE(review): the training frame also drops 'Unnamed: 0' but this frame
# keeps it; confirm the feature columns line up with what the model expects.
df_nt.drop(columns=['observation_id'], inplace=True)

In [28]:
# Confirm that no object-dtype columns remain in the encoded test frame.
df_nt.dtypes.loc[lambda kinds: kinds == 'object']

Series([], dtype: object)

In [29]:
# (rows, columns) of the encoded test frame.
df_nt.shape

(5179, 67)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
# Separate the regression target from the feature matrix.
target_column = 'register__sales_dollar_amt_this_hour'
y = df_n[target_column]
X = df_n.drop(columns=[target_column])

In [32]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

You must use error metrics specifically designed for evaluating predictions made on regression problems.

In [33]:
from sklearn.metrics import make_scorer, r2_score 
from sklearn.model_selection import cross_validate

In [34]:
def scores(model, title = "Default"):
    """Fit ``model`` on the training split and report its R^2 on the hold-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    title
        Label printed in front of the score so that results from several
        models can be told apart.

    Returns
    -------
    The value of ``r2_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    sc = r2_score(y_test, pred)
    # Previously `title` was accepted but ignored; include it in the report.
    print("{} r2 score: {}".format(title, sc))
    # Return the score so callers can store or compare it.
    return sc

In [35]:
from catboost import CatBoostRegressor
# Fit a CatBoost regressor with n_estimators=100 and logging silenced,
# then report its score on the hold-out split.
catbr = CatBoostRegressor(n_estimators=100, verbose=0)
scores(catbr, title="Cat Boosting")

r2 score: 0.9468827190522411


In [36]:
# Align the test features to the exact columns, in the exact order, that the
# model was fitted on. The encoded test frame is not guaranteed to match:
# it still carries 'Unnamed: 0' (dropped from the training frame), and
# get_dummies only creates indicator columns for categories present in each
# file. reindex discards columns the model never saw and adds any missing
# ones filled with 0.
# NOTE(review): 0 is the right fill for an absent one-hot category; confirm
# that no non-indicator feature is missing from the test file.
test_features = df_nt.reindex(columns=X_train.columns, fill_value=0)

target = catbr.predict(test_features)

# One row per test observation, in the original test-file order.
d = pd.DataFrame({'prediction': target})
d.to_csv('submission.csv', index=False)

In [37]:
# Preview the first rows of the submission frame.
d.head()

Unnamed: 0,prediction
0,1726.509946
1,363.218949
2,1397.184138
3,242.177765
4,370.895325
