# Load libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing


from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score,accuracy_score,log_loss
from xgboost import XGBRegressor
import lightgbm as lgb
#optimizer 

from functools import partial
import optuna



# Load Data

In [None]:
# Read the competition train/test CSVs, using `row_id` as the index of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='row_id')
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
    )
)

In [None]:
# Show the first rows of the training frame as a quick sanity check of the load.
train_df.head()

# Exploratory Data Analysis and Preprocessing

In [None]:
# Convert the `date` column of both frames from strings to datetime values.
for frame in (train_df, test_df):
    frame['date'] = pd.to_datetime(frame['date'], format="%Y-%m-%d")

In [None]:
# GDP per capita is widely discussed for this competition, so it is added as a feature.
# https://www.macrotrends.net/countries/NOR/norway/gdp-per-capita
# Each table maps year -> [GDP per capita in USD, growth rate].

Sweden_ec = {
    2015: [51545, -.1412],
    2016: [51965, .0081],
    2017: [53792, .0351],
    2018: [54589, .0148],
    2019: [51687, -.0532],
}

Finland_ec = {
    2015: [42802, -.1495],
    2016: [43814, .0236],
    2017: [46412, .0593],
    2018: [50038, .0781],
    2019: [48712, -.0265],
}

Norway_ec = {
    2015: [74356, -.2336],
    2016: [70461, -.0524],
    2017: [75497, 0.0715],
    2018: [82268, .0897],
    2019: [75826, -.0783],
}


In [None]:
# Look up each training row's [GDP per capita, growth rate] pair by country and year.
# Any country other than Sweden or Finland falls through to the Norway table.
ec_tables = {'Sweden': Sweden_ec, 'Finland': Finland_ec}
train_ec = [
    ec_tables.get(country, Norway_ec)[day.year]
    for day, country in zip(train_df.date, train_df.country)
]
train_df['GDPperCapita'] = [pair[0] for pair in train_ec]
train_df['GrowthRate'] = [pair[1] for pair in train_ec]

In [None]:
# Look up each test row's [GDP per capita, growth rate] pair by country and year.
# Any country other than Sweden or Finland falls through to the Norway table.
ec_tables = {'Sweden': Sweden_ec, 'Finland': Finland_ec}
test_ec = [
    ec_tables.get(country, Norway_ec)[day.year]
    for day, country in zip(test_df.date, test_df.country)
]
test_df['GDPperCapita'] = [pair[0] for pair in test_ec]
test_df['GrowthRate'] = [pair[1] for pair in test_ec]

In [None]:
from sklearn import preprocessing

# Standardise the two economy columns; the scaler is fitted on the training
# frame only and then applied unchanged to the test frame.
economy_cols = ['GDPperCapita','GrowthRate']
scaler = preprocessing.StandardScaler()
scaler.fit(train_df[economy_cols])

train_df[economy_cols] = scaler.transform(train_df[economy_cols])
test_df[economy_cols] = scaler.transform(test_df[economy_cols])

In [None]:
# Add day of week as an integer: Monday -> 0 ... Sunday -> 6

dayOfWeek = {
    day_name: day_number
    for day_number, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}
for frame in (train_df, test_df):
    frame['day_of_week'] = frame['date'].dt.day_name().map(dayOfWeek)

In [None]:
# 1 for Saturday (5) / Sunday (6), otherwise 0.
for frame in (train_df, test_df):
    frame['weekend'] = frame['day_of_week'].isin([5, 6]).astype(int)


In [None]:
# Mean `num_sold` over all rows sharing the same date.
daily_average_sale = train_df.groupby('date')['num_sold'].mean()

In [None]:
# Bar chart of the per-date average sales computed above.
fig, ax = plt.subplots(figsize=(50,10))
ax.bar(
    daily_average_sale.index,
    daily_average_sale.values,
    color=(0.1,0.1,0.1,0.1),
    edgecolor='blue',
)

ax.set_title('Daily Average Sales')
ax.set_ylabel('Number of Sales')

In [None]:
objects =['country','store','product']

# Print the distinct values of each categorical column: train first, then a
# blank separator, then test.
for frame_no, frame in enumerate((train_df, test_df)):
    if frame_no == 1:
        print('\n')
    for column in objects:
        print(frame[column].unique())


In [None]:
# Training rows for the KaggleMart store in Finland.
is_finland_kagglemart = (train_df.country == 'Finland') & (train_df.store == 'KaggleMart')
Finland_KaggleMart = train_df[is_finland_kagglemart]

In [None]:
# Display the Finland / KaggleMart subset selected in the previous cell.
Finland_KaggleMart

In [None]:
def make_sales_graphs(country,store,product):
    """Show a bar chart of `num_sold` per date for one country/store/product.

    Rows are read from the notebook-level `train_df`. The figure is displayed
    with `plt.show()`; nothing is returned.
    """
    selected = (
        (train_df.country == country)
        & (train_df.store == store)
        & (train_df['product'] == product)
    )
    sales = train_df[selected]

    fig = plt.figure(figsize=(50,10))
    plt.bar(sales.date, sales.num_sold)
    heading = product + ' Sales - ' + country +' ' + store + ' (2015-2018)'
    fig.suptitle(heading, fontsize=20)
    plt.show()
    
    

In [None]:
# Example: a single chart for Kaggle Mug sales at KaggleMart in Finland.
make_sales_graphs('Finland','KaggleMart','Kaggle Mug')

In [None]:
# Category values used to enumerate the per-country / store / product charts below.
countries = [
    'Finland',
    'Norway',
    'Sweden',
]
stores = [
    'KaggleMart',
    'KaggleRama',
]
products = [
    'Kaggle Mug',
    'Kaggle Hat',
    'Kaggle Sticker',
]

# Finland Graphs

In [None]:
# One chart per (store, product) pair for Finland, stores in the outer order.
for store_name, product_name in [(s, p) for s in stores for p in products]:
    make_sales_graphs('Finland', store_name, product_name)

# Insights from Finland

* There are sales peaks at the end of every year for all three items.
* Seasons affect Hat sales significantly and Mug sales slightly. Sticker sales seem stable over the year.
* Sales of all three items seem to be affected by the day of the week.

****************************************************************************************************

# Norway Graphs

In [None]:
# One chart per (store, product) pair for Norway, stores in the outer order.
for store_name, product_name in [(s, p) for s in stores for p in products]:
    make_sales_graphs('Norway', store_name, product_name)

# Insight from Norway

The Norwegian market shows the same tendencies as Finland.
*****************************************************************

# Sweden Graphs

In [None]:
# One chart per (store, product) pair for Sweden, stores in the outer order.
for store_name, product_name in [(s, p) for s in stores for p in products]:
    make_sales_graphs('Sweden', store_name, product_name)

# Insight from Sweden

The Swedish market shows the same tendencies as Finland.
*****************************************************************

# The three countries show the same tendencies in their sales
* There are sales peaks at the end of every year for all three items.
* Seasons affect Hat sales significantly and Mug sales slightly. Sticker sales seem stable over the year.
* Sales of all three items seem to be affected by the day of the week.

****************************************************************************************************

# Day of Week affects the Sales?

In [None]:
# Average `num_sold` per weekday (0 = Monday ... 6 = Sunday).
# Only `num_sold` is aggregated: at this point train_df still holds string and
# datetime columns, and averaging those raises a TypeError on pandas >= 2.0.
daily_sale = train_df.groupby(by='day_of_week')[['num_sold']].mean()
weekday_positions = np.arange(len(daily_sale))
plt.bar(weekday_positions,daily_sale['num_sold'].to_list(),color=(0.1,0.1,0.1,0.1),edgecolor='blue')

plt.xticks(weekday_positions,['Mon','Tue','Wed','Thu','Fri','Sat','Sun'])

plt.title('Sales by Day of Week')
plt.ylabel('Number of Sales')


<font size =5 >More customers come to the stores on weekends. </font>
************************************************************************

# Month affects the Sales?

In [None]:
# Split the date into day-of-month, month and year columns on both frames.
for frame in (train_df, test_df):
    date_parts = frame['date'].dt
    frame['day'] = date_parts.day
    frame['month'] = date_parts.month
    frame['year'] = date_parts.year

In [None]:
# Average `num_sold` per calendar month.
# Only `num_sold` is aggregated: train_df still holds string and datetime
# columns here, and averaging those raises a TypeError on pandas >= 2.0.
monthly_sale = train_df.groupby(by='month')[['num_sold']].mean()
month_positions = np.arange(len(monthly_sale))
plt.bar(month_positions,monthly_sale['num_sold'].to_list(),color=(0.1,0.1,0.1,0.1),edgecolor='blue')

plt.xticks(month_positions,range(1,13))

plt.title('Sales by Month')
plt.ylabel('Number of Sales')

# Top Sales Day

* <font size =4>
 From 12/27 to 1/3 is the busiest season for the shops  
</font>

In [None]:
# Ten best-selling (month, day) combinations over all years, then the ten
# best-selling dates within each training year.
# `num_sold` is selected before summing so the string/datetime columns are not
# aggregated (summing a datetime column raises on pandas >= 2.0).
print(train_df.groupby(by=['month','day'])['num_sold'].sum().sort_values(ascending=False)[:10])
for sales_year in (2015, 2016, 2017, 2018):
    sales_in_year = train_df[train_df['year'] == sales_year]
    print(sales_in_year.groupby(by='date')['num_sold'].sum().sort_values(ascending=False)[:10])

In [None]:
# Ten best-selling (month, day) combinations for each product.
# `num_sold` is selected before summing so the string/datetime columns are not
# aggregated (summing a datetime column raises on pandas >= 2.0).
product_headings = (
    ('Kaggle Hat', 'Hat Top Ten Sales Day'),
    ('Kaggle Mug', 'Mug Top Ten Sales Day'),
    ('Kaggle Sticker', 'Sticker Top Ten Sales Day'),
)
for section_no, (product_name, heading) in enumerate(product_headings):
    if section_no:
        print("\n")
    print(heading)
    product_sales = train_df[train_df['product'] == product_name]
    print(product_sales.groupby(by=['month','day'])['num_sold'].sum().sort_values(ascending=False)[:10])



* <font size =4>
 Each item sells well during the peak period. Hats also sell well in Spring time (August and May)
</font>

In [None]:
# Scratch check: does the zero-padded 'MM-DD' string of the row labelled 0 sort before "01-02"?
train_df.date[0].strftime('%m-%d') <"01-02"

In [None]:
# Flag the peak period 27 December - 3 January (inclusive) with 1, all other days with 0.
# The comparison is lexicographic on zero-padded 'MM-DD' strings, so both bounds
# must be zero-padded as well ('01-03', not '1-3').
for frame in (train_df, test_df):
    month_day = frame['date'].dt.strftime('%m-%d')
    frame['busiest'] = ((month_day >= '12-27') | (month_day <= '01-03')).astype(int)

**********************************************************************

In [None]:
#Add steps
#https://stackoverflow.com/questions/60252983/adding-new-step-value-column-for-timeseries-data-with-multiple-records-per-tim
# `step` is a running count of date changes: all rows of the first date get 0,
# rows of the next distinct date get 1, and so on. Rows are expected to be
# ordered by date. The first difference is NaT, which compares as "not greater
# than zero", so no manual patching of the first element is needed.
train_df['step'] = train_df['date'].diff().gt(pd.Timedelta(0)).cumsum()

In [None]:
# Running count of date changes within the test frame (0 for its first date) ...
test_step = test_df['date'].diff().gt(pd.Timedelta(0)).cumsum()
# ... shifted so the numbering continues after the last training step instead
# of restarting at 0.
# NOTE(review): assumes the test period starts right after the training period — confirm.
test_df['step'] = test_step + train_df['step'].max() + 1

In [None]:


# Calendar features derived from `date` for both frames.
# `week` is the ISO week number; `isocalendar().week` replaces the
# `weekofyear` attribute, which is no longer available in pandas >= 2.0.
for frame in (train_df, test_df):
    date_parts = frame['date'].dt
    frame['year'] = date_parts.year
    frame['day'] = date_parts.day
    frame['dayofyear'] = date_parts.dayofyear
    frame['Quarter'] = date_parts.quarter
    frame['week'] = date_parts.isocalendar().week.astype(int)

In [None]:
import datetime
import holidays

# Country List:['Finland' 'Norway' 'Sweden']
holiday_FI = holidays.CountryHoliday('FI', years=[2015, 2016, 2017, 2018, 2019])
holiday_NO = holidays.CountryHoliday('NO', years=[2015, 2016, 2017, 2018, 2019])
holiday_SE = holidays.CountryHoliday('SE', years=[2015, 2016, 2017, 2018, 2019])
# Lookup from the country names used in the data to their holiday tables.
dictionaries ={'Finland':holiday_FI,'Norway':holiday_NO,'Sweden':holiday_SE}

#add some more celebration days
# Finland: second Sunday of May.
holiday_FI.update({
    datetime.date(2015,5,10):'Mothers Day',
    datetime.date(2016,5,8):'Mothers Day',
    datetime.date(2017,5,14):'Mothers Day',
    datetime.date(2018,5,13):'Mothers Day',
    datetime.date(2019,5,12):'Mothers Day',
})
# Sweden: last Sunday of May.
holiday_SE.update({
    datetime.date(2015,5,31):'Mothers Day',
    datetime.date(2016,5,29):'Mothers Day',
    datetime.date(2017,5,28):'Mothers Day',
    datetime.date(2018,5,27):'Mothers Day',
    datetime.date(2019,5,26):'Mothers Day',
})
# Norway: second Sunday of February (every date listed here is a Sunday).
holiday_NO.update({
    datetime.date(2015,2,8):'Mothers Day',
    datetime.date(2016,2,14):'Mothers Day',
    datetime.date(2017,2,12):'Mothers Day',
    datetime.date(2018,2,11):'Mothers Day',
    datetime.date(2019,2,10):'Mothers Day',
})

In [None]:
def add_dic(df):
    """Add a `holiday_name` column to `df` and return the same frame.

    For every row, the row's date is looked up in the holiday table of the
    row's country (taken from the notebook-level `dictionaries` mapping).
    Rows whose date is not in that table get the label 'Not Holidays'.

    Parameters
    ----------
    df : DataFrame with a datetime `date` column and a `country` column whose
        values are keys of `dictionaries`.

    Returns
    -------
    The input frame, modified in place.

    Raises
    ------
    KeyError if a country has no entry in `dictionaries`.
    """
    # Snapshot each holiday table into a plain dict once, so that every row is
    # a constant-time lookup instead of rebuilding a list of all keys per row.
    lookup = {country: dict(table) for country, table in dictionaries.items()}
    df['holiday_name'] = [
        lookup[country].get(day.date(), 'Not Holidays')
        for day, country in zip(df.date, df.country)
    ]
    return df
    
    

In [None]:
# Attach `holiday_name` to both frames (add_dic modifies and returns the frame it is given).
train_df = add_dic(train_df)
test_df = add_dic(test_df)

In [None]:
#https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298990
# As discussed there, add some other celebration days.

#Valentine's Day
#New Year's Eve
#Father's Day

# Fixed-date celebrations: the label overrides whatever holiday name the row had.
fixed_date_events = (('02-14', 'Valentine'), ('12-31', 'New Year Eve'))
for month_day, label in fixed_date_events:
    for frame in (train_df, test_df):
        frame['holiday_name'] = [
            label if day.strftime('%m-%d') == month_day else current
            for day, current in zip(frame.date, frame.holiday_name)
        ]

In [None]:
# Father's Day dates, one per year for 2015-2019, written as 'YYYY-M-D' strings
# (note that the day in '2015-11-8' is not zero-padded).
Fathers = ['2015-11-8','2016-11-13','2017-11-12','2018-11-11','2019-11-10']

In [None]:
# Label Father's Day in both frames. The date strings are parsed into
# timestamps and compared as dates, so the match does not depend on string
# formatting (two-digit years or zero-padding).
fathers_days = pd.to_datetime(Fathers, format='%Y-%m-%d')
for frame in (train_df, test_df):
    frame['holiday_name'] = frame['holiday_name'].mask(
        frame['date'].isin(fathers_days), 'Fathers Day'
    )


In [None]:
# 1 for any row carrying a holiday/celebration label, 0 for 'Not Holidays'.
for frame in (train_df, test_df):
    frame['is_holiday'] = frame['holiday_name'].ne('Not Holidays').astype(int)

In [None]:
# Collapse the combined label into the plain holiday name wherever it occurs in the test frame.
# NOTE(review): presumably needed because the combined label is absent from train — confirm.
test_df = test_df.replace({"Trettondedag jul, Söndag": "Trettondedag jul"})

In [None]:
# 1 for rows in January, April or December, otherwise 0.
for frame in (train_df, test_df):
    frame['holiday_month'] = frame['month'].isin([1, 4, 12]).astype(int)

In [None]:
# Count weeks from each New Year's Day: days 1-7 -> week 1, days 8-14 -> week 2, ...
# (`dayofyear` is 1-based, hence the `- 1` before the integer division.)
train_df['week2'] = (train_df['dayofyear'] - 1) // 7 + 1
test_df['week2'] = (test_df['dayofyear'] - 1) // 7 + 1

In [None]:
categories = ['country','store','product','holiday_name']
# Integer-encode each categorical column: a fresh encoder is fitted on the
# training values and the same fitted encoder is applied to the test values.
for column in categories:
    encoder = preprocessing.LabelEncoder().fit(train_df[column])
    train_df[column] = encoder.transform(train_df[column])
    test_df[column] = encoder.transform(test_df[column])

# Making Model and Predict

In [None]:
def smape(a, f):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(2 * |f - a| / (|a| + |f|)) * 100`` element-wise.

    Parameters
    ----------
    a : array-like of actual values.
    f : array-like of forecast values, same length as `a`.

    Returns
    -------
    float in [0, 200]. Pairs where both values are 0 contribute 0 (a perfect
    forecast) instead of producing NaN from 0/0. Empty input yields NaN.
    """
    # Plain arrays: the comparison is positional, never aligned on a pandas index.
    actual = np.asarray(a, dtype=float)
    forecast = np.asarray(f, dtype=float)

    scale = np.abs(actual) + np.abs(forecast)
    error = 2.0 * np.abs(forecast - actual)
    # Divide only where the denominator is non-zero; the remaining slots stay 0.
    ratio = np.divide(error, scale, out=np.zeros_like(scale), where=scale != 0)
    return float(np.mean(ratio) * 100)

In [None]:
# Quadratic trend term: the square of the `step` counter.
for frame in (train_df, test_df):
    frame['step^2'] = frame['step'].pow(2)

In [None]:
# Inputs and target for the baseline linear model: category codes plus the trend terms.
baseline_cols = ['country','store','product','step','step^2']
features = train_df[baseline_cols]
targets = train_df['num_sold']

In [None]:
# Hold out the last 5% of rows for validation (shuffle=False keeps row order).
baseline_split = train_test_split(features, targets, test_size=0.05, shuffle=False)
X_train, X_val, y_train, y_val = baseline_split


In [None]:
# Baseline: ordinary least squares on the current training split.
model2 = LinearRegression().fit(X_train, y_train)
model2

In [None]:
# SMAPE (in percent) of the baseline linear model on the held-out rows.
smape(y_val,model2.predict(X_val))

In [None]:
# Single time series: rows whose encoded country, store and product are all 0.
first_series_mask = train_df[['country', 'store', 'product']].eq(0).all(axis=1)
Finland_0_1 = train_df[first_series_mask]

In [None]:
# List the columns available for the single-series experiment.
Finland_0_1.columns

In [None]:
# Sales of the selected single series over the `step` counter.
plt.plot(Finland_0_1['step'], Finland_0_1['num_sold'])


In [None]:
# Calendar and trend features for the single-series model; the target is `num_sold`.
single_series_cols = [
    'day_of_week', 'weekend', 'step', 'year', 'day', 'dayofyear',
    'Quarter', 'week', 'week2', 'month', 'holiday_month', 'step^2', 'busiest',
]
features = Finland_0_1[single_series_cols]

targets = Finland_0_1['num_sold']

In [None]:
# Hold out the last 5% of the single-series rows (shuffle=False keeps row order).
single_series_split = train_test_split(features, targets, test_size=0.05, shuffle=False)
X_train, X_val, y_train, y_val = single_series_split

In [None]:
# Linear model on the single-series features, scored with SMAPE on the held-out rows.
model1 = LinearRegression().fit(X_train, y_train)
predictions = model1.predict(X_val)
smape(y_val, predictions)

In [None]:
# Actual (drawn first) vs. predicted (drawn second) sales on the held-out rows.
val_index = X_val.index
plt.plot(val_index, y_val)
plt.plot(val_index, predictions)


In [None]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree

# NOTE(review): this rebinds `xgb` from the xgboost module (imported at the top
# of this cell) to the fitted regressor; later cells rely on `xgb` being the
# model, and the cell only re-runs cleanly because it re-imports the module.
xgb = xgb.XGBRegressor(n_estimators=1000)
# Both splits are passed as eval sets: index 0 is train, index 1 is validation.
xgb.fit(X_train, y_train,
        eval_set=[(X_train,y_train),(X_val, y_val)],
        early_stopping_rounds=25,
       verbose=False)

In [None]:
# Predicted (drawn first) vs. actual (drawn second) sales on the held-out rows.
predictions = xgb.predict(X_val)
val_index = X_val.index
plt.plot(val_index, predictions)
plt.plot(val_index, y_val)


In [None]:
# SMAPE (in percent) of the predictions computed in the previous cell.
smape(y_val,predictions)

In [None]:
from matplotlib import pyplot
# Plot learning curves: per-iteration RMSE recorded for each eval set
# ('validation_0' is the train split, 'validation_1' the held-out split).
results = xgb.evals_result()
plt.figure(figsize=(10, 8))
for eval_key, curve_label in (('validation_0', 'train'), ('validation_1', 'test')):
    plt.plot(results[eval_key]['rmse'], label=curve_label)
plt.legend()
plt.xlabel('iterations')
plt.ylabel('rmse')
plt.show()

In [None]:
# List every column now available on the training frame before choosing model features.
train_df.columns

In [None]:
#https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/302270
# The idea of modelling the log of the target comes from this discussion.

# Feature set for the full-data models (holiday_month and step^2 are left out).
model_cols = [
    'country', 'store', 'product',
    'Quarter', 'day', 'month', 'year', 'week2', 'dayofyear',
    'busiest', 'day_of_week', 'step',
    'is_holiday', 'holiday_name',
    'GDPperCapita', 'GrowthRate',
]
features = train_df[model_cols]

targets = train_df['num_sold']



In [None]:
# Natural log of the target; predictions of models trained on it are mapped back with np.exp.
targets_log = np.log(targets)

In [None]:
# Hold out the last 5% of rows (shuffle=False keeps row order). The log targets
# are split the same way, so both target versions cover the same rows.
X_train, X_val, y_train, y_val = train_test_split(features, targets, test_size=0.05, shuffle=False)
y_train_log, y_val_log = train_test_split(targets_log, test_size=0.05, shuffle=False)

In [None]:
# Linear baseline on the full feature set, scored with SMAPE on the held-out rows.
model1 = LinearRegression().fit(X_train, y_train)
linear_val_pred = model1.predict(X_val)
smape(y_val, linear_val_pred)

In [None]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree

# XGBoost on the raw target.
# NOTE(review): this rebinds `xgb` from the xgboost module (imported at the top
# of this cell) to the regressor instance.
xgb = xgb.XGBRegressor(learning_rate=0.17)
# Eval sets: index 0 is the train split, index 1 the held-out split.
xgb.fit(X_train, y_train,
        eval_set=[(X_train,y_train),(X_val, y_val)],
        early_stopping_rounds=25,
       verbose=False)
predictions = xgb.predict(X_val)
# SMAPE (in percent) on the held-out rows.
smape(y_val,predictions)

In [None]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree

# XGBoost on the log target.
# NOTE(review): this rebinds `xgb` from the xgboost module (imported at the top
# of this cell) to the regressor instance.
xgb = xgb.XGBRegressor(learning_rate=0.17)
# Eval sets: index 0 is the train split, index 1 the held-out split (both in log space).
xgb.fit(X_train, y_train_log,
        eval_set=[(X_train,y_train_log),(X_val, y_val_log)],
        early_stopping_rounds=25,
       verbose=False)
predictions = xgb.predict(X_val)
# Both sides are mapped back from log space before computing SMAPE.
smape(np.exp(y_val_log),np.exp(predictions))

In [None]:
# Feature importance of the fitted model currently bound to `xgb`.
plot_importance(xgb)

In [None]:
# Per-iteration metrics recorded for the eval sets of the model currently bound to `xgb`.
results = xgb.evals_result()

In [None]:
from matplotlib import pyplot
# Plot learning curves from `results`: per-iteration RMSE for each eval set
# ('validation_0' is drawn as train, 'validation_1' as test).
plt.figure(figsize=(10, 8))
for eval_key, curve_label in (('validation_0', 'train'), ('validation_1', 'test')):
    plt.plot(results[eval_key]['rmse'], label=curve_label)
plt.legend()
plt.xlabel('iterations')
plt.ylabel('rmse')
plt.show()

In [None]:
from catboost import CatBoostRegressor
# CatBoost on the log target; SMAPE is computed after mapping both sides back with exp.
cat = CatBoostRegressor(n_estimators=600)
cat.fit(
    X_train,
    y_train_log,
    eval_set=(X_val, y_val_log),
    early_stopping_rounds=500,
    verbose=False,
)
cat_val_pred = np.exp(cat.predict(X_val))
smape(np.exp(y_val_log), cat_val_pred)

In [None]:
import lightgbm as lgb
# LightGBM (DART boosting) on the raw target.
# NOTE(review): this rebinds `lgb` from the lightgbm module (imported at the top
# of this cell) to the regressor instance.
lgb = lgb.LGBMRegressor(n_estimators=1000,boosting_type='dart',learning_rate=0.2)
lgb.fit(X_train, y_train)
# SMAPE (in percent) on the held-out rows.
smape(y_val,lgb.predict(X_val))

In [None]:
import lightgbm as lgb
# LightGBM (DART boosting) on the log target.
# NOTE(review): this rebinds `lgb` from the lightgbm module (imported at the top
# of this cell) to the regressor instance.
lgb = lgb.LGBMRegressor(n_estimators=1000,boosting_type='dart',learning_rate=0.2)
lgb.fit(X_train, y_train_log)
# Both sides are mapped back from log space before computing SMAPE.
smape(np.exp(y_val_log),np.exp(lgb.predict(X_val)))

# Train on Full Train Data

In [None]:
import xgboost as xgb
# Final XGBoost model, fitted on all training rows with the raw (non-log) target.
# NOTE(review): rebinds `xgb` from the module imported above to the regressor instance.
xgb = xgb.XGBRegressor(learning_rate=0.17)
xgb.fit(features,targets, verbose=False)

In [None]:
import lightgbm as lgb
# Final LightGBM model, fitted on all training rows with the raw (non-log) target.
# NOTE(review): rebinds `lgb` from the module imported above to the regressor instance.
lgb = lgb.LGBMRegressor(n_estimators=1000,boosting_type='dart',learning_rate=0.2)
lgb.fit(features, targets)

In [None]:
from catboost import CatBoostRegressor
# Final CatBoost model, fitted on all training rows with the LOG target
# (its predictions are therefore in log space).
cat = CatBoostRegressor(n_estimators=600,verbose=False)
cat.fit(features,targets_log)

# Making Submission file

In [None]:
# Select the test columns in exactly the order the models were trained on
# (`features` is the training matrix), so that no model sees features in a
# different position or order than during fitting.
test_features = test_df[features.columns]
predictions1 = xgb.predict(test_features)  # raw-target model
predictions2 = cat.predict(test_features)  # log-target model (output in log space)
predictions3 = lgb.predict(test_features)  # raw-target model

In [None]:
# Weighted blend of the three models (weights sum to 1). `predictions2` comes
# from the model trained on the log target, so it is mapped back with exp
# BEFORE its weight is applied.
predictions = (predictions1*0.33)+(np.exp(predictions2)*0.34)+(predictions3*0.33)

In [None]:
# Inspect the blended predictions before writing the submission file.
predictions

In [None]:
# Fill the sample submission with the blended predictions, rounded up to whole units.
output = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')

#idea comes from https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/299162
output['num_sold'] = np.ceil(predictions)
output.to_csv('submission.csv',index =False)
