# AML Project - Time Series Forecasting

## Data Stuff

### 1. Utility / Loading Data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [2]:
path = '/kaggle/input/store-sales-time-series-forecasting/'

oil = pd.read_csv(path + 'oil.csv')
holidays = pd.read_csv(path +'holidays_events.csv')
stores = pd.read_csv(path + 'stores.csv')
train = pd.read_csv(path + 'train.csv')
transactions = pd.read_csv(path + 'transactions.csv')
test = pd.read_csv(path + 'test.csv')



## Process Data

holiday handling:

0: work day

1 - weekend / bridge day

2 - holiday

type additional: ??? 

transferred holidays are considered normal days

only first holiday in holidays is considered




In [3]:
#change dtype of date column to datetime
oil['date'] = oil['date'].apply(pd.to_datetime)
holidays['date'] = holidays['date'].apply(pd.to_datetime)
train['date']=pd.to_datetime(train ['date'])
#train['date'] = train['date'].apply(pd.to_datetime)
transactions['date'] = transactions['date'].apply(pd.to_datetime)
test['date'] = test['date'].apply(pd.to_datetime)

In [4]:
from sklearn.preprocessing import LabelEncoder
#expand oil to include all dates + interpolate missing data
oil = oil.set_index('date').asfreq('D').reset_index()
oil['dcoilwtico'] = oil['dcoilwtico'].interpolate('linear').ffill().bfill()
train = train.merge(oil)
train = train.rename(columns={"dcoilwtico": "oilprice"})


#change family names & type to numeric values
encoder_family = LabelEncoder()
train['family_id']=encoder_family.fit_transform(train['family'])


#split up date into multiple informations
train['day'] = train['date'].apply(lambda time: time.day)
train['month'] = train['date'].apply(lambda time: time.month)
train['weekday'] = train['date'].apply(lambda time: time.dayofweek)
train['year'] = train['date'].apply(lambda time: time.year)

#remove noise - half a year after earthquake
to_drop = train.loc[train['date'].between('2016-04-16', '2016-10-16')]
train = train.drop(to_drop.index)

prob. faster: do holiday lookup table with clusters instead of store numbers

In [5]:
#holiday handling
def isholiday(row):
  #data
  date = row['date']
  event = holidays.loc[holidays['date'] == date][0:1]
  id = row['store_nbr']
  city = stores.loc[stores['store_nbr'] == id]['city'].values
  state = stores.loc[stores['store_nbr'] == id]['state'].values
  
  
  #check if events apply:
  if len(event) > 0:
    national = event['locale'].values == 'National'
    regional = event['locale'].values == 'Regional' and event['locale_name'].values == state
    local = event['locale'].values == 'Local' and event['locale_name'].values == city
    
    if national[0] or regional[0] or local[0]:
      if event['type'].values == 'Holiday' and  event['transferred'].values == False:
        return 2
      elif event['type'].values == 'Transfer':
        return 2
      elif event['type'].values == 'Bridge':
        return 1
      elif event['type'].values == 'Work Day':
        return 0

  #otherwise: check if weekend
  if row['weekday']< 5:
    return 0
  else: 
    return 1


#lookup table  (adds dates to stores table and gets holiday type)
date1, date2 = train['date'].min(), train['date'].max()
holiday_lookup = stores[['store_nbr', 'type', 'cluster']].copy()
holiday_lookup.loc[:,'holiday'] = 0

holiday_lookup = pd.merge(holiday_lookup, pd.DataFrame({'date': pd.date_range(date1, date2, freq = 'd')}), how = "cross")
holiday_lookup['weekday'] = holiday_lookup['date'].apply(lambda time: time.dayofweek)
holiday_lookup.loc[:, 'holiday'] = holiday_lookup.apply(lambda row: isholiday(row), axis = 1)
holiday_lookup = holiday_lookup.drop('weekday', axis = 1)


#join with train
train = train.merge(holiday_lookup)

encoder_type = LabelEncoder()
train['type']=encoder_type.fit_transform(train['type'])

In [6]:
# preprocess test data
test = test.merge(oil)
test = test.rename(columns={"dcoilwtico": "oilprice"})



#change family names & type to numeric values
test['family_id']=encoder_family.fit_transform(test['family'])



#split up date into multiple informations
test['day'] = test['date'].apply(lambda time: time.day)
test['month'] = test['date'].apply(lambda time: time.month)
test['weekday'] = test['date'].apply(lambda time: time.dayofweek)
test['year'] = test['date'].apply(lambda time: time.year)

In [7]:
#test holiday handling

#holiday handling
def isholiday(row):
  #data
  date = row['date']
  event = holidays.loc[holidays['date'] == date][0:1]
  id = row['store_nbr']
  city = stores.loc[stores['store_nbr'] == id]['city'].values
  state = stores.loc[stores['store_nbr'] == id]['state'].values
  
  
  #check if events apply:
  if len(event) > 0:
    national = event['locale'].values == 'National'
    regional = event['locale'].values == 'Regional' and event['locale_name'].values == state
    local = event['locale'].values == 'Local' and event['locale_name'].values == city
    
    if national[0] or regional[0] or local[0]:
      if event['type'].values == 'Holiday' and  event['transferred'].values == False:
        return 2
      elif event['type'].values == 'Transfer':
        return 2
      elif event['type'].values == 'Bridge':
        return 1
      elif event['type'].values == 'Work Day':
        return 0

  #otherwise: check if weekend
  if row['weekday']< 5:
    return 0
  else: 
    return 1


#lookup table  (adds dates to stores table and gets holiday type)
date1, date2 = test['date'].min(), test['date'].max()
holiday_lookup = stores[['store_nbr', 'type', 'cluster']].copy()
holiday_lookup.loc[:,'holiday'] = 0

holiday_lookup = pd.merge(holiday_lookup, pd.DataFrame({'date': pd.date_range(date1, date2, freq = 'd')}), how = "cross")
holiday_lookup['weekday'] = holiday_lookup['date'].apply(lambda time: time.dayofweek)
holiday_lookup.loc[:, 'holiday'] = holiday_lookup.apply(lambda row: isholiday(row), axis = 1)
holiday_lookup = holiday_lookup.drop('weekday', axis = 1)


#join with train
test = test.merge(holiday_lookup)
test['type']=encoder_type.fit_transform(test['type'])

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

## Full Dataset

In [9]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense


cols = ['store_nbr', 'onpromotion', 'oilprice', 'holiday', 'weekday', 'day', 'month', 'type', 'cluster', 'family_id']
X = train[cols].values
Y = train['sales'].values.ravel()

In [10]:
n_features = len(cols)

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() 

print("REPLICAS: ", strategy.num_replicas_in_sync)
    
    
def CNN_Model(cols):
  n_features = len(cols)
  model = keras.Sequential()
  model.add(Dense(50, activation='relu'))
  model.add(Dense(1, activation = 'softplus'))
  model.compile(optimizer='adam', loss="mean_squared_logarithmic_error")
  return model

from sklearn import preprocessing

def scale(X,Y):
  scaler = preprocessing.StandardScaler().fit(X)
  X_scaled = scaler.transform(X)
  Y_scaled = scaler.transform(Y)
  return X_scaled, Y_scaled

REPLICAS:  1


In [11]:
family_nbr = train['family_id'].max()
cols = ['onpromotion', 'oilprice', 'holiday', 'weekday', 'day', 'month', 'cluster', 'store_nbr']

test['sales'] = 0


for fam in range(family_nbr):
    idx_train = train['family_id']== fam
    idx_test = test['family_id']== fam
    X_1 = train.loc[idx_train][cols].values
    X_2 = test[idx_test][cols].values
    Y = train.loc[idx_train]['sales'].values.ravel()
      
    X_1, X_2 = scale(X_1, X_2)
    
    model = CNN_Model(cols)
    model.fit(X_1, Y, epochs=10, verbose=0)
    pred = model.predict(X_2)

    test.loc[idx_test,'sales'] = pred
    print('group ', fam, ' done ')



2022-08-23 11:45:09.085447: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-08-23 11:45:09.220671: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


group  0  done 
group  1  done 
group  2  done 
group  3  done 
group  4  done 
group  5  done 
group  6  done 
group  7  done 
group  8  done 
group  9  done 
group  10  done 
group  11  done 
group  12  done 
group  13  done 
group  14  done 
group  15  done 
group  16  done 
group  17  done 
group  18  done 
group  19  done 
group  20  done 
group  21  done 
group  22  done 
group  23  done 
group  24  done 
group  25  done 
group  26  done 
group  27  done 
group  28  done 
group  29  done 
group  30  done 
group  31  done 


In [12]:
test[['id', 'sales']].to_csv('submission.csv',index = False)

submission = pd.read_csv('./submission.csv')
submission

Unnamed: 0,id,sales
0,3000888,4.062679
1,3000889,0.038156
2,3000890,5.920139
3,3000891,3246.311523
4,3000892,0.144904
...,...,...
28507,3029395,233.264191
28508,3029396,112.080444
28509,3029397,794.773376
28510,3029398,52.331882
