In [1]:
from tensorflow.python.compiler.mlcompute import mlcompute
mlcompute.set_mlc_device(device_name='gpu')



<a id='1'></a>
## 1.Importing packages

In [2]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
from sklearn.preprocessing import StandardScaler


# from tensorflow.keras import Sequential, Input, Model
# from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, BatchNormalization, Dropout, LSTM
# from tensorflow.keras.losses import BinaryCrossentropy
# from tensorflow.keras.metrics import (BinaryAccuracy, Precision, Recall, AUC, RootMeanSquaredError,
#                                       FalsePositives, FalseNegatives, TruePositives, TrueNegatives)
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.activations import relu



<a id='2'></a>
## 2.Read CSV files into DataFrame

You are provided with daily historical sales data. The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. Creating a robust model that can handle such situations is part of the challenge.<br>


<h3>File descriptions</h3>
    <ul>
        <li>sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.</li>
        <li>test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.</li>
        <li>sample_submission.csv - a sample submission file in the correct format.</li>
        <li>items.csv - supplemental information about the items/products.</li>
        <li>item_categories.csv  - supplemental information about the items categories.</li>
        <li>shops.csv- supplemental information about the shops.</li>
    </ul>
            
<h3>Data fields</h3>
    <ul>
        <li>ID - an Id that represents a (Shop, Item) tuple within the test set</li>
        <li>shop_id - unique identifier of a shop</li>
        <li>item_id - unique identifier of a product</li>
        <li>item_category_id - unique identifier of item category</li>
        <li>item_cnt_day - number of products sold. You are predicting a monthly amount of this measure</li>
        <li>item_price - current price of an item</li>
    <li>date - date in format dd/mm/yyyy</li>
    <li>date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33</li>
    <li>item_name - name of item</li>
    <li>shop_name - name of shop</li>
    <li>item_category_name - name of item category</li>
    <li>This dataset is permitted to be used for any purpose, including commercial use.</li>
    </ul>

In [3]:
train = pd.read_csv("../Machine_Learning_with_Scikit_Learn/datasets/predict-future-sales/sales_train.csv")
test = pd.read_csv("../Machine_Learning_with_Scikit_Learn/datasets/predict-future-sales/test.csv")


# submission = pd.read_csv(common_path+'/sample_submission.csv')
# items = pd.read_csv(common_path+'/items.csv')
# item_cats = pd.read_csv(common_path+'/item_categories.csv')
# shops = pd.read_csv(common_path+'/shops.csv')

<a id='3'></a>
## 3.Data Preprocessing

<a id='31'></a>
### 3.1. Missing Values

In [None]:
# Checking for missing values
train.isnull().sum()

<a id='32'></a>
### 3.2. Drop unnecessary columns

In [None]:
df_dropped = train.drop(['date','item_price'], axis=1)
df_dropped.head(3)

In [None]:
df_forecast_dropped = test.drop(['ID'], axis=1)
df_forecast_dropped.head(3)

<a id='33'></a>
### 3.3 Check zero variance features

In [None]:
### It will zero variance features
from sklearn.feature_selection import VarianceThreshold
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(df_dropped)
df_dropped.columns[var_thres.get_support()]

In [None]:
constant_columns = [column for column in df_dropped.columns
                    if column not in df_dropped.columns[var_thres.get_support()]]

print(len(constant_columns))

In [None]:
df_dropped = df_dropped.drop(constant_columns,axis=1)
df_dropped.head()

<a id='34'></a>
### 3.4 Check categorical columns

In [None]:
# There is no categorical columns in the dataframe
categorical_feature_columns = list(set(df_dropped.columns) - set(df_dropped._get_numeric_data().columns))
categorical_feature_columns

In [None]:
numerical_feature_columns = list(df_dropped._get_numeric_data().columns)
numerical_feature_columns

<a id='35'></a>
### 3.5 Investigate all the elements within each Feature

In [None]:
for col in df_dropped:
    unique_vals = np.unique(df_dropped[col])
    nr_values = len(unique_vals)
    if nr_values < 10:
        print(f'The number of values for feature {col} :{nr_values} -- {unique_vals}')
    else:
        print(f'The number of values for feature {col} :{nr_values}')

<a id='36'></a>
### 3.6 Create train and test datasets

In [None]:
df_group = df_dropped.groupby(['date_block_num', 'shop_id', 'item_id']).sum().reset_index()
df_group.head()

In [4]:
#pivot the dataframe and index with item_id and shop_id
train = train.pivot_table(index=['item_id', 'shop_id'],
                            values=['item_cnt_day'],
                            columns='date_block_num', fill_value=0)
#reset the index 
train = train.reset_index()
#show 
train.head()

Unnamed: 0_level_0,item_id,shop_id,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,0,54,,,,,,,,,...,,,,,,,,,,
1,1,55,,,,,,,,,...,,,,,,,,,,
2,2,54,,,,,,,,,...,,,,,,,,,,
3,3,54,,,,,,,,,...,,,,,,,,,,
4,4,54,,,,,,,,,...,,,,,,,,,,


In [5]:
#merge the train set and the test set 
dataset = pd.merge(test, train, on=['item_id', 'shop_id'], how='left')
dataset = dataset.fillna(0)
dataset.head()

Unnamed: 0,ID,shop_id,item_id,"(item_cnt_day, 0)","(item_cnt_day, 1)","(item_cnt_day, 2)","(item_cnt_day, 3)","(item_cnt_day, 4)","(item_cnt_day, 5)","(item_cnt_day, 6)",...,"(item_cnt_day, 24)","(item_cnt_day, 25)","(item_cnt_day, 26)","(item_cnt_day, 27)","(item_cnt_day, 28)","(item_cnt_day, 29)","(item_cnt_day, 30)","(item_cnt_day, 31)","(item_cnt_day, 32)","(item_cnt_day, 33)"
0,0,5,5037,,,,,,,,...,1.0,,,,1.0,1.0,1.0,1.0,1.0,
1,1,5,5320,,,,,,,,...,,,,,,,,,,
2,2,5,5233,,,,,,,,...,,,,,1.5,1.0,,1.0,1.0,1.0
3,3,5,5232,,,,,,,,...,,,,,,,,1.0,,
4,4,5,5268,,,,,,,,...,,,,,,,,,,


In [6]:
#drop  some columns 
dataset.drop(['shop_id', 'item_id', 'ID'], axis=1,inplace=True)
dataset.head()

Unnamed: 0,"(item_cnt_day, 0)","(item_cnt_day, 1)","(item_cnt_day, 2)","(item_cnt_day, 3)","(item_cnt_day, 4)","(item_cnt_day, 5)","(item_cnt_day, 6)","(item_cnt_day, 7)","(item_cnt_day, 8)","(item_cnt_day, 9)",...,"(item_cnt_day, 24)","(item_cnt_day, 25)","(item_cnt_day, 26)","(item_cnt_day, 27)","(item_cnt_day, 28)","(item_cnt_day, 29)","(item_cnt_day, 30)","(item_cnt_day, 31)","(item_cnt_day, 32)","(item_cnt_day, 33)"
0,,,,,,,,,,,...,1.0,,,,1.0,1.0,1.0,1.0,1.0,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,1.5,1.0,,1.0,1.0,1.0
3,,,,,,,,,,,...,,,,,,,,1.0,,
4,,,,,,,,,,,...,,,,,,,,,,


In [7]:
#splitting the dataset 
X_train = np.expand_dims(dataset.values[:, :-1], axis=2)
y_train = dataset.values[:, -1:]
X_test = np.expand_dims(dataset.values[:, 1:], axis=2)

X_train.shape, y_train.shape, X_test.shape

((214200, 33, 1), (214200, 1), (214200, 33, 1))

In [10]:
#buid LSTM Network 
model = Sequential()
model.add(LSTM(units=64, input_shape=(33, 1)))
model.add(Dropout(0.3))
model.add(Dense(1))

model.compile(loss='mse',
              optimizer='adam',
              metrics=['mean_squared_error'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 64)                16896     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________


In [12]:
history = model.fit(X_train, y_train,
                    epochs=10)

Epoch 1/10
   5/6694 [..............................] - ETA: 1:28:09 - loss: nan - mean_squared_error: nan

KeyboardInterrupt: 

In [None]:
# df_train = df_group.pivot_table(index=['shop_id','item_id'], columns='date_block_num', values='item_cnt_day', 
#                         fill_value=0)
# df_train.reset_index(inplace=True)

# df_test = pd.merge(df_forecast_dropped, df_train, on=['shop_id','item_id'], how='left').fillna(0)

# df_train.head()

<a id='4'></a>
## 4. Regressions and Results

<a id='41'></a>
### 4.1 Split the data into X & y

In [None]:
X = df_train[df_train.columns[:-1]]
y = df_train[df_train.columns[-1:]]
print(X.shape)
print(y.shape)

In [None]:
from sklearn import preprocessing
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.8, test_size=0.15, random_state=15)

X_test = df_test.values[:, 1:]

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train).reshape(-1, 35, 1)
X_valid = scaler.fit_transform(X_valid).reshape(-1, 35, 1)
X_test = scaler.fit_transform(X_test).reshape(-1, 35, 1)

# X_train1 = X_train1.reshape(-1, 35, 1)
# X_valid1 = X_valid1.reshape(-1, 35, 1)
# X_test1 = X_test1.reshape(-1, 35, 1)

print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

In [None]:
#buid LSTM Network 
model = Sequential()
model.add(LSTM(units=64, input_shape=(None, 1)))
model.add(Dropout(0.5))
model.add(Dense(1))

model.compile(loss='mse',
              optimizer='adam',
              metrics=['mean_squared_error'])
model.summary()

In [None]:
# model = Sequential([
#     LSTM(30, return_sequences=True, input_shape=[None, 1]),
#     LSTM(30, return_sequences=True),
#     LSTM(30),
#     Dense(1)
# ])

# model.compile(optimizer=Adam(learning_rate=0.0005), 
#               loss = 'mse',
#               metrics=[RootMeanSquaredError(name='rmse')])
# model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=5)

Epoch 1/5
   5/6694 [..............................] - ETA: 1:29:27 - loss: nan - mean_squared_error: nan

In [None]:
prediction = model.predict(X_test)

In [None]:
prediction = prediction.clip(0, 20)

In [None]:
prediction.shape

In [None]:
prediction

In [None]:
# history = model.fit(X_train, y_train, epochs=5,
#                     validation_data=(X_valid, y_valid))

In [None]:
df_dropped = df_dropped.drop(constant_columns,axis=1)
df_dropped.head()

In [None]:
prediction = model.predict(X_test)

In [None]:
prediction = prediction.clip(0, 20)

In [None]:
prediction.shape

In [None]:
prediction[0:10]

In [None]:
submission = pd.DataFrame({'ID': df_test_first['ID'], 'item_cnt_month': prediction[:,0].reshape(-1)})
#submission['item_cnt_month'] = submission['item_cnt_month'].round(0)

In [None]:
submission.to_csv('submission.csv',index=False)

In [None]:
# Check the submission csv to make sure it's in the right format
submissions_check = pd.read_csv("submission.csv")
submissions_check.head()

In [None]:
submissions_check['item_cnt_month'].unique()