#  Revenue forecast as the foundation for dynamic pricing


### Goal: Use historical data from an online pharmacy to predict revenue with a **neural network**.

We train a basic neural network on the dataset to check whether it outperforms a standard machine-learning model.

## Import Libraries and Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.impute import KNNImputer

In [None]:
# Load the main dataset and the pipe-separated items table.
train_df = pd.read_csv('Dataset.csv')
items_df = pd.read_csv('items.csv', delimiter="|")

In [None]:
# Left-join the items table onto the main frame via the shared `pid` key, so
# every row of `train_df` is kept even when no matching item exists.
train_df = pd.merge(train_df, items_df, how='left', on='pid')

## Data Preprocessing

In [None]:
# Missing values in these categorical columns become an explicit 'UNKNOWN' level.
for na_col in ('campaignIndex', 'category', 'pharmForm'):
    train_df[na_col] = train_df[na_col].fillna('UNKNOWN')

# Remove the columns that are not used as model inputs.
cols = ['pid', 'lineID', 'day']
train_df.drop(columns=cols, inplace=True)

# Frequency encoding: replace each listed column by the share of rows that
# carry the same value, then discard the original column.
cols = ['group', 'content', 'unit', 'manufacturer', 'pharmForm', 'category']
n_rows = len(train_df)  # dropping columns does not change the row count
for col_name in cols:
    # relative frequency of every distinct value in this column
    fq = train_df.groupby(col_name).size() / n_rows
    # attach the encoded values as a new column
    train_df["{}_freq_encoded".format(col_name)] = train_df[col_name].map(fq)
    # the raw column is no longer needed
    train_df = train_df.drop(columns=col_name)
train_df.head(10)

Unnamed: 0,adFlag,availability,competitorPrice,click,basket,order,price,revenue,genericProduct,salesIndex,campaignIndex,rrp,group_freq_encoded,content_freq_encoded,unit_freq_encoded,manufacturer_freq_encoded,pharmForm_freq_encoded,category_freq_encoded
0,0,2,14.6,1,0,0,16.89,0.0,0,40,UNKNOWN,18.25,0.049838,0.148747,0.349399,0.005196,0.015102,0.003086
1,1,1,8.57,0,1,0,8.75,0.0,1,40,C,18.81,0.003152,0.148747,0.468656,0.046873,0.092496,0.028501
2,0,1,14.77,0,1,0,16.06,0.0,0,53,UNKNOWN,18.48,0.00332,0.001388,0.349399,0.012399,0.00584,0.003881
3,1,1,6.59,0,0,1,6.55,6.55,0,40,UNKNOWN,9.31,0.015847,0.027549,0.174535,0.044391,0.062634,0.022563
4,0,1,4.39,0,0,1,4.14,4.14,0,53,UNKNOWN,8.13,0.00256,0.000884,0.468656,0.009547,0.00428,0.001312
5,0,1,13.66,0,0,1,10.03,10.03,0,52,UNKNOWN,21.6,0.003565,0.003845,0.349399,0.009547,0.043165,0.002097
6,1,1,3.03,0,0,1,3.58,3.58,0,40,UNKNOWN,5.62,0.010595,0.119591,0.174535,0.056759,0.034709,0.023893
7,0,1,8.78,1,0,0,8.75,0.0,0,53,UNKNOWN,11.62,0.060232,0.120066,0.349399,0.019879,0.034709,0.013031
8,0,1,10.84,1,0,0,12.04,0.0,0,53,UNKNOWN,14.19,0.060232,0.148747,0.349399,0.000324,0.076062,0.013031
9,1,1,9.12,1,0,0,8.75,0.0,0,40,UNKNOWN,14.25,0.003628,0.120066,0.174535,0.002569,0.034709,0.023893


In [None]:
# Split off the regression target: keep it as `y` and remove the column from
# the feature frame in place.
y = train_df['revenue']
del train_df['revenue']

## Data Transformation

In [None]:
from sklearn.impute import KNNImputer
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (RobustScaler, MinMaxScaler, 
                                   MaxAbsScaler, QuantileTransformer,
                                   FunctionTransformer, OneHotEncoder, StandardScaler)
from scipy.sparse import csc_matrix
import sklearn
# Show estimators as diagrams when they are displayed in the notebook.
sklearn.set_config(display="diagram")

# Columns produced by the frequency-encoding step.
encoded_columns = [
    'group_freq_encoded',
    'content_freq_encoded',
    'unit_freq_encoded',
    'manufacturer_freq_encoded',
    'pharmForm_freq_encoded',
    'category_freq_encoded',
]

one_hot_columns = ['salesIndex', 'campaignIndex', 'availability']

# NOTE(review): 'availability' is routed to both the one-hot encoder and the
# standard scaler, so it is represented twice in the output - confirm that
# this is intended.
ct = make_column_transformer(
    # Categorical columns -> one indicator column per level.
    (OneHotEncoder(), one_hot_columns),
    # Competitor price: KNN imputation followed by robust scaling.
    (make_pipeline(KNNImputer(), RobustScaler()), ['competitorPrice']),
    # Own price and rrp: robust scaling only.
    (make_pipeline(RobustScaler()), ['price', 'rrp']),
    # Original author note: change scaler (ternary, 4nary).
    (StandardScaler(), ['availability', *encoded_columns]),
    # Every column not named above is forwarded unchanged.
    remainder="passthrough",
)



In [None]:
# Display the configured column transformer as the cell output.
ct

In [None]:
# Fit the column transformer on the full feature frame and build the model
# input matrix in one step.
# NOTE(review): the transformer is fitted before the train/test split below,
# so imputer/scaler statistics also see the rows that later form the test
# set - confirm this is acceptable for the reported score.
X = ct.fit_transform(train_df)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Split the remaining training rows again: 20% of them become the validation
# set, the rest stays as training data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.2,
)

## Data Modelling

In [None]:
from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf
from tensorflow import keras

In [None]:
# # define the keras model

# Wide variant: three 784-unit ReLU hidden layers and one linear output unit.
# NOTE(review): `model` is rebound by the `model = Sequential()` cell that
# follows, so this network is not the one that gets compiled and trained.
model = keras.Sequential()
for _ in range(3):
    model.add(keras.layers.Dense(784, kernel_initializer='normal', activation='relu'))
model.add(keras.layers.Dense(1, kernel_initializer='normal', activation='linear'))

In [None]:
# Settings shared by every ReLU layer of the network.
relu_dense = dict(kernel_initializer='normal', activation='relu')

model = Sequential([
    # Input layer: its width is taken from the number of feature columns.
    Dense(128, input_dim=X_train.shape[1], **relu_dense),
    # Hidden layers.
    Dense(256, **relu_dense),
    Dense(256, **relu_dense),
    Dense(256, **relu_dense),
    # Output layer: a single linear unit for the regression target.
    Dense(1, kernel_initializer='normal', activation='linear'),
])

In [None]:
# checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
# checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
# callbacks_list = [checkpoint]

In [None]:
# Optimise mean absolute error with Adam; the same quantity is also reported
# as a metric during training.
model.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)

In [None]:
# Train for 20 epochs with mini-batches of 32 rows.
# Validation uses the dedicated hold-out split (X_val, y_val) created earlier.
# The previous `validation_split=0.2` ignored that split and instead removed a
# further 20% of rows from the already reduced training set.
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb5db473ee0>

## Model Evaluation

In [None]:
# Predict on the held-out test split and report the R^2 score as the cell output.
y_pred = model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)



0.7414094685490551

Conclusion: The neural network reaches an R² score of 0.74 (74%) on the test data, whereas XGBRegressor scores 0.802 (80.2%). This is in line with the common observation that neural networks often do not outperform classical machine-learning models, such as gradient-boosted trees, on tabular datasets. With a better-tuned architecture and training setup, however, the neural network's result could still improve.