In [30]:
import numpy as np
import pandas as pd
import missingno as msno
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the raw promotion data from the Excel workbook (path is relative to the notebook's working directory).
df = pd.read_excel('so_data.xlsx')

In [3]:
# Show the loaded frame using the notebook's rich display.
df

Unnamed: 0,product,season,discount_percentage,sales_uplift_norm
0,A,summer,0.266667,-0.144010
1,A,summer,0.266667,0.317256
2,A,winter,0.266667,0.084408
3,A,monsoon,0.100000,0.122893
4,A,winter,0.100000,-0.154395
...,...,...,...,...
556,C,monsoon,0.266667,-0.152529
557,C,summer,0.111111,0.140318
558,C,monsoon,0.111111,0.132683
559,C,summer,0.388889,0.233849


In [4]:
def pre_process_data(df,product):
    """Turn the raw promotion frame into a numeric, model-ready frame.

    Processing steps:
      1. The response ``sales_uplift_norm`` is taken as-is (already transformed).
      2. Numeric predictors (currently only ``discount_percentage``) are
         centered and scaled to unit variance.
      3. Categorical and string columns, except ``product``, are one-hot
         encoded.
      4. Dummies, scaled numerics and the response are combined, with the
         response as the last column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Must contain ``discount_percentage`` and
        ``sales_uplift_norm``; a ``product`` column, if present, is excluded
        from the features.
    product : str
        Label printed in the progress banner.
        NOTE(review): rows are not filtered by ``product`` inside this
        function; if a per-product model is intended, the caller has to pass
        an already-filtered frame. Confirm the intended usage.

    Returns
    -------
    pandas.DataFrame
        One row per input row: dummy columns, then the scaled numeric
        columns, then ``sales_uplift_norm``.
    """
    # drop=True discards the caller's index instead of inserting it as an
    # extra column, which could otherwise be picked up as a feature.
    data = df.reset_index(drop=True)

    print("----------product: {}----------".format(product))
    print("pre process df.shape {}".format(df.shape))

    # 1. Response variable (already transformed upstream).
    response = data['sales_uplift_norm']

    # 2. Numeric predictors: center and scale each column.
    numeric_vars = ['discount_percentage'] # may include mrp, depth
    df_norm = data[numeric_vars].apply(scale, axis = 0)

    # 3. Categorical predictors: exclude the product identifier regardless of
    #    whether it is stored as object or category dtype.
    candidates = data.drop(columns=['product'], errors='ignore')
    cat_cols = candidates.select_dtypes('category').columns
    str_cols = candidates.select_dtypes('object').columns
    all_cat_cols = [*cat_cols, *str_cols]

    # dtype=float keeps the combined frame purely numeric: newer pandas
    # versions emit boolean dummies by default, and a frame mixing bool and
    # float columns converts to an object array with .to_numpy().
    df_dummies = pd.get_dummies(data[all_cat_cols], dtype=float)

    # 4. Combine dummies and scaled numerics (both share data's index),
    #    then append the response as the last column.
    df_processed = pd.concat([df_dummies, df_norm], axis=1)
    df_processed['sales_uplift_norm'] = response

    print("post process df.shape {}".format(df_processed.shape))
    return df_processed


def rf_fit(df, random_state = 12, n_estimators = 500):
    """Fit a bootstrap random-forest regressor on a pre-processed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column ``sales_uplift_norm``; every other
        column is used as a feature.
    random_state : int, default 12
        Seed passed to the forest.
    n_estimators : int, default 500
        Number of trees in the forest.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator. It is built with ``oob_score=True``, so the
        out-of-bag score is computed during fitting.
    """
    train_features = df.drop(columns='sales_uplift_norm')
    train_labels = df['sales_uplift_norm']

    forest = RandomForestRegressor(n_estimators = n_estimators,
                                   random_state = random_state,
                                   bootstrap = True,
                                   oob_score = True)

    # fit() returns the estimator itself, so the fitted model is returned directly.
    return forest.fit(train_features, train_labels)

In [5]:
# Build the model-ready frame; 'A' is the product label passed to pre_process_data.
dd = pre_process_data(df,'A')

----------product: A----------
pre process df.shape (561, 4)
post process df.shape (561, 5)


In [6]:
# Fit the random forest on the processed frame using the default seed.
rfa = rf_fit(dd)

In [7]:
# R^2 of the forest on the same rows it was fitted on (in-sample, not a hold-out estimate).
# The target is the last column of dd, so selecting by name matches the positional split.
rfa.score(dd.drop(columns='sales_uplift_norm'), dd['sales_uplift_norm'])

0.2547495330403349

In [8]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.activations import relu

In [9]:
# Split the processed frame into a target vector and a feature matrix as NumPy arrays.
train_labels = dd.loc[:, 'sales_uplift_norm'].to_numpy()
train_features = dd.drop(columns=['sales_uplift_norm']).to_numpy()

In [10]:
# Sanity check: (rows, feature columns) of the matrix fed to the network.
train_features.shape

(561, 4)

In [13]:
# Small fully-connected regressor: two hidden ReLU layers of 6 units each
# and a single linear output unit.
# The input width is read from the feature matrix instead of being
# hard-coded, so the cell keeps working if the number of feature columns changes.
n_features = train_features.shape[1]

inp = Input(shape=(n_features,))
x = Dense(6, activation=relu)(inp)
x = Dense(6, activation=relu)(x)
out = Dense(1)(x)

model = Model(inp, out)
model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 30        
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 79
Trainable params: 79
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Regression set-up: MSE loss, with mean absolute error reported as the metric.
# Classification accuracy is not meaningful for a continuous target, and
# `metrics` is documented as a list, so it is passed as one.
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
model.fit(train_features, train_labels, epochs=10)

In [26]:
# Inspect the first array returned by get_weights() for layer index 1
# (index 0 is the input layer in the summary above).
model.layers[1].get_weights()[0]

array([[ 0.6263983 , -0.53436553, -0.75924736,  0.20317674, -0.41735935,
        -0.44787022],
       [ 0.22142923, -0.61395985,  0.44862485,  0.7127708 ,  0.6143671 ,
        -0.28417382],
       [-0.17884833,  0.7507889 ,  0.61793447,  0.68952596, -0.09700739,
         0.47548485],
       [-0.31446475,  0.5164157 , -0.5096545 , -0.49994653,  0.33174765,
        -0.6019063 ]], dtype=float32)