In [1]:
import numpy as np
import pandas as pd

In [32]:
df = pd.read_csv('ecommerce_data.csv')
df.head()

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
0,1,0,0.65751,0,3,0
1,1,1,0.568571,0,2,1
2,1,0,0.042246,1,1,0
3,1,1,1.659793,1,1,2
4,0,1,2.014745,1,1,2


In [34]:
XX = df.as_matrix()
XX[:, 1][0:10]
NN, DD = XX.shape
NN, DD

XX[0, 4]
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### Data Processing-

In [35]:
def get_data():
    df = pd.read_csv('ecommerce_data.csv')
    data = df.as_matrix() # turning into a numpy matrix
    
    ### want to split out X and Y columns ###
    
    X = data[:, :-1]  # all rows and all columns except the last one
    Y = data[:, -1] # all rows and last column
    
    ### Normalize the numerical columns ###
    
    X[:, 1] = (X[:, 1] - X[:, 1].mean()) / X[:, 1].std()  # n_products viewed
    X[:, 2] = (X[:, 2] - X[:, 2].mean()) / X[:, 2].std()  # visit_duration
    
    ### Work on the Categorical Columns ###
    
    # time_of_day has 4 different categorical values(0, 1, 2, 3). i.e. 24 hrs have been divided into 6 hrs each intervals.
    N, D = X.shape # original shape of the data i.e. (500, 5)
    X2 = np.zeros((N, D + 3))  # creating a new array with a new shape
    X2[:, 0:(D - 1)] = X[:, 0:(D - 1)]  # All rows and from 0th column to (D - 1) columns all are same
    
    # for other four columns we will do a One-hot encoding
    for n in range(N):
        t = int(X[n, D - 1])  # (D-1) column is the time_of_day
        X2[n, t+D-1] = 1 # when t=0, t+D-1 = 4 i.e. index = 4 means 5th column set the value to 1 and so on ...
    
    return X2, Y

In [36]:
# Now for the logistic class we only want the binary data and not the full data set-
def get_binary_data():
    X, Y = get_data()
    X2 = X[Y <= 1]
    Y2 = Y[Y <= 1]
    return X2, Y2

### Making Predictions

In [37]:
# So, had it been I have two different files and the data processing file was called as process.py, we would do the following-
# from process import get_binary_data

In [38]:
X, Y = get_binary_data()

In [42]:
X[0:2]

array([[ 1.        , -0.81697841, -0.40827769,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 1.        ,  0.13967078, -0.4994283 ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ]])

In [43]:
Y[0:2]

array([ 0.,  1.])

In [46]:
X.shape

(398, 8)

In [47]:
D = X.shape[1]  # set the dimensionality of the weights
W = np.random.randn(D)
b = 0 # is the bias term

In [48]:
def sigmoid(a):
    return 1 / (1 + np.exp(-a))

In [51]:
def forward(X, W, b):
    return sigmoid(X.dot(W) + b)

In [52]:
P_Y_given_X = forward(X, W, b)

In [54]:
P_Y_given_X.shape  # all teh rows we have selected

(398,)

In [55]:
P_Y_given_X[0:10]

array([ 0.44965537,  0.19915582,  0.19806195,  0.66460073,  0.24687678,
        0.71663679,  0.09401419,  0.56616477,  0.56135316,  0.06632764])

In [57]:
predictions = np.round(P_Y_given_X)  # anything more than .5 will be 1 and viceversa
predictions[0:10]

array([ 0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  0.])

In [58]:
# Now we will write a function to determine our classification rate-
def classification_rate(Y, P):
    return np.mean(Y == P)  # i.e wherever Y == P (i.e. True class equals Predicted class, sample it and find the mean of it)

In [61]:
print("Score: ", classification_rate(Y, predictions))

Score:  0.298994974874


In [62]:
# So, the model accuracy is only 30 %.

So, what we saw is important. If we select the weights randomly then we will not do well with the classification.
So, the next step would be to train these weights so that our model accuracy is improved. Good luck.