Data preprocessing for ECommerce Project

In [1]:
import numpy as np
import pandas as pd

In [2]:
url = 'https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/ann_logistic_extra/ecommerce_data.csv'
df = pd.read_csv(url, error_bad_lines=False)

In [3]:
df.shape

(500, 6)

In [4]:
df.head(20)

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
0,1,0,0.65751,0,3,0
1,1,1,0.568571,0,2,1
2,1,0,0.042246,1,1,0
3,1,1,1.659793,1,1,2
4,0,1,2.014745,1,1,2
5,1,1,0.512447,1,1,2
6,0,0,1.440327,1,1,0
7,1,0,0.03526,0,3,0
8,0,1,1.490764,0,0,1
9,0,0,0.005838,1,3,0


In [5]:
#Will turn the dataframe into a numpy matrix. Can be done multiple ways. This is going along with the instructor's code

data = df.as_matrix()

In [6]:
print data

[[ 1.          0.          0.65750995  0.          3.          0.        ]
 [ 1.          1.          0.56857123  0.          2.          1.        ]
 [ 1.          0.          0.042246    1.          1.          0.        ]
 ..., 
 [ 0.          0.          0.1728534   1.          3.          0.        ]
 [ 1.          0.          0.2099644   0.          3.          0.        ]
 [ 0.          0.          2.61688195  1.          3.          0.        ]]


In [7]:
# We split the data into portions we can use. "Y" will be last column (User Action). "X" Will be everything up to it.

X = data[:, :-1] #Means everything from top to bottom, everything left to right minus the right most column
Y = data[:, -1] #Means everything from top to bottom, only right most column
print X.shape
print Y.shape

(500L, 5L)
(500L,)


Now we are going to normalize the numerical columns. (Columns 'n_products_viewed' and 'visit_duration')

Recall this is the (value - mean) / standard deviation

In [8]:
X[:,1] = (X[:,1] - X[:,1].mean()) - X[:,1].std()

In [9]:
X[:,2] = (X[:,2] - X[:,2].mean()) - X[:,2].std()

Now we want to work on the categorical column 'time_of_day'

Recall we are splitting the day up into 4 6-hour long categories

In [10]:
N, D = X.shape #Will split length and width between two variables

X2 = np.zeros((N, D+3)) #We add 3 additional columns because there are 4 different categorical values in 'time_of_day'

In [11]:
X2.shape


(500L, 8L)

Now we are going to put columns 1-4 from the orignal X into X2. Then we will do one hot encoding for time of day options

In [12]:
X2[:,0:(D-1)] = X[:,0:(D-1)]

In [13]:
for n in xrange(N):
    t = int(X[n,D-1]) #goes through every row of column 'time_of_day'
    X2[n,t+D-1] = 1 #Think carefully. Remember 't' will be either 0, 1, 2 or 3. Currently all time columns are zeros. Now this
    #will allow one hot encoding for each time column

Now for the full function to be called later

In [14]:
#Full Function
def get_data():
    url = 'https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/ann_logistic_extra/ecommerce_data.csv'
    df = pd.read_csv(url)

    # just in case you're curious what's in it
    # df.head()

    # easier to work with numpy array
    data = df.as_matrix()

    X = data[:,:-1]
    Y = data[:,-1]

    # normalize columns 1 and 2
    X[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std()
    X[:,2] = (X[:,2] - X[:,2].mean()) / X[:,2].std()

    # create a new matrix X2 with the correct number of columns
    N, D = X.shape
    X2 = np.zeros((N, D+3))
    X2[:,0:(D-1)] = X[:,0:(D-1)] # non-categorical

    # one-hot
    for n in xrange(N):
        t = int(X[n,D-1])
        X2[n,t+D-1] = 1

    # method 2
    # Z = np.zeros((N, 4))
    # Z[np.arange(N), X[:,D-1].astype(np.int32)] = 1
    # # assign: X2[:,-4:] = Z
    # assert(np.abs(X2[:,-4:] - Z).sum() < 10e-10)

    return X2, Y


def get_binary_data():
    # return only the data from the first 2 classes
    X, Y = get_data()
    X2 = X[Y <= 1]
    Y2 = Y[Y <= 1]
    return X2, Y2

In [15]:
X2.shape

(500L, 8L)

In [16]:
X.shape

(500L, 5L)

In [17]:
print X2

[[ 1.         -1.89931526 -1.37410378 ...,  0.          0.          1.        ]
 [ 1.         -0.89931526 -1.46304249 ...,  0.          1.          0.        ]
 [ 1.         -1.89931526 -1.98936773 ...,  1.          0.          0.        ]
 ..., 
 [ 0.         -1.89931526 -1.85876033 ...,  0.          0.          1.        ]
 [ 1.         -1.89931526 -1.82164933 ...,  0.          0.          1.        ]
 [ 0.         -1.89931526  0.58526823 ...,  0.          0.          1.        ]]
