In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
def get_data(path="C:/Users/TANNERU/Downloads/Dataset for coding/ecommerce_data.csv"):
  """Load the e-commerce CSV and return a shuffled, encoded train/test split.

  The last CSV column is used as the integer class label. The last *feature*
  column is treated as a categorical code in {0, 1, 2, 3} and is replaced by
  four one-hot columns; every other feature column is copied through.
  Columns 1 and 2 are standardized using the mean / standard deviation of the
  training rows only.

  Parameters
  ----------
  path : str or path-like, optional
      Location of the CSV file. Defaults to the path the notebook was
      originally written against, so existing ``get_data()`` calls keep
      working; pass an explicit path to run on another machine.

  Returns
  -------
  Xtrain, Ytrain, Xtest, Ytest : numpy.ndarray
      The last 100 shuffled rows form the test split, the rest the training
      split. ``X*`` have ``D + 3`` columns (``D`` = number of raw feature
      columns); ``Y*`` are ``int32`` label vectors.

  Notes
  -----
  Rows are shuffled with the global NumPy RNG; call ``np.random.seed`` first
  for a reproducible split.
  """
  df = pd.read_csv(path)

  # Take a private, writable copy: the shuffle below works in place and must
  # neither alias the DataFrame's buffer nor trip over a read-only view.
  data = df.to_numpy(copy=True)

  # shuffle the rows in place
  np.random.shuffle(data)

  # split features and labels
  X = data[:,:-1]
  Y = data[:,-1].astype(np.int32)

  print(X.shape)
  print(Y.shape)

  # one-hot encode the categorical (last) feature column:
  # the D-1 leading columns are copied, the last one expands into 4 columns
  N, D = X.shape
  X2 = np.zeros((N, D+3))
  print(X2.shape)
  X2[:,0:(D-1)] = X[:,0:(D-1)] # non-categorical

  # vectorized one-hot: row n gets a 1 in column (category code + D - 1)
  categories = X[:,D-1].astype(np.int64)
  X2[np.arange(N), categories + (D-1)] = 1

  # the encoded matrix replaces the raw features from here on
  X = X2
  print(X.shape)

  # split train and test (last 100 shuffled rows are held out)
  Xtrain = X[:-100]
  Ytrain = Y[:-100]
  Xtest = X[-100:]
  Ytest = Y[-100:]

  # normalize columns 1 and 2 with statistics from the training rows only,
  # so no information from the test split leaks into the scaling
  for i in (1, 2):
    m = Xtrain[:,i].mean()
    s = Xtrain[:,i].std()
    if s == 0:
      # constant column: center it only, instead of dividing by zero
      s = 1.0
    Xtrain[:,i] = (Xtrain[:,i] - m) / s
    Xtest[:,i] = (Xtest[:,i] - m) / s

  return Xtrain, Ytrain, Xtest, Ytest


def get_binary_data():
  """Return the train/test split restricted to samples whose label is <= 1.

  Calls get_data() and keeps, in both splits, only the rows belonging to the
  first two classes (labels 0 and 1).

  Returns
  -------
  X2train, Y2train, X2test, Y2test : numpy.ndarray
  """
  Xtrain, Ytrain, Xtest, Ytest = get_data()

  # one boolean row-selector per split, reused for features and labels
  keep_train = Ytrain <= 1
  keep_test = Ytest <= 1

  return Xtrain[keep_train], Ytrain[keep_train], Xtest[keep_test], Ytest[keep_test]


In [5]:
# Smoke-test the loader. NOTE(review): as the last expression of the cell this
# displays the whole returned tuple of arrays; checking shapes would be less noisy.
get_data()

(500, 5)
(500,)
(500, 8)
(500, 8)


(array([[ 1.        ,  0.15664925,  1.96978825, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  2.08463998, -0.9746128 , ...,  1.        ,
          0.        ,  0.        ],
        [ 1.        , -0.80734612, -0.542707  , ...,  1.        ,
          0.        ,  0.        ],
        ...,
        [ 1.        ,  0.15664925,  0.16086681, ...,  0.        ,
          1.        ,  0.        ],
        [ 0.        ,  0.15664925, -1.01975117, ...,  1.        ,
          0.        ,  0.        ],
        [ 0.        , -0.80734612,  4.93591491, ...,  1.        ,
          0.        ,  0.        ]]),
 array([2, 1, 0, 2, 2, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 2, 2, 0, 1, 0, 2, 1, 1, 0,
        1, 1, 2, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 1, 0, 2, 1, 2, 0, 0, 2, 0, 0, 0, 1, 1, 2, 1, 0, 0,
        0, 2, 0, 0, 0, 2, 3, 0, 0, 0, 1, 2, 0, 0, 2, 0, 3, 3, 0, 0, 2, 0,
   

In [6]:
# Smoke-test the two-class loader. NOTE(review): this displays the whole returned
# tuple of arrays; checking shapes would be less noisy.
get_binary_data()

(500, 5)
(500,)
(500, 8)
(500, 8)


(array([[ 1.        , -0.80843707,  0.73672238, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        , -0.80843707, -0.24602502, ...,  0.        ,
          0.        ,  0.        ],
        [ 1.        ,  1.05003896, -0.90584228, ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 1.        ,  1.05003896, -0.92726394, ...,  0.        ,
          0.        ,  1.        ],
        [ 0.        ,  0.12080094, -1.01979805, ...,  0.        ,
          0.        ,  1.        ],
        [ 0.        ,  0.12080094, -0.98403177, ...,  1.        ,
          0.        ,  0.        ]]),
 array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
        0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
   

In [7]:
# Only the training split is used in this cell; the test split is discarded.
X, Y, _, _ = get_data()

# Layer sizes
D = X.shape[1]   # number of input features (columns of X)
M = 5            # number of hidden units
K = len(set(Y))  # number of distinct labels present in Y

# Random weights, zero biases.
# Layer 1 maps D -> M, layer 2 maps M -> K.
w1, b1 = np.random.randn(D,M), np.zeros(M)
w2, b2 = np.random.randn(M,K), np.zeros(K)

# Show the initial parameters, one array after another
print(w1, b1, w2, b2, sep="\n")


def softmax(a):
    """Numerically stable softmax over the last axis.

    Parameters
    ----------
    a : array_like
        Activations; for a 2-D input of shape (N, K) each row is normalized
        independently. A 1-D input is treated as a single row.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``a`` whose entries along the last axis are
        non-negative and sum to 1.
    """
    a = np.asarray(a)
    if a.size == 0:
        # nothing to normalize; keep the shape and return a float array
        return np.exp(a)

    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (inf / inf = nan)
    # when the activations are large.
    shifted = a - a.max(axis=-1, keepdims=True)
    expA = np.exp(shifted)
    return expA / expA.sum(axis=-1, keepdims=True)

def forward(X,w1,b1,w2,b2):
    """Run one forward pass through a single-hidden-layer network.

    The hidden layer applies tanh to ``X.w1 + b1``; the output layer applies
    softmax to ``Z.w2 + b2``. The hidden activations are printed as a side
    effect before the output probabilities are returned.
    """
    # hidden layer: affine map followed by tanh
    hidden_input = np.dot(X, w1) + b1
    Z = np.tanh(hidden_input)
    print("Hidden layer output:",Z)

    # output layer: affine map followed by softmax
    output_input = np.dot(Z, w2) + b2
    return softmax(output_input)


# Class probabilities for every row of X, from the randomly initialised network
P_Y_X = forward(X,w1,b1,w2,b2)
print("output",P_Y_X)

# Predicted class = column index of the largest probability in each row
predictions = P_Y_X.argmax(axis=1)
print(predictions, predictions.shape, sep="\n")
    

def classification_rate(Y,P):
    """Return the fraction of positions where targets Y and predictions P agree."""
    hits = Y == P  # element-wise match mask
    return np.mean(hits)


# Accuracy of the untrained (randomly initialised) network on the training labels
print("Classification rate:", classification_rate(Y=Y, P=predictions))


(500, 5)
(500,)
(500, 8)
(500, 8)
[[-0.2827313   0.66285403 -0.04846943 -2.07270632  0.27754904]
 [ 0.10772192 -0.96777696 -0.80909661 -0.14026829  1.25823178]
 [-0.42395874  0.05977071 -0.30467985  0.24260985 -0.31646713]
 [ 0.76874862 -0.09038474 -1.58659983 -0.20428606  0.89888855]
 [ 1.2674795  -1.0567243   0.29302852 -1.04428563 -0.6165368 ]
 [ 0.10644393 -0.48006256 -1.67704053 -0.1310845   0.54179582]
 [ 1.1695886  -0.11517624 -0.06587321  1.12422099  0.29012634]
 [-1.32057469  0.57354612  0.8154722  -1.06619186 -0.59469624]]
[0. 0. 0. 0. 0.]
[[-0.25278028  0.04761252 -0.84964946 -1.3478357 ]
 [-0.25617262 -0.19779722  0.58019939 -0.55298133]
 [ 0.89192757  0.43215887  1.09001762  0.51760136]
 [ 0.10788578 -1.61944155  0.20053448  1.04745568]
 [ 0.15505934 -2.06302305 -0.76386399 -0.99694551]]
[0. 0. 0. 0.]
Hidden layer output: [[ 0.31208282  0.45061314 -0.99273897 -0.38275797  0.58445477]
 [ 0.46367913  0.87520408  0.30025284 -0.58240654 -0.57207572]
 [ 0.460709    0.2313917  -

In [53]:
# Display the feature matrix X.
# NOTE(review): presumably the training features assigned from get_data() above — confirm against kernel state.
X

array([[ 1.        ,  0.15048012, -0.34575774, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        , -0.82035935,  0.00866684, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        ,  1.12131959,  0.20049258, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.        , -0.82035935, -0.76503333, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  1.12131959, -0.96273933, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.15048012,  0.38520658, ...,  0.        ,
         0.        ,  1.        ]])

In [54]:
# Display the label vector Y.
# NOTE(review): presumably the training labels assigned from get_data() above — confirm against kernel state.
Y

array([1, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 2, 0, 1, 3, 0, 0, 0, 2, 3, 0, 0,
       2, 2, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 2, 1, 0,
       1, 0, 1, 1, 3, 0, 1, 0, 1, 2, 0, 0, 3, 0, 0, 0, 2, 0, 0, 2, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 2, 1, 0, 1, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 3, 0, 0, 0, 0, 0, 1, 0, 2, 3, 0, 0, 2, 3, 2, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 2, 1, 0, 1, 1, 1, 0, 1, 1, 2, 2, 1,
       0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 1, 0, 2, 1, 1, 3, 0, 2, 0, 2, 0, 1,
       0, 0, 0, 0, 1, 1, 2, 1, 2, 0, 0, 1, 3, 0, 2, 0, 1, 2, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 2, 1, 2, 1, 0, 1, 0, 1, 2, 0, 0, 2, 2, 2, 1, 0,
       0, 0, 0, 0, 0, 0, 2, 1, 0, 3, 2, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 2,
       0, 1, 0, 1, 1, 1, 0, 2, 1, 0, 1, 0, 1, 1, 2, 1, 1, 3, 0, 3, 0, 2,
       0, 0, 1, 0, 0, 1, 1, 1, 2, 1, 2, 0, 2, 0, 0, 0, 1, 1, 2, 0, 0, 0,
       1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0,

In [56]:
# (rows, columns) of X
X.shape

(400, 8)

In [57]:
# Number of columns of X — the same expression used for D in the network setup
X.shape[1]

8

In [58]:
# Distinct label values present in Y
set(Y)

{0, 1, 2, 3}

In [59]:
# Number of distinct labels in Y — the same expression used for K in the network setup
len(set(Y))

4