# Multi Layer (Deep) Neural Network

In [1]:
from __future__ import print_function # for python 2 and 3 compatibility

import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Load data

In [2]:
# Read the tab-separated lyrics file and drop every row that has a missing
# value in any column.
all_charts = pd.read_csv('BillboardLyricData.txt', sep='\t', encoding='utf-8')
all_charts = all_charts.dropna()

# Count-vectorize the lyrics: keep at most `num_features` terms, skipping
# English stop words and terms that appear in more than half of the documents.
num_features = 500
vectorizer = CountVectorizer(
    max_df=0.5,
    min_df=0.0,
    max_features=num_features,
    stop_words='english',
)
# toarray() returns a plain 2-D ndarray directly; todense() returns an
# np.matrix that then had to be unwrapped with np.asarray().
X = vectorizer.fit_transform(all_charts.lyrics).toarray().astype(np.float32)

# Encode the chart names as integer class ids (ids follow the sorted order
# of the unique chart names returned by np.unique).
labels = np.unique(all_charts.chart).tolist()
num_labels = len(labels)
class_mapping = {label: idx for idx, label in enumerate(labels)}
y = all_charts.chart.map(class_mapping)

# Split into train (70%) and test (30%). The fixed random_state makes the
# split -- and therefore every number reported below -- reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so no test statistics leak
# into training.
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train)
X_test_std = std_scaler.transform(X_test)

## Pure python implementation
### Adapted from the CS231n case study: http://cs231n.github.io/neural-networks-case-study/

In [13]:
# Train a network with one hidden ReLU layer and a softmax output using
# full-batch gradient descent on the standardized training features.
# After the loop, W/b hold the input->hidden parameters and W2/b2 the
# hidden->output parameters.

K = num_labels # num classes/labels
h = 100 # size of hidden layer
# Take the feature count from the data itself: the vectorizer may produce
# fewer than `num_features` columns when the vocabulary is small.
D = X_train_std.shape[1] # num features

# initialize weights randomly (seeded so that reruns give the same result)
np.random.seed(0)
W = 0.01 * np.random.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,K)
b2 = np.zeros((1,K))

# some hyperparameters
step_size = 1e-0 # learning rate
reg = 1e-3 # regularization strength

# loop-invariant index helpers: one row index per training example and the
# integer class id of each example as a plain ndarray
num_examples = X_train_std.shape[0]
example_idx = np.arange(num_examples)
y_train_idx = np.asarray(y_train)

# gradient descent loop
# (`range` instead of `xrange`, which does not exist on Python 3)
for i in range(500):

    # evaluate class scores, [N x K]
    hidden_layer = np.maximum(0, np.dot(X_train_std, W) + b) # note, ReLU activation
    scores = np.dot(hidden_layer, W2) + b2

    # compute the class probabilities
    # Subtracting the row-wise maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing when a score gets large.
    shifted_scores = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted_scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]

    # compute the loss: average cross-entropy loss and regularization
    corect_logprobs = -np.log(probs[example_idx, y_train_idx])
    data_loss = np.sum(corect_logprobs)/num_examples
    reg_loss = 0.5*reg*np.sum(W*W) + 0.5*reg*np.sum(W2*W2)
    loss = data_loss + reg_loss
    if i % 10 == 0:
        print('iteration {}: loss {}'.format(i, loss))

    # compute the gradient on scores
    # NOTE: dscores aliases probs, so probs is overwritten in place here;
    # it is recomputed from scratch at the top of the next iteration.
    dscores = probs
    dscores[example_idx, y_train_idx] -= 1
    dscores /= num_examples

    # backpropagate the gradient to the parameters
    # first backprop into parameters W2 and b2
    dW2 = np.dot(hidden_layer.T, dscores)
    db2 = np.sum(dscores, axis=0, keepdims=True)
    # next backprop into hidden layer
    dhidden = np.dot(dscores, W2.T)
    # backprop the ReLU non-linearity: no gradient flows through units that
    # were clamped to zero in the forward pass
    dhidden[hidden_layer <= 0] = 0
    # finally into W,b
    dW = np.dot(X_train_std.T, dhidden)
    db = np.sum(dhidden, axis=0, keepdims=True)

    # add regularization gradient contribution
    dW2 += reg * W2
    dW += reg * W

    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db
    W2 += -step_size * dW2
    b2 += -step_size * db2



iteration 0: loss 1.94805310665
iteration 10: loss 1.43314164183
iteration 20: loss 0.651841857816
iteration 30: loss 0.35927634324
iteration 40: loss 0.236928803375
iteration 50: loss 0.187884594259
iteration 60: loss 0.168370936537
iteration 70: loss 0.158399948688
iteration 80: loss 0.153905322709
iteration 90: loss 0.149424519041
iteration 100: loss 0.149823747186
iteration 110: loss 0.143248722543
iteration 120: loss 0.141453195741
iteration 130: loss 0.141021974114
iteration 140: loss 0.139124011079
iteration 150: loss 0.137521723827
iteration 160: loss 0.136973789946
iteration 170: loss 0.135724038279
iteration 180: loss 0.134972371029
iteration 190: loss 0.134330789005
iteration 200: loss 0.134503549468
iteration 210: loss 0.134241714548
iteration 220: loss 0.133845645424
iteration 230: loss 0.133055274863
iteration 240: loss 0.134202263066
iteration 250: loss 0.134877750424
iteration 260: loss 0.132074462909
iteration 270: loss 0.131751540127
iteration 280: loss 0.132559813672