In [2]:
import numpy
import urllib
import scipy.optimize
import random
from sklearn import svm
from math import exp
from math import log
from sklearn.linear_model import LogisticRegression

In [3]:
def parseData(fname):
  """Lazily yield one parsed record per non-blank line of `fname`.

  Every non-blank line is evaluated with eval, so each line is expected to
  hold a Python expression (for example a dict literal).

  Args:
    fname: path of the text file to read.

  Yields:
    The object produced by evaluating each non-blank line, in file order.
  """
  # The context manager closes the handle when the generator finishes or is
  # closed; a bare open() in the for-header would leave that to the garbage
  # collector.
  with open(fname) as source:
    for l in source:
      # A blank line (e.g. a trailing empty line) is not a record, and eval
      # would raise SyntaxError on it.
      if not l.strip():
        continue
      # SECURITY: eval executes arbitrary code found in the file. Only use
      # this on trusted data; ast.literal_eval is the safe alternative when
      # every line is a plain literal.
      yield eval(l)

In [4]:
# Load every record up front so that later cells can filter and slice the list
print ("Reading data...")
data = [record for record in parseData("beer_50000.json")]
print ("done")

Reading data...
done


In [5]:
# Goal: predict whether a beer's style is 'American IPA' from 'beer/ABV' and 'review/taste'.
# Only records that carry every field used below are kept.
data2 = [rec for rec in data
         if all(field in rec for field in ('beer/ABV', 'review/taste', 'beer/style'))]

# Feature rows are [ABV, taste]; the label is 1 when the style string contains "American IPA"
X = [[rec['beer/ABV'], rec['review/taste']] for rec in data2]
y = [int("American IPA" in rec['beer/style']) for rec in data2]

# First half of the records is used for training, second half for testing
halfway = len(data2) // 2

X_train, X_test = X[:halfway], X[halfway:]
y_train, y_test = y[:halfway], y[halfway:]

# Support vector classifier with a sigmoid kernel and regularization parameter C = 1000
clf = svm.SVC(C=1000, kernel='sigmoid')
clf.fit(X_train, y_train)

print("training is done!")

training is done!


In [5]:
# Predict labels for both halves with the classifier held in `clf`
train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)

# Training accuracy: share of training rows whose predicted label equals the true label
num_correct = sum(1 for idx in range(len(X_train)) if train_predictions[idx] == y_train[idx])
print ("accuracy on training set: " + str(num_correct/len(X_train)))

# Test accuracy, computed the same way on the second half
num_correct = sum(1 for idx in range(len(X_test)) if test_predictions[idx] == y_test[idx])
print ("accuracy on test set: " + str(num_correct/len(X_test)))

accuracy on training set: 0.85312
accuracy on test set: 0.87344


In [6]:
# finding better features for predicting 'American IPA':
# this run uses 'review/palate' in place of 'review/taste'
data3 = [d for d in data if 'beer/ABV' in d and 'review/palate' in d and 'beer/style' in d]

# Feature rows are [1, ABV, palate]; the label is 1 when the style string contains "American IPA"
X = [[1,d['beer/ABV'],d['review/palate']] for d in data3]
y = [int("American IPA" in d['beer/style']) for d in data3]

# The split point must come from data3, the list X and y are built from.
# Reusing len(data2) would misplace the train/test boundary whenever the two
# filtered lists differ in length, and would fail outright if data2 was never
# defined in this kernel session.
halfway = len(data3) // 2

X_train = X[:halfway]
y_train = y[:halfway]

X_test = X[halfway:]
y_test = y[halfway:]

# Create a support vector classifier object, with regularization parameter C = 1000
clf = svm.SVC(C=1000, kernel='sigmoid')
clf.fit(X_train, y_train)

print("training is done!")

training is done!


In [7]:
# Label both halves with the classifier held in `clf`
train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)

# Count the training rows whose prediction equals the true label
num_correct = 0
for idx in range(len(X_train)):
  num_correct += 1 if train_predictions[idx] == y_train[idx] else 0
print ("accuracy on training set: " + str(num_correct/len(X_train)))

# Same count for the test rows
num_correct = 0
for idx in range(len(X_test)):
  num_correct += 1 if test_predictions[idx] == y_test[idx] else 0
print ("accuracy on test set: " + str(num_correct/len(X_test)))

accuracy on training set: 0.83884
accuracy on test set: 0.8758


In [8]:
# Regularization sweep: refit the sigmoid-kernel SVM with a different C on the
# current X_train / y_train and report accuracy on both halves. This cell uses C = 0.1.
clf = svm.SVC(C=0.1, kernel='sigmoid')
clf.fit(X_train, y_train)

print ("Using C = 0.1")

train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)

# A row counts as a hit when the predicted label equals the true label
num_correct = len([idx for idx in range(len(X_train)) if train_predictions[idx] == y_train[idx]])
print ("accuracy on training set: " + str(num_correct/len(X_train)))

num_correct = len([idx for idx in range(len(X_test)) if test_predictions[idx] == y_test[idx]])
print ("accuracy on test set: " + str(num_correct/len(X_test)))

Using C = 0.1
accuracy on training set: 0.91352
accuracy on test set: 0.92184


In [9]:
# Refit the sigmoid-kernel SVM with C = 10 on the current training split
clf = svm.SVC(C=10, kernel='sigmoid')
clf.fit(X_train, y_train)

print ("Using C = 10")

train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)

# Accuracy is the number of rows whose predicted label equals the true label,
# divided by the number of rows in that split
num_correct = sum(int(train_predictions[idx] == y_train[idx]) for idx in range(len(X_train)))
print ("accuracy on training set: " + str(num_correct/len(X_train)))

num_correct = sum(int(test_predictions[idx] == y_test[idx]) for idx in range(len(X_test)))
print ("accuracy on test set: " + str(num_correct/len(X_test)))

Using C = 10
accuracy on training set: 0.90752
accuracy on test set: 0.92004


In [10]:
# Refit the sigmoid-kernel SVM with C = 10000 on the current training split
clf = svm.SVC(C=10000, kernel='sigmoid')
clf.fit(X_train, y_train)

# The label is read back from the estimator so it always reports the value the
# model was actually built with (the hard-coded label used to say 100000
# while the model used 10000).
print ("Using C = " + str(clf.C))

train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)

# Accuracy is the share of rows whose predicted label equals the true label
num_correct = 0
for i in range (0, len(X_train)):
  if train_predictions[i] == y_train[i]:
    num_correct += 1
print ("accuracy on training set: " + str(num_correct/len(X_train)))

num_correct = 0
for i in range (0, len(X_test)):
  if test_predictions[i] == y_test[i]:
    num_correct += 1
print ("accuracy on test set: " + str(num_correct/len(X_test)))

Using C = 100000
accuracy on training set: 0.83108
accuracy on test set: 0.86116


In [None]:
def inner(x,y):
  """Return the sum of x[i]*y[i] over every index i of x (0 for an empty x)."""
  total = 0
  for idx in range(len(x)):
    total = total + x[idx]*y[idx]
  return total

def sigmoid(x):
  """Logistic function 1 / (1 + e^-x), evaluated without overflow.

  Args:
    x: a real number.

  Returns:
    A float in [0, 1].
  """
  if x >= 0:
    # exp(-x) is at most 1 here, so this form cannot overflow
    return 1.0 / (1 + exp(-x))
  # For negative x, exp(-x) raises OverflowError once x drops below roughly -709.
  # The algebraically equal form e^x / (1 + e^x) only ever underflows to 0.
  z = exp(x)
  return z / (1 + z)

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  """Negative log-likelihood of a logistic model plus a squared-theta penalty.

  Args:
    theta: parameter vector, one entry per feature.
    X: sequence of feature rows, each as long as theta.
    y: sequence of labels; a falsy label is treated as the negative class.
    lam: weight of the penalty lam * sum(theta[k]^2).

  Returns:
    The negated penalized log-likelihood (the quantity to minimize).
  """
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    # Subtract log(1 + e^-logit). For very negative logits exp(-logit) raises
    # OverflowError, so that case uses the equal form -logit + log(1 + e^logit).
    if logit >= 0:
      loglikelihood -= log(1 + exp(-logit))
    else:
      loglikelihood -= -logit + log(1 + exp(logit))
    if not y[i]:
      # Negative examples contribute log(1 - sigmoid(logit)), which is the
      # positive-class term minus the logit.
      loglikelihood -= logit
  # Penalty on every coefficient (all entries of theta are penalized)
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  """Gradient of the penalized negative log-likelihood with respect to theta.

  Includes the derivative of the lam * theta[k]^2 penalty, so that the
  gradient matches an objective that carries that penalty.

  Args:
    theta: parameter vector, one entry per feature.
    X: sequence of feature rows, each as long as theta.
    y: sequence of numeric labels (subtracted from the predicted probability).
    lam: weight of the squared-theta penalty.

  Returns:
    numpy array with one partial derivative per entry of theta.
  """
  dl = [0.0]*len(theta)
  for i in range(len(X)):
    # The residual is the same for every coordinate, so compute it once per row
    residual = sigmoid(inner(theta,X[i])) - y[i]
    for j in range (len(theta)):
      dl[j] -= residual*X[i][j]
  # Derivative of the penalty term -lam * theta[k]^2 in the log-likelihood
  for k in range(len(theta)):
    dl[k] -= 2*lam*theta[k]
  # dl holds the gradient of the penalized log-likelihood; negate it to get the
  # gradient of the negative log-likelihood.
  return numpy.array([-x for x in dl])


# Feature rows are [1, ABV, taste]; the leading 1 gives theta[0] the role of a bias term.
# The label is 1 when the style string contains "American IPA".
X = [[1,d['beer/ABV'],d['review/taste']] for d in data2]
y = [int("American IPA" in d['beer/style']) for d in data2]

halfway = len(data2) // 2

# Slice the labels together with the features. Without this, y_train would be
# whatever an earlier cell left in the kernel (labels built from a different
# filtered list), or undefined on a fresh run.
X_train = X[:halfway]
y_train = y[:halfway]
X_test = X[halfway:]
y_test = y[halfway:]

# Use a library function to run gradient descent (or you can implement yourself!)
# f is the objective, fprime its gradient; the last entry of args is the penalty weight lam.
theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
print ("Final log likelihood =", -l)

print (theta)

num_correct = 0

# A row is counted as correct when the predicted probability is on the same side of 0.5 as its label.
# Note: this is measured over all of X (training and test halves together).
for i in range (0,len(X)):
  if abs(sigmoid(inner(theta,X[i])) - y[i]) < 0.5:
    num_correct += 1

print ("Accuracy = " + str(num_correct/len(X))) # Compute the accuracy

In [12]:
# comparing with built in logistic regression function
logistic = LogisticRegression()
test = logistic.fit(X,y)
print (test.coef_)

theta = test.coef_[0]
# LogisticRegression fits a separate intercept by default (fit_intercept=True),
# stored in intercept_ and not in coef_. It must be added to the linear score,
# otherwise the accuracy below is not that of the fitted model.
intercept = test.intercept_[0]

num_correct = 0

# A row is counted as correct when the predicted probability is on the same side of 0.5 as its label
for i in range (0,len(X)):
  if abs(sigmoid(inner(theta,X[i]) + intercept) - y[i]) < 0.5:
    num_correct += 1

print ("Accuracy = " + str(num_correct/len(X))) # Compute the accuracy

[[-1.31049343 -0.21663721  0.43895716]]
Accuracy = 0.91752
