In [3]:
import numpy
import scipy.optimize
import random
import collections

In [4]:
def parseData(fname):
  """Lazily yield one parsed record per line of the file `fname`.

  Each line is passed to `eval` and the resulting object is yielded, so the
  file is expected to hold one Python literal expression per line.

  The file is opened in a `with` block so the handle is released as soon as
  the generator is exhausted or closed, instead of leaking until garbage
  collection.
  """
  with open(fname) as source:
    for l in source:
      # SECURITY NOTE(review): eval() executes whatever expression the line
      # contains. Only use this on trusted files; consider
      # ast.literal_eval if the records are plain literals.
      yield eval(l)

In [5]:
# Pull every record produced by parseData into memory so later cells can
# iterate over the reviews more than once.
print ("Reading data...")
data = [record for record in parseData("beer_50000.json")]
print ("done")

Reading data...
done


In [4]:
## Number of reviews per beer style ('beer/style'); records without that field are skipped.
data2 = [review['beer/style'] for review in data if 'beer/style' in review]
count_reviews_per_style = collections.Counter(data2)
for style in count_reviews_per_style:
    # One "<style>: <count>" line per distinct style.
    print (style + ": " + str(count_reviews_per_style[style]))

Hefeweizen: 618
English Strong Ale: 164
Foreign / Export Stout: 55
German Pilsener: 586
American Double / Imperial IPA: 3886
Herbed / Spiced Beer: 73
Oatmeal Stout: 102
American Pale Lager: 123
Rauchbier: 1938
American Pale Ale (APA): 2288
American Porter: 2230
Belgian Strong Dark Ale: 146
Russian Imperial Stout: 2695
American Amber / Red Ale: 665
American Strong Ale: 166
MÃ¤rzen / Oktoberfest: 557
American Adjunct Lager: 242
American Blonde Ale: 357
American IPA: 4113
Fruit / Vegetable Beer: 1355
English Bitter: 267
English Porter: 367
Irish Dry Stout: 101
American Barleywine: 825
American Double / Imperial Stout: 5964
Doppelbock: 873
American Stout: 591
Maibock / Helles Bock: 225
Dortmunder / Export Lager: 31
Euro Strong Lager: 329
Low Alcohol Beer: 7
Light Lager: 503
Euro Pale Lager: 701
Bock: 148
English India Pale Ale (IPA): 175
Altbier: 165
KÃ¶lsch: 94
Pumpkin Ale: 560
Rye Beer: 1798
American Pale Wheat Ale: 154
Milk / Sweet Stout: 69
Schwarzbier: 53
Munich Dunkel Lager: 141
Vien

In [5]:
## Mean 'review/taste' per beer style, over records that have both fields.
data3 = [review for review in data if 'beer/style' in review and 'review/taste' in review]
# Each style maps to a two-slot list: [running taste total, number of reviews].
avg_dict = collections.defaultdict(lambda: [0, 0])
for review in data3:
    totals = avg_dict[review['beer/style']]
    totals[0] += review['review/taste']
    totals[1] += 1
for style, totals in avg_dict.items():
    print (style + ": " + str(totals[0]/totals[1]))

Hefeweizen: 3.635113268608414
English Strong Ale: 3.7560975609756095
Foreign / Export Stout: 3.2545454545454544
German Pilsener: 3.667235494880546
American Double / Imperial IPA: 4.033324755532681
Herbed / Spiced Beer: 3.4452054794520546
Oatmeal Stout: 3.7745098039215685
American Pale Lager: 3.2154471544715446
Rauchbier: 4.067853457172343
American Pale Ale (APA): 3.649694055944056
American Porter: 4.081838565022421
Belgian Strong Dark Ale: 3.6952054794520546
Russian Imperial Stout: 4.300371057513915
American Amber / Red Ale: 3.513533834586466
American Strong Ale: 3.569277108433735
MÃ¤rzen / Oktoberfest: 3.5933572710951527
American Adjunct Lager: 2.9483471074380163
American Blonde Ale: 3.2549019607843137
American IPA: 4.00085096036956
Fruit / Vegetable Beer: 3.607749077490775
English Bitter: 3.5374531835205993
English Porter: 3.70708446866485
Irish Dry Stout: 3.623762376237624
American Barleywine: 4.064242424242424
American Double / Imperial Stout: 4.479963112005366
Doppelbock: 3.982817

In [7]:
## Fit a predictor whose only non-constant feature is a binary "is this an 'American IPA'?" flag

# Keep only the records that actually carry a 'beer/style' field.
data4 = [review for review in data if 'beer/style' in review]

def feature(datum):
  """Return the feature vector [1, flag] for one review.

  The leading 1 is the constant term; flag is 1 when datum['beer/style']
  equals "American IPA" and 0 otherwise.
  """
  is_american_ipa = datum['beer/style'] == "American IPA"
  return [1, 1 if is_american_ipa else 0]

# Design matrix (one feature row per review) and the 'review/overall' targets.
X = list(map(feature, data4))
y = [review['review/overall'] for review in data4]

# Ordinary least squares; theta holds one coefficient per feature column.
theta, residuals, rank, s = numpy.linalg.lstsq(X, y)

print (theta)

[ 3.87517162  0.16458039]


In [7]:
## Train/test split: the first half of the reviews is used for fitting, the remainder for evaluation

halfwaypoint = len(data) // 2
data_half1, data_half2 = data[:halfwaypoint], data[halfwaypoint:]

# Only records that carry a 'beer/style' field are kept in each half.
relevant_data1 = [review for review in data_half1 if 'beer/style' in review]
relevant_data2 = [review for review in data_half2 if 'beer/style' in review]

def feature(datum):
  """Build [constant, is-American-IPA] for a single review.

  The first entry is always 1; the second is 1 if datum['beer/style'] is
  exactly "American IPA", else 0.
  """
  return [1, int(datum['beer/style'] == "American IPA")]

# Fit on the training half only.
X = list(map(feature, relevant_data1))
y = [review['review/overall'] for review in relevant_data1]
theta, residuals, rank, s = numpy.linalg.lstsq(X, y)

# Prediction = theta[0] + theta[1] * (1 if the style is "American IPA" else 0).
is_ipa_train = [int(review['beer/style'] == "American IPA") for review in relevant_data1]
is_ipa_test = [int(review['beer/style'] == "American IPA") for review in relevant_data2]
predicted_results1 = [theta[0] + theta[1]*flag for flag in is_ipa_train]
predicted_results2 = [theta[0] + theta[1]*flag for flag in is_ipa_test]

actual_results1 = [review['review/overall'] for review in relevant_data1]
actual_results2 = [review['review/overall'] for review in relevant_data2]

# Mean squared error on the training half (mse1) and the held-out half (mse2).
train_errors = numpy.asarray(predicted_results1) - numpy.asarray(actual_results1)
test_errors = numpy.asarray(predicted_results2) - numpy.asarray(actual_results2)
mse1 = (train_errors ** 2).mean(axis=None)
mse2 = (test_errors ** 2).mean(axis=None)

print ("mse1: " + str(mse1))
print ("mse2: " + str(mse2))

mse1: 0.528174997341
mse2: 0.453828914945


In [8]:
## Richer model: one binary indicator per beer style that has at least 50 reviews

# Despite the variable name, the threshold is inclusive (count >= 50).
styles_with_over_50_reviews = [
    style for style, n_reviews in count_reviews_per_style.items() if n_reviews >= 50
]
features_size = len(styles_with_over_50_reviews)

def feature(datum):
  """One-hot encode a review's style against styles_with_over_50_reviews.

  Returns a list of length features_size + 1: a leading constant 1 followed
  by one 0/1 entry per listed style, set to 1 where datum['beer/style']
  equals that style.
  """
  indicators = [
    1 if datum['beer/style'] == styles_with_over_50_reviews[k] else 0
    for k in range(features_size)
  ]
  return [1] + indicators

# Training design matrix and 'review/overall' targets (first half of the data).
X = list(map(feature, relevant_data1))
y = [review['review/overall'] for review in relevant_data1]

# Objective
def f(theta, X, y, lam):
  """Regularised mean-squared-error objective.

  Computes ||X.theta - y||^2 / len(X) + lam * ||theta||^2, where the penalty
  covers every component of theta (including the first one).

  Parameters:
    theta: sequence of coefficients, one per column of X.
    X:     2-D sequence of feature rows (a single row is also accepted).
    y:     sequence of targets, one per row of X.
    lam:   weight of the squared-norm penalty.

  Returns:
    The objective value as a plain Python float. The previous implementation
    returned a one-element list because flatten().tolist()[0] on a 1x1
    numpy.matrix yields [value]; an optimiser objective should be a scalar.
  """
  # Plain ndarrays replace numpy.matrix, which NumPy no longer recommends.
  theta = numpy.asarray(theta, dtype=float).ravel()
  X = numpy.atleast_2d(numpy.asarray(X, dtype=float))
  y = numpy.asarray(y, dtype=float).ravel()
  diff = X.dot(theta) - y
  diffSq = diff.dot(diff)
  return float(diffSq / len(X) + lam * theta.dot(theta))

# Derivative
def fprime(theta, X, y, lam):
  """Gradient of ||X.theta - y||^2 / len(X) + lam * ||theta||^2 w.r.t. theta.

  Parameters:
    theta: sequence of coefficients, one per column of X.
    X:     2-D sequence of feature rows (a single row is also accepted).
    y:     sequence of targets, one per row of X.
    lam:   weight of the squared-norm penalty.

  Returns:
    1-D numpy array with one partial derivative per coefficient:
    2 * X^T (X.theta - y) / len(X) + 2 * lam * theta.
  """
  # Plain ndarrays replace numpy.matrix (no longer recommended by NumPy) and
  # avoid the matrix -> nested list -> array round trip.
  theta = numpy.asarray(theta, dtype=float).ravel()
  X = numpy.atleast_2d(numpy.asarray(X, dtype=float))
  y = numpy.asarray(y, dtype=float).ravel()
  diff = X.dot(theta) - y
  return 2 * X.T.dot(diff) / len(X) + 2 * lam * theta

# Minimise f (gradient fprime) with L-BFGS-B from an all-zero start;
# the trailing 0.1 in args is the `lam` argument of f and fprime.
initial_theta = [0]*(features_size + 1)
results = scipy.optimize.fmin_l_bfgs_b(f, initial_theta, fprime, args = (X, y, 0.1))
theta = results[0]
print (theta)

def _style_score(review):
  """Return theta[0] plus the weight of every listed style equal to the review's style."""
  score = theta[0]
  # theta[k+1] is the weight paired with styles_with_over_50_reviews[k].
  for weight, style in zip(theta[1:], styles_with_over_50_reviews):
    if review['beer/style'] == style:
      score += weight
  return score

predicted_results1 = [_style_score(review) for review in relevant_data1]
predicted_results2 = [_style_score(review) for review in relevant_data2]

actual_results1 = [review['review/overall'] for review in relevant_data1]
actual_results2 = [review['review/overall'] for review in relevant_data2]

# Mean squared error on the training half (mse1) and the held-out half (mse2).
train_errors = numpy.asarray(predicted_results1) - numpy.asarray(actual_results1)
test_errors = numpy.asarray(predicted_results2) - numpy.asarray(actual_results2)
mse1 = (train_errors ** 2).mean(axis=None)
mse2 = (test_errors ** 2).mean(axis=None)

print ("mse1: " + str(mse1))
print ("mse2: " + str(mse2))

[  3.29711252e+00   4.68507913e-02   2.22551547e-02  -3.75908085e-05
  -1.04152110e-03   1.82767400e-01  -2.01232092e-03   1.11868210e-02
  -1.67140490e-03   1.51535524e-02   1.37392067e-01   1.45449410e-01
   2.36776345e-02   4.46915973e-01   4.35672738e-02   4.70587462e-03
   9.60381727e-03  -1.34009848e-02  -1.04136020e-03   3.28803841e-01
   1.02648715e-01   2.18749257e-02   4.93741200e-02   9.64583332e-03
   1.31054525e-01   6.07261866e-01  -6.86661010e-03   8.18398314e-02
   1.12633509e-03  -3.37163520e-02  -9.29865708e-02  -3.89319016e-02
  -1.43083284e-02   1.70647127e-02   1.65412272e-02   1.94172798e-02
   6.52025861e-02   3.09825833e-02   4.79238750e-03   3.47693620e-03
   4.46338845e-03  -1.36521553e-03   1.09768681e-02   1.40547960e-02
   1.80537547e-02   1.35968144e-02   2.41534989e-02   3.27247152e-02
   3.51405114e-02   8.64011464e-03   3.25522239e-01   5.85788258e-03
   1.36906693e-01   2.82888910e-02   2.71461649e-02   5.36731118e-02
  -3.68874906e-03   3.07877622e-03