In [1]:
import itertools
import random
import string
import urllib
from collections import defaultdict

import nltk
import numpy
import scipy.optimize
from nltk.stem.porter import *
from sklearn import linear_model

def parseData(fname):
  """Lazily yield one parsed record per non-blank line of `fname`.

  `fname` is passed straight to `urllib.urlopen`. Each non-blank line is
  evaluated as a Python expression and the resulting object is yielded.

  SECURITY NOTE(review): `eval` executes whatever code the remote file
  contains, so only point this at a trusted source. If the lines are plain
  literals, `ast.literal_eval` would be the safer parser -- confirm and switch.
  """
  handle = urllib.urlopen(fname)
  try:
    for l in handle:
      # A blank (e.g. trailing) line is not a record and would make eval raise.
      if l.strip():
        yield eval(l)
  finally:
    # Runs on exhaustion, on error, and when the generator is closed early,
    # so the underlying connection is not left open.
    handle.close()

In [113]:
### Just the first 5000 reviews
# islice stops pulling records from the parseData generator after 5000, so
# the remainder of the file is never parsed (the previous version built the
# full list first and only then sliced it).

print("Reading data...")
data = list(itertools.islice(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"), 5000))
print("done")

Reading data...
done


In [114]:
###  task 1

In [149]:
# Count every adjacent word pair (bigram) over all reviews
punctuation = set(string.punctuation)
bigramCount = defaultdict(int)
for review in data:
    # Lower-case the text and drop punctuation characters before tokenizing
    cleaned = ''.join(ch for ch in review['review/text'].lower() if ch not in punctuation)
    tokens = cleaned.split()
    # Pair each token with its successor
    for first, second in zip(tokens, tokens[1:]):
        bigramCount[first + ' ' + second] += 1

In [151]:
# Rank bigrams by descending count, report how many distinct bigrams exist,
# then show the five most frequent ones
counts = list(bigramCount.items())
counts.sort(key=lambda pair: pair[1], reverse=True)
print(len(counts))
for rank in range(5):
    print(counts[rank])

182246
('with a', 4587)
('in the', 2595)
('of the', 2245)
('is a', 2056)
('on the', 2033)


In [152]:
###  task 2

In [153]:
# Vocabulary: the 1000 highest-count entries, each mapped to a feature column
words = [entry[0] for entry in counts[:1000]]
wordId = {ngram: column for column, ngram in enumerate(words)}
wordSet = set(words)

In [154]:
def feature(datum):
    """Build the bigram-count feature vector for one review.

    datum: dict whose 'review/text' entry holds the review text.

    Returns a list of len(words) + 1 numbers: position wordId[b] holds how
    many times vocabulary bigram b occurs in the lower-cased,
    punctuation-stripped text, and the final element is a constant 1 that
    acts as the offset term.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    textList = r.split()
    for first, second in zip(textList, textList[1:]):
        # Dict lookup is O(1); testing membership in the `words` list would
        # scan the whole vocabulary for every bigram of every review.
        column = wordId.get(first + ' ' + second)
        if column is not None:
            feat[column] += 1
    feat.append(1) #offset
    return feat

In [155]:
# One feature row per review, and the overall rating as the regression target
X = list(map(feature, data))
y = [review['review/overall'] for review in data]

# Ridge regression with penalty strength 1.0. The intercept is disabled
# because feature() already appends a constant 1 as the offset column.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [156]:
# Mean squared error of the predictions on the same reviews used for fitting
squaredErrorSum = 0
for actual, predicted in zip(y, predictions):
    squaredErrorSum += (actual - predicted) ** 2
MSE = squaredErrorSum / len(y)
print(MSE)

0.343312798776


In [132]:
### task 3

In [133]:
# mix : unigram + bigram
# Counts every single word and every adjacent word pair in one table.
mixCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    textList = r.split()
    # Iterating over the tokens directly (instead of indexing the last one)
    # makes reviews that are empty, or that contain only whitespace or
    # punctuation, contribute nothing rather than raising IndexError.
    for w in textList:
        mixCount[w] += 1
    for first, second in zip(textList, textList[1:]):
        mixCount[first + " " + second] += 1

In [134]:
# Order the unigram/bigram table from most to least frequent and show the top five
counts = list(mixCount.items())
counts.sort(key=lambda entry: entry[1], reverse=True)
for rank in range(5):
    print(counts[rank])

('a', 30695)
('the', 27569)
('and', 19512)
('of', 15935)
('is', 12623)


In [135]:
# Keep the 1000 highest-count entries as the vocabulary and give each one a column index
words = [entry[0] for entry in counts[:1000]]
wordId = {}
for column, ngram in enumerate(words):
    wordId[ngram] = column
wordSet = set(words)

In [136]:
def feature(datum):
    """Build the unigram + bigram count feature vector for one review.

    datum: dict whose 'review/text' entry holds the review text.

    Returns a list of len(words) + 1 numbers: position wordId[t] holds how
    many times vocabulary entry t (a single word or an adjacent word pair)
    occurs in the lower-cased, punctuation-stripped text, and the final
    element is a constant 1 that acts as the offset term.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    textList = r.split()
    # Looping over the tokens (rather than indexing the last one explicitly)
    # means text that is empty or only whitespace/punctuation yields an
    # all-zero vector instead of raising IndexError.
    for wU in textList:
        # Dict lookup is O(1); `wU in words` would scan the vocabulary list.
        column = wordId.get(wU)
        if column is not None:
            feat[column] += 1
    for first, second in zip(textList, textList[1:]):
        column = wordId.get(first + " " + second)
        if column is not None:
            feat[column] += 1
    feat.append(1) #offset
    return feat

In [137]:
# Assemble the design matrix and the rating targets review by review
X = []
y = []
for d in data:
    X.append(feature(d))
    y.append(d['review/overall'])

# Regularized (ridge) fit; no separate intercept, since feature() supplies
# a constant 1 column as the offset.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [138]:
# Mean squared error between the ratings and the model's predictions
errorTotal = 0
for index, predicted in enumerate(predictions):
    residual = y[index] - predicted
    errorTotal += residual ** 2
MSE = errorTotal / len(y)
print(MSE)

0.289393697641


In [148]:
### task 4
# theta holds one weight per vocabulary entry followed by the offset weight
# (feature() appends the constant 1 last). Slicing by len(words) rather than
# a hard-coded 1000 keeps the offset out of the ranking even when the
# vocabulary has fewer entries.
vocabSize = len(words)
# Ascending by weight; ties are broken by column index.
mostWords = sorted(zip(theta[:vocabSize], range(vocabSize)))
negatives = [words[column] for weight, column in mostWords[:5]]
print("%s %s" % ("negatives: ", negatives))
mostWords.reverse()
positives = [words[column] for weight, column in mostWords[:5]]
print("%s %s" % ("positives: ", positives))

negatives:  ['sort of', 'water', 'corn', 'the background', 'straw']
positives:  ['sort', 'a bad', 'of these', 'not bad', 'the best']


In [157]:
### task 5
# idf and tf-idf for the following words:
# ‘foam’, ‘smell’, ‘banana’, ‘lactic’, and ‘tart’

In [160]:
# Words examined in task 5
wordList = [
    'foam',
    'smell',
    'banana',
    'lactic',
    'tart',
]

In [None]:
# tf
def tf(term, datum=None):
    """Count how many times `term` occurs in a review's text.

    term:  word to count. It is compared against lower-cased,
           punctuation-stripped tokens, so pass it in lower case.
    datum: review dict with a 'review/text' entry. When omitted, the first
           review (data[0]) is used, which was the previous fixed behavior.

    Returns the number of whitespace-separated tokens equal to `term`.
    """
    if datum is None:
        datum = data[0]
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    return r.split().count(term)