<a href="https://colab.research.google.com/github/Tomyao/CSE258/blob/master/Homework_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import itertools
import random
import string
import urllib
from collections import defaultdict

import nltk
import numpy
import scipy.optimize
from nltk.stem.porter import *
from sklearn import linear_model

def parseData(fname):
  """Lazily yield one parsed record per line of the resource at `fname`.

  Args:
    fname: location handed to `urllib.urlopen` (a URL in this notebook).

  Yields:
    The value obtained by evaluating each line as a Python expression.
  """
  stream = urllib.urlopen(fname)
  try:
    for l in stream:
      # NOTE(review): eval() executes whatever code the remote file contains.
      # Presumably every line is a plain dict literal; if so,
      # ast.literal_eval would be a safe drop-in replacement -- confirm.
      yield eval(l)
  finally:
    # Release the handle even when parsing fails or the consumer stops early
    stream.close()

In [0]:
### Just the first 5000 reviews

print("Reading data...")
# islice stops pulling from the generator after 5000 records, so the rest of
# the file is never parsed (the original built the full list, then sliced it).
data = list(itertools.islice(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"), 5000))
print("done")

Reading data...
done


In [0]:
### Ignore capitalization and remove punctuation

punctuation = set(string.punctuation)

# One token list per review, in the same order as `data`
myStrings = []
for review in data:
  # Lower-case the text and drop every punctuation character
  cleaned = ''.join(ch for ch in review['review/text'].lower() if ch not in punctuation)
  # Tokens are the whitespace-separated pieces of the cleaned text
  myStrings.append(cleaned.split())

In [0]:
### Problem 1

# Get unique bigrams
# Maps (word, next_word) -> number of occurrences across all reviews
bigrams = {}

for tokens in myStrings:
  # Pair every token with its successor; a review with fewer than two
  # tokens produces no pairs
  for bigram in zip(tokens, tokens[1:]):
    bigrams[bigram] = bigrams.get(bigram, 0) + 1

In [0]:
# Count how many unique bigrams
print(len(bigrams))

# Show 5 most common bigrams
import operator
# Ascending by count, so the most frequent bigrams sit at the end of the list
sorted_bigrams = sorted(bigrams.items(), key=operator.itemgetter(1))
# A slice cannot produce the negative, wrapping indices that
# range(len - 5, len) yields when fewer than 5 bigrams exist
for entry in sorted_bigrams[-5:]:
  print(entry)

182246
(('on', 'the'), 2033)
(('is', 'a'), 2056)
(('of', 'the'), 2245)
(('in', 'the'), 2595)
(('with', 'a'), 4587)


In [0]:
### Problem 2

# Get 1000 most common bigrams
# sorted_bigrams is ascending by count, so the tail holds the most frequent
# entries. Slicing keeps each bigram at most once even when fewer than 1000
# exist (range(len - 1000, len) would wrap around and repeat entries).
common_bigrams = [bigram for (bigram, count) in sorted_bigrams[-1000:]]

In [0]:
# Build bigram counts for data

# bigram_counts[k] maps each bigram of review k to its count in that review
bigram_counts = []
for tokens in myStrings:
  per_review = {}
  # Adjacent token pairs; nothing is produced for reviews under two tokens
  for bigram in zip(tokens, tokens[1:]):
    per_review[bigram] = per_review.get(bigram, 0) + 1
  bigram_counts.append(per_review)

In [0]:
# Build an index into data
# Each review remembers its own position so the per-review count tables
# can be looked up from the review record alone
for index, d in enumerate(data):
  d['index'] = index

In [0]:
# Define features
def feature(datum):
  """Bigram-count feature vector for one review.

  Args:
    datum: review record; its 'index' entry selects the review's table in
      `bigram_counts`.

  Returns:
    A list with the review's count for each entry of `common_bigrams`
    (0 when absent), followed by a constant 1 offset term.
  """
  feat = [bigram_counts[datum['index']].get(bigram, 0) for bigram in common_bigrams]
  feat.append(1) #offset
  return feat

In [0]:
# Define error measure (MSE in this case)
def error_measure(predicted, actual):
  """Mean squared error between two equally indexed sequences.

  Args:
    predicted: predicted values, indexable by position.
    actual: true values; its length decides how many positions are compared.

  Returns:
    The sum of squared differences divided by len(actual).
  """
  count = len(actual)
  squared_errors = ((predicted[i] - actual[i])**2 for i in range(count))
  # Float start value keeps the accumulation (and the division) in floats
  return sum(squared_errors, 0.0) / count

In [0]:
# Feature matrix and rating targets, one row per review
X = list(map(feature, data))
y = [review['review/overall'] for review in data]

#With regularization
# No separate intercept: feature() already appends a constant offset column
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X, y)
predictions = clf.predict(X)

# Error on the same data the model was fitted on
print(error_measure(predictions, y))

0.342590585298


In [0]:
### Problem 3

# Get unique unigrams
# Maps word -> number of occurrences across all reviews
unigrams = {}

for tokens in myStrings:
  for word in tokens:
    unigrams[word] = unigrams.get(word, 0) + 1

In [0]:
# Get 1000 most common unigrams/bigrams
sorted_unigrams = sorted(unigrams.items(), key=operator.itemgetter(1))

# Entries are ['unigram', word] or ['bigram', (word, next_word)]
common_grams = []
unigram_index = len(sorted_unigrams) - 1
bigram_index = len(sorted_bigrams) - 1

# Both lists are ascending by count, so walk them from the tail and always
# take the more frequent candidate. Stop when both lists are used up: an
# index must never go negative, because Python would wrap it around to the
# end of the list and the same n-gram would be selected again.
while len(common_grams) < 1000 and (unigram_index >= 0 or bigram_index >= 0):
  if bigram_index < 0:
    take_unigram = True
  elif unigram_index < 0:
    take_unigram = False
  else:
    # On equal counts the unigram wins
    take_unigram = sorted_unigrams[unigram_index][1] >= sorted_bigrams[bigram_index][1]

  if take_unigram:
    common_grams.append(['unigram', sorted_unigrams[unigram_index][0]])
    unigram_index -= 1
  else:
    common_grams.append(['bigram', sorted_bigrams[bigram_index][0]])
    bigram_index -= 1

In [0]:
# Build unigram counts for data

# unigram_counts[k] maps each word of review k to its count in that review
unigram_counts = []
for tokens in myStrings:
  per_review = {}
  for word in tokens:
    per_review[word] = per_review.get(word, 0) + 1
  unigram_counts.append(per_review)

In [0]:
# Define features
def feature(datum):
  """Mixed unigram/bigram count feature vector for one review.

  Args:
    datum: review record; its 'index' entry selects the review's tables in
      `unigram_counts` and `bigram_counts`.

  Returns:
    A list with the review's count for each entry of `common_grams`
    (0 when absent), followed by a constant 1 offset term.
  """
  feat = []
  for kind, gram in common_grams:
    # Anything not tagged 'unigram' is looked up in the bigram tables
    tables = unigram_counts if kind == 'unigram' else bigram_counts
    feat.append(tables[datum['index']].get(gram, 0))
  feat.append(1) #offset
  return feat

In [0]:
# Rebuild the design matrix with the unigram/bigram features
X = list(map(feature, data))
y = [review['review/overall'] for review in data]

#With regularization
# The offset is a column of X, so the model fits no intercept of its own
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X, y)
predictions = clf.predict(X)

# Error on the data used for fitting
print(error_measure(predictions, y))

0.289047333034


In [0]:
### Problem 4

# Add indices to theta
# The last entry of clf.coef_ is the weight of the constant offset that
# feature() appends, not the weight of an n-gram. Drop it explicitly instead
# of assuming it sorts to the very end: every remaining index is then a valid
# position in common_grams and no n-gram weight is skipped by accident.
theta = [[index, coef] for index, coef in enumerate(clf.coef_[:len(common_grams)])]

# Sort by coefficient
theta = sorted(theta, key=lambda x: x[1])

# Look at most positive weights
# (the five largest, printed from smallest to largest)
for index, coef in theta[-5:]:
  print("Unigram/Bigram: " + str(common_grams[index][1]))
  print("Coefficient: " + str(coef))

# Separate for readability
print("")

# Look at most negative weights
# (the five smallest, most negative first)
for index, coef in theta[:5]:
  print("Unigram/Bigram: " + str(common_grams[index][1]))
  print("Coefficient: " + str(coef))

Unigram/Bigram: ('the', 'best')
Coefficient: 0.206391095672
Unigram/Bigram: ('not', 'bad')
Coefficient: 0.216877216307
Unigram/Bigram: ('of', 'these')
Coefficient: 0.222834704241
Unigram/Bigram: ('a', 'bad')
Coefficient: 0.228819714269
Unigram/Bigram: sort
Coefficient: 0.519827801204

Unigram/Bigram: ('sort', 'of')
Coefficient: -0.639762149718
Unigram/Bigram: water
Coefficient: -0.27048649883
Unigram/Bigram: corn
Coefficient: -0.237031014604
Unigram/Bigram: ('the', 'background')
Coefficient: -0.216248299595
Unigram/Bigram: straw
Coefficient: -0.195937721779


In [0]:
### Problem 5

import math

# Build idf scores for all unigrams
# Document frequency: in how many reviews each unigram occurs. Stored as
# floats so the ratio below is a true division.
doc_freq = dict.fromkeys(unigrams, 0.0)
for review_counts in unigram_counts:
  for term in review_counts:
    doc_freq[term] += 1

# idf = log10(number of reviews / document frequency)
idf_scores = {}
for term, frequency in doc_freq.items():
  idf_scores[term] = math.log10(len(unigram_counts) / frequency)

# Print idf for 'foam', 'smell', 'banana', 'lactic', and 'tart'
mywords = ['foam','smell','banana','lactic','tart']

for word in mywords:
  print("idf for " + word + ": " + str(idf_scores[word]))

idf for foam: 1.13786862069
idf for smell: 0.537901618865
idf for banana: 1.67778070527
idf for lactic: 2.92081875395
idf for tart: 1.80687540165


In [0]:
# Build tf-idf scores for all data
# tf_idf[k] maps each word of review k to (count in review k) * idf
tf_idf = [
  {term: count * idf_scores[term] for term, count in review_counts.items()}
  for review_counts in unigram_counts
]

# Print tf-idf for 'foam', 'smell', 'banana', 'lactic', and 'tart' in first review
for word in mywords:
  # A word missing from the first review is reported as 0
  print("tf-idf for " + word + ": " + str(tf_idf[0].get(word, 0)))

tf-idf for foam: 2.27573724137
tf-idf for smell: 0.537901618865
tf-idf for banana: 3.35556141053
tf-idf for lactic: 5.8416375079
tf-idf for tart: 1.80687540165


In [0]:
### Problem 6

# Build index mapping for unigrams
# Each unigram gets its own position, numbered in dictionary iteration order
unigramtoindex = {word: position for position, word in enumerate(unigrams)}

In [0]:
# Get representation for first review
# Dense vector over the whole vocabulary; words absent from the review stay 0
my_rep1 = [0]*len(unigrams)
for term, score in tf_idf[0].items():
  my_rep1[unigramtoindex[term]] = score

# Get representation for second review
my_rep2 = [0]*len(unigrams)
for term, score in tf_idf[1].items():
  my_rep2[unigramtoindex[term]] = score

In [0]:
# Calculate their cosine similarity

from scipy import spatial

# scipy returns the cosine *distance*; the similarity is its complement
cosine_similarity = 1 - spatial.distance.cosine(my_rep1, my_rep2)
print(cosine_similarity)

0.0658819397474


In [0]:
### Problem 7
# Best similarity to the first review seen so far, and which review gave it
highest_cosine = 0
highest_index = 0

# Start at 1 so the first review is not compared with itself
for i in range(1, len(tf_idf)):
  # Dense tf-idf vector of review i over the whole vocabulary
  candidate = [0]*len(unigrams)
  for term, score in tf_idf[i].items():
    candidate[unigramtoindex[term]] = score

  similarity = 1 - spatial.distance.cosine(my_rep1, candidate)
  # Strict comparison: on a tie the earlier review is kept
  if similarity > highest_cosine:
    highest_cosine, highest_index = similarity, i

print(highest_cosine)
print(data[highest_index]['review/text'])

0.29686795375
750mL bottle thanks to Chris@Slowbeer. Poured into a Lost Abbey stemmed tulip.		Golden orange, close to translucent (on the first pour at least), capped by a sizable white, typically Belgian-looking head. Good lacing.		Quite strong lactic notes and a sharp organic funk. Pungent stuff. Underneath is bitter citrus pith, floral spice and a hint of sweet esters. In your face with a lot going on. Only issue is the lactic character verges on turning my stomach.		More citric sourness and a bit less lactic character. Grapefruit and lemon rind are prominent, as is the Nelson Sauvin vegetative character, which kind of adheres to the yeast and barnyard funk. Tropical melons and honey provide some sweetness. Decent peppery tang.		Medium, lightly syrupy body with lowish carbonation and a moderately tart, dry finish that has some length to it.		Incomparable to anything I've tried. The Sauvin hops with the Saison yeast is a masterful combination, however there's no shortage of rough edg

In [0]:
### Problem 8

# Get 1000 most common unigrams
# sorted_unigrams is ascending by count, so the tail holds the most frequent
# words. Slicing keeps each word at most once even when fewer than 1000
# exist (range(len - 1000, len) would wrap around and repeat entries).
common_unigrams = [word for (word, count) in sorted_unigrams[-1000:]]

# Create mapping from 1000 most common unigrams to index
commonunitoindex = {}
for index, word in enumerate(common_unigrams):
  commonunitoindex[word] = index

In [0]:
# Define features
def feature(datum):
  """tf-idf feature vector for one review over the most common unigrams.

  Args:
    datum: review record; its 'index' entry selects the review's table in
      `tf_idf`.

  Returns:
    A list of len(common_unigrams) tf-idf scores, placed at the positions
    given by `commonunitoindex` (0 for words the review does not contain),
    followed by a constant 1 offset term.
  """
  feat = [0]*len(common_unigrams)

  scores = tf_idf[datum['index']]
  for word in scores:
    # commonunitoindex is keyed by exactly the words of common_unigrams, so
    # this constant-time dict test replaces a scan of the whole list
    if word in commonunitoindex:
      feat[commonunitoindex[word]] = scores[word]

  feat.append(1) #offset
  return feat

In [0]:
# Rebuild the design matrix with the tf-idf features
X = list(map(feature, data))
y = [review['review/overall'] for review in data]

#With regularization
# feature() supplies the offset column, hence no fitted intercept
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X, y)
predictions = clf.predict(X)

# Error on the data used for fitting
print(error_measure(predictions, y))

0.278648735967
