## CSE 258, Fall 2018: Homework 4

In [1]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model

In [2]:
def parseDataFromFile(fname):
  for l in open(fname):
    yield eval(l)

In [3]:
data = list(parseDataFromFile("beer_50000.json"))[:5000]

#### Tasks
#### Using the code provided on the webpage, read the first 5000 reviews from the corpus, and read the reviews without capitalization or punctuation.

#### 1. How many unique bigrams are there amongst all of the reviews? List the 5 most-frequently-occurring bigrams along with their number of occurrences in the corpus (1 mark).

In [4]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
  r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
  k = r.split()
  for i in range(1,len(k)):
    wordCount[k[i-1]+' '+k[i]] += 1
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts]

In [5]:
len(numpy.unique(words))

182246

In [6]:
counts[:5]

[(4587, 'with a'),
 (2595, 'in the'),
 (2245, 'of the'),
 (2056, 'is a'),
 (2033, 'on the')]

#### 2. The code provided performs least squares using the 1000 most common unigrams. Adapt it to use the 1000 most common bigrams and report the MSE obtained using the new predictor (use bigrams only, i.e., not unigrams+bigrams) (1 mark). Note that the code performs regularized regression with a regularization parameter of 1.0.

In [7]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
  r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
  k = r.split()
  for i in range(1,len(k)):
    wordCount[k[i-1]+' '+k[i]] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

In [8]:
### Sentiment analysis

wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature(datum):
  feat = [0]*len(words)
  r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
  k = r.split()
  for i in range(1,len(k)):
    if (k[i-1]+' '+k[i]) in words:
      feat[wordId[k[i-1]+' '+k[i]]] += 1
  feat.append(1) #offset
  return feat

X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]

In [9]:
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [10]:
from sklearn.metrics import mean_squared_error

In [11]:
mean_squared_error(y,predictions)

0.34315301406136334

#### 3. What is the inverse document frequency of the words 'foam', 'smell', 'banana', 'lactic', and 'tart'? What are their tf-idf scores in the first review (using log base 10) (1 mark)?

In [12]:
import math
import collections

In [13]:
def tf(word, r):
    count = 0
    k = r.split()
    m = collections.Counter(k)
    count = m[word]
    return count

In [14]:
def n_containing(word, bloblist):
    count = 0
    for l in bloblist:
        if word in l.split():
            count+=1
    return count

In [15]:
def idf(word, bloblist):
    if n_containing(word,bloblist) == 0:
        return 0
    else:
        return math.log10(len(bloblist) / n_containing(word, bloblist))

In [16]:
reviews = []
for l in data:
    r = ''.join([c for c in l['review/text'].lower() if not c in punctuation])
    reviews.append(r)

In [17]:
p = ['foam','smell','banana','lactic','tart']
for i in p:
    print('Inverse document frequency of ' + i + ' is ' + str(idf(i, reviews)))

Inverse document frequency of foam is 1.1378686206869628
Inverse document frequency of smell is 0.5379016188648442
Inverse document frequency of banana is 1.6777807052660807
Inverse document frequency of lactic is 2.9208187539523753
Inverse document frequency of tart is 1.8068754016455384


In [18]:
for i in p:
    print('In first review, tf-idf score of ' + i + ' is ' + str(tf(i,reviews[0])*idf(i,reviews)))

In first review, tf-idf score of foam is 2.2757372413739256
In first review, tf-idf score of smell is 0.5379016188648442
In first review, tf-idf score of banana is 3.3555614105321614
In first review, tf-idf score of lactic is 5.841637507904751
In first review, tf-idf score of tart is 1.8068754016455384


#### 4. What is the cosine similarity between the first and the second review in terms of their tf-idf representations (considering unigrams only) (1 mark)?

In [19]:
k1 = reviews[0].split()
tfidf1 = defaultdict(int)
for i in k1:
    tfidf1[i]=tf(i,reviews[0])*idf(i,reviews)

In [20]:
k2 = reviews[1].split()
tfidf2 = defaultdict(int)
for i in k2:
    tfidf2[i]=tf(i,reviews[1])*idf(i,reviews)

In [21]:
intersection = set(tfidf1.keys()) & set(tfidf2.keys())

In [22]:
sum1 = sum([tfidf1[x]**2 for x in tfidf1.keys()])
sum2 = sum([tfidf2[x]**2 for x in tfidf2.keys()])

In [23]:
k = sum([tfidf1[x] * tfidf2[x] for x in intersection])/(math.sqrt(sum1) * \
                                                        math.sqrt(sum2))

In [24]:
print('The cosine similarity between the first and the second review is ' + \
      str(k) + '.')

The cosine similarity between the first and the second review is 0.0658819397474438.


#### 5. Which other review has the highest cosine similarity compared to the first review (provide the beerId and profileName, or the text of the review) (1 mark)?

In [25]:
wordlist =[]
for i in reviews:
    r =i.split()
    for k in r:
        wordlist.append(k)

In [26]:
wordSet = set(wordlist)

In [27]:
def computetf(review):
    tfdict = dict.fromkeys(wordSet, 0)
    worddict = dict.fromkeys(wordSet, 0)
    r = review.split()
    for word in r:
        worddict[word]+=1
    for word in r:
        tfdict[word] = worddict[word]
    return tfdict

In [28]:
count = dict.fromkeys(wordSet, 0)

In [29]:
for l in reviews:
    r = l.split()
    for word in wordSet:
        if word in r:
            count[word] += 1

In [30]:
idf = {}
for key in count:
    idf[key] = math.log10(5000/count[key])

In [31]:
# compute tfidf in the first review
k = reviews[0].split()
l = computetf(reviews[0])
k = set(k)

In [32]:
tfidf = dict.fromkeys(wordSet, 0)
for word in k:
    tfidf[word] = l[word] * idf[word]

In [33]:
sim = []
for i in reviews[1:]:
    q = set(i.split())
    l = computetf(i)
    tfidftest = dict.fromkeys(wordSet, 0)
    for word in q:
        tfidftest[word] = l[word] * idf[word]
    intersection = set(tfidf.keys()) & set(tfidftest.keys())
    sum1 = sum([tfidf[x]**2 for x in tfidf.keys()])
    sum2 = sum([tfidftest[x]**2 for x in tfidftest.keys()])
    p = math.sqrt(sum1) * math.sqrt(sum2)
    if p == 0:
        sim.append(0)
    else:
        k = sum([tfidf[x] * tfidftest[x] for x in intersection])/p
        sim.append(k)

In [34]:
max(sim)

0.2968679537499197

In [35]:
sim.index(max(sim))

2342

In [36]:
data[2343]['beer/beerId']

'72146'

#### 6. Adapt the original model that uses the 1000 most common unigrams, but replace the features with their 1000-dimensional tf-idf representations, and report the MSE obtained with the new model.

In [37]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
  r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
  for w in r.split():
    wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

In [38]:
wordId = dict(zip(words, range(len(words))))


def feature(datum):
  feat = [0]*len(words)
  r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
  tfidftest = dict.fromkeys(wordSet, 0)
  l = computetf(r)
  for w in r.split():
      tfidftest[w] = l[w] * idf[w]
      if w in words:
          feat[wordId[w]] = tfidftest[w]
  feat.append(1) #offset
  return feat

X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]

In [39]:
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [40]:
mean_squared_error(y, predictions)

0.27875956007772185

#### 7. Implement a validation pipeline for this same data, by randomly shuffling the data, using 5000 reviews for training, another 5000 for validation, and another 5000 for testing. Consider regularization parameters in the range {0.01, 0.1, 1, 10, 100}, and report MSEs on the test set for the model that performs best on the validation set. Using this pipeline, compare the following alternatives in terms of their performance:
#### Unigrams vs. bigrams
#### Removing punctuation vs. preserving it. The model that preserves punctuation should treat punctuation characters as separate words, e.g. "\Amazing!" would become ['amazing', '!']
#### tfidf vs. word counts
#### In total you should compare 2 * 2 * 2 = 8 models, and produce a table comparing their performance (2 marks)

In [41]:
wholedata = list(parseDataFromFile("beer_50000.json"))

In [42]:
from sklearn.utils import shuffle

In [43]:
data_shuffle = shuffle(wholedata, random_state = 0)

In [44]:
training = data_shuffle[:5000]
validation = data_shuffle[5000:10000]
testing = data_shuffle[10000:15000]

In [45]:
#unigram+ remove punctuation + tfidf
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in wholedata:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

def abc(dataset):
    reviews = []
    for l in dataset:
        r = ''.join([c for c in l['review/text'].lower() if not c in punctuation])
        reviews.append(r)
    
    wordlist =[]
    for i in reviews:
        r =i.split()
        for k in r:
            wordlist.append(k)
    wordSet = set(wordlist)
    
    def computetf(review):
        tfdict = dict.fromkeys(wordSet, 0)
        worddict = dict.fromkeys(wordSet, 0)
        r = review.split()
        for word in r:
            worddict[word]+=1
        for word in r:
            tfdict[word] = worddict[word]
        return tfdict
    
    count = dict.fromkeys(wordSet, 0)
    for l in reviews:
        r = l.split()
        for word in wordSet:
            if word in r:
                count[word] += 1
    
    idf = {}
    for key in count:
        idf[key] = math.log10(5000/count[key])
    
    wordId = dict(zip(words, range(len(words))))

    def feature(datum):
      feat = [0]*len(words)
      r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
      tfidftest = dict.fromkeys(wordSet, 0)
      l = computetf(r)
      for w in r.split():
          tfidftest[w] = l[w] * idf[w]
          if w in words:
              feat[wordId[w]] = tfidftest[w]
      feat.append(1) #offset
      return feat
                
    X = [feature(d) for d in dataset]
    y = [d['review/overall'] for d in dataset]
    return X,y

In [46]:
X_train,y_train = abc(training)
X_valid,y_valid = abc(validation)
X_test,y_test = abc(testing)

In [47]:
para = [0.01, 0.1, 1, 10, 100]
for i in para:
    clf = linear_model.Ridge(i, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_valid)
    print(mean_squared_error(y_valid, predictions))

0.3986589211029011
0.3985506569923159
0.39763546672588973
0.39197566669507206
0.4105118967587855


In [48]:
#select 10
clf = linear_model.Ridge(10, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
un_re_tfidf = mean_squared_error(y_test, predictions)

In [49]:
#unigram + remove punctuation + wordCount
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in wholedata:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

def qwe(dataset):
    ### Sentiment analysis

    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)

    def feature(datum):
      feat = [0]*len(words)
      r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
      for w in r.split():
        if w in words:
          feat[wordId[w]] += 1
      feat.append(1) #offset
      return feat

    X = [feature(d) for d in dataset]
    y = [d['review/overall'] for d in dataset]
    return X,y

In [50]:
X_train,y_train = qwe(training)
X_valid,y_valid = qwe(validation)
X_test,y_test = qwe(testing)

In [51]:
para = [0.01, 0.1, 1, 10, 100]
for i in para:
    clf = linear_model.Ridge(i, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_valid)
    print(mean_squared_error(y_valid, predictions))

0.3997975396787234
0.3995938284487193
0.3976329722324505
0.38384042573876026
0.40114684488089125


In [53]:
#select 10
clf = linear_model.Ridge(10, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
un_re_wc = mean_squared_error(y_test, predictions)

In [54]:
#unigram + preserve punctuation + wordCount
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in wholedata:
    r = ''.join([c for c in d['review/text'].lower()])
    for w in r.split():
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

def qwe(dataset):
    ### Sentiment analysis

    wordId = dict(zip(words, range(len(words))))
    wordSet = set(words)

    def feature(datum):
      feat = [0]*len(words)
      r = ''.join([c for c in datum['review/text'].lower()])
      for w in r.split():
        if w in words:
          feat[wordId[w]] += 1
      feat.append(1) #offset
      return feat

    X = [feature(d) for d in dataset]
    y = [d['review/overall'] for d in dataset]
    return X,y

In [55]:
X_train,y_train = qwe(training)
X_valid,y_valid = qwe(validation)
X_test,y_test = qwe(testing)

In [56]:
para = [0.01, 0.1, 1, 10, 100]
for i in para:
    clf = linear_model.Ridge(i, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_valid)
    print(mean_squared_error(y_valid, predictions))

0.44101397877820875
0.4407646891754004
0.4383794404862169
0.4215491733753499
0.42483820735502975


In [57]:
#select 10
clf = linear_model.Ridge(10, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
un_pr_wc = mean_squared_error(y_test, predictions)

In [58]:
#unigram+ preserve punctuation + tfidf
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in wholedata:
    r = ''.join([c for c in d['review/text'].lower()])
    for w in r.split():
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

def abc(dataset):
    reviews = []
    for l in dataset:
        r = ''.join([c for c in l['review/text'].lower()])
        reviews.append(r)
    
    wordlist =[]
    for i in reviews:
        r =i.split()
        for k in r:
            wordlist.append(k)
    wordSet = set(wordlist)
    
    def computetf(review):
        tfdict = dict.fromkeys(wordSet, 0)
        worddict = dict.fromkeys(wordSet, 0)
        r = review.split()
        for word in r:
            worddict[word]+=1
        for word in r:
            tfdict[word] = worddict[word]
        return tfdict
    
    count = dict.fromkeys(wordSet, 0)
    for l in reviews:
        r = l.split()
        for word in wordSet:
            if word in r:
                count[word] += 1
    
    idf = {}
    for key in count:
        idf[key] = math.log10(5000/count[key])
    
    wordId = dict(zip(words, range(len(words))))

    def feature(datum):
      feat = [0]*len(words)
      r = ''.join([c for c in datum['review/text'].lower()])
      tfidftest = dict.fromkeys(wordSet, 0)
      l = computetf(r)
      for w in r.split():
          tfidftest[w] = l[w] * idf[w]
          if w in words:
              feat[wordId[w]] = tfidftest[w]
      feat.append(1) #offset
      return feat
                
    X = [feature(d) for d in dataset]
    y = [d['review/overall'] for d in dataset]
    return X,y

In [59]:
X_train,y_train = abc(training)
X_valid,y_valid = abc(validation)
X_test,y_test = abc(testing)

In [60]:
para = [0.01, 0.1, 1, 10, 100]
for i in para:
    clf = linear_model.Ridge(i, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_valid)
    print(mean_squared_error(y_valid, predictions))

0.44000105219714086
0.43987523794358513
0.438776347186248
0.4309630424068235
0.440141292113085


In [61]:
#select 10
clf = linear_model.Ridge(10, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
un_pr_tfidf = mean_squared_error(y_test, predictions)

In [62]:
#bigrams+ preserve punctuation + tfidf
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in wholedata:
  r = ''.join([c for c in d['review/text'].lower()])
  k = r.split()
  for i in range(1,len(k)):
    wordCount[k[i-1]+' '+k[i]] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

def abc(dataset):
    reviews = []
    for l in dataset:
        r = ''.join([c for c in l['review/text'].lower()])
        reviews.append(r)
    
    wordlist =[]
    for i in reviews:
        r =i.split()
        for k in r:
            wordlist.append(k)
    wordSet = set(wordlist)
    
    def computetf(review):
        tfdict = dict.fromkeys(wordSet, 0)
        worddict = dict.fromkeys(wordSet, 0)
        r = review.split()
        for word in r:
            worddict[word]+=1
        for word in r:
            tfdict[word] = worddict[word]
        return tfdict
    
    count = dict.fromkeys(wordSet, 0)
    for l in reviews:
        r = l.split()
        for word in wordSet:
            if word in r:
                count[word] += 1
    
    idf = {}
    for key in count:
        idf[key] = math.log10(5000/count[key])
    
    wordId = dict(zip(words, range(len(words))))

    def feature(datum):
      feat = [0]*len(words)
      r = ''.join([c for c in datum['review/text'].lower()])
      tfidftest = dict.fromkeys(wordSet, 0)
      l = computetf(r)
      for w in r.split():
          tfidftest[w] = l[w] * idf[w]
          if w in words:
              feat[wordId[w]] = tfidftest[w]
      feat.append(1) #offset
      return feat
                
    X = [feature(d) for d in dataset]
    y = [d['review/overall'] for d in dataset]
    return X,y

In [63]:
X_train,y_train = abc(training)
X_valid,y_valid = abc(validation)
X_test,y_test = abc(testing)

In [64]:
para = [0.01, 0.1, 1, 10, 100]
for i in para:
    clf = linear_model.Ridge(i, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_valid)
    print(mean_squared_error(y_valid, predictions))

0.4657165521143806
0.4657192764878528
0.4657470490704898
0.4660774478986139
0.4744431468665896


In [65]:
#select 0.01
clf = linear_model.Ridge(0.01, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
bi_pr_tfidf = mean_squared_error(y_test, predictions)

In [66]:
#bigrams+ remove punctuation + tfidf
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in wholedata:
  r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
  k = r.split()
  for i in range(1,len(k)):
    wordCount[k[i-1]+' '+k[i]] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

def abc(dataset):
    reviews = []
    for l in dataset:
        r = ''.join([c for c in l['review/text'].lower() if not c in punctuation])
        reviews.append(r)
    
    wordlist =[]
    for i in reviews:
        r =i.split()
        for k in r:
            wordlist.append(k)
    wordSet = set(wordlist)
    
    def computetf(review):
        tfdict = dict.fromkeys(wordSet, 0)
        worddict = dict.fromkeys(wordSet, 0)
        r = review.split()
        for word in r:
            worddict[word]+=1
        for word in r:
            tfdict[word] = worddict[word]
        return tfdict
    
    count = dict.fromkeys(wordSet, 0)
    for l in reviews:
        r = l.split()
        for word in wordSet:
            if word in r:
                count[word] += 1
    
    idf = {}
    for key in count:
        idf[key] = math.log10(5000/count[key])
    
    wordId = dict(zip(words, range(len(words))))

    def feature(datum):
      feat = [0]*len(words)
      r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
      tfidftest = dict.fromkeys(wordSet, 0)
      l = computetf(r)
      for w in r.split():
          tfidftest[w] = l[w] * idf[w]
          if w in words:
              feat[wordId[w]] = tfidftest[w]
      feat.append(1) #offset
      return feat
                
    X = [feature(d) for d in dataset]
    y = [d['review/overall'] for d in dataset]
    return X,y

In [67]:
X_train,y_train = abc(training)
X_valid,y_valid = abc(validation)
X_test,y_test = abc(testing)

In [68]:
para = [0.01, 0.1, 1, 10, 100]
for i in para:
    clf = linear_model.Ridge(i, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_valid)
    print(mean_squared_error(y_valid, predictions))

0.4657165521143806
0.4657192764878528
0.4657470490704898
0.4660774478986139
0.4744431468665896


In [69]:
#select 0.01
clf = linear_model.Ridge(0.01, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
bi_re_tfidf = mean_squared_error(y_test, predictions)

In [70]:
#bigrams+ remove punctuation + wordCount
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in wholedata:
  r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
  k = r.split()
  for i in range(1,len(k)):
    wordCount[k[i-1]+' '+k[i]] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

def qwe(dataset):
    ### Sentiment analysis

    wordId = dict(zip(words, range(len(words))))

    def feature(datum):
      feat = [0]*len(words)
      r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
      k = r.split()
      for i in range(1,len(k)):
        if (k[i-1]+' '+k[i]) in words:
          feat[wordId[k[i-1]+' '+k[i]]] += 1
      feat.append(1) #offset
      return feat

    X = [feature(d) for d in dataset]
    y = [d['review/overall'] for d in dataset]
    return X,y

In [71]:
X_train,y_train = qwe(training)
X_valid,y_valid = qwe(validation)
X_test,y_test = qwe(testing)

In [72]:
para = [0.01, 0.1, 1, 10, 100]
for i in para:
    clf = linear_model.Ridge(i, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_valid)
    print(mean_squared_error(y_valid, predictions))

0.4529285368480005
0.4526318044571937
0.4497842009703396
0.42982363119607697
0.42097146953442255


In [73]:
#select 100
clf = linear_model.Ridge(100, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
bi_re_wc = mean_squared_error(y_test, predictions)

In [74]:
#bigrams+ preserve punctuation + wordCount
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in wholedata:
  r = ''.join([c for c in d['review/text'].lower()])
  k = r.split()
  for i in range(1,len(k)):
    wordCount[k[i-1]+' '+k[i]] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]

def qwe(dataset):
    ### Sentiment analysis

    wordId = dict(zip(words, range(len(words))))

    def feature(datum):
      feat = [0]*len(words)
      r = ''.join([c for c in datum['review/text'].lower()])
      k = r.split()
      for i in range(1,len(k)):
        if (k[i-1]+' '+k[i]) in words:
          feat[wordId[k[i-1]+' '+k[i]]] += 1
      feat.append(1) #offset
      return feat

    X = [feature(d) for d in dataset]
    y = [d['review/overall'] for d in dataset]
    return X,y

In [75]:
X_train,y_train = qwe(training)
X_valid,y_valid = qwe(validation)
X_test,y_test = qwe(testing)

In [76]:
para = [0.01, 0.1, 1, 10, 100]
for i in para:
    clf = linear_model.Ridge(i, fit_intercept=False)
    clf.fit(X_train, y_train)
    theta = clf.coef_
    predictions = clf.predict(X_valid)
    print(mean_squared_error(y_valid, predictions))

0.46929095358475725
0.4688647062685668
0.46483016573941605
0.4385983464589267
0.42563702854142965


In [77]:
#select 100
clf = linear_model.Ridge(100, fit_intercept=False)
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
bi_pr_wc = mean_squared_error(y_test, predictions)

In [78]:
import pandas as pd

In [79]:
result = numpy.array([['','preserve_punc','remove_punc'],
                ['uni_wordcount',un_pr_wc,un_re_wc],
                ['uni_tfidf',un_pr_tfidf,un_re_tfidf],
                ['bi_wordcount',bi_pr_wc,bi_re_wc],
                ['bi_tfidf',bi_pr_tfidf,bi_re_tfidf]])

In [81]:
print(pd.DataFrame(data=result[1:,1:],
                  index=result[1:,0],
                  columns=result[0,1:]))

                     preserve_punc          remove_punc
uni_wordcount    0.413599935746171  0.38046060252020125
uni_tfidf      0.42259774591516014   0.3885811901840076
bi_wordcount   0.43219295621818926   0.4243905355380374
bi_tfidf        0.4782321775831297   0.4782321775831297
