In [1]:
import gzip
import math
import random
import string
from collections import defaultdict

import dateutil
import dateutil.parser  # submodule must be imported explicitly to use dateutil.parser.parse
import numpy
import sklearn
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE

In [2]:
!pip install gensim



You should consider upgrading via the 'c:\users\bill chen\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.




In [3]:
import warnings

# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")

In [4]:
def assertFloat(x):
    """Check that `x` can be converted to a built-in float.

    Any exception raised by ``float(x)`` (e.g. ValueError for a non-numeric
    string) propagates to the caller.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that `items` has exactly `N` entries, each convertible to float.

    Raises AssertionError on a length mismatch; conversion errors from
    ``float(x)`` propagate unchanged.
    """
    assert len(items) == N
    assert all(type(float(entry)) is float for entry in items)

In [5]:
# Load at most the first 20000 records, one record per line of the gzip file.
dataset = []

# The context manager guarantees the handle is closed even if a line fails to parse.
with gzip.open("young_adult_20000.json.gz") as f:
    for l in f:
        # NOTE(review): eval() executes whatever the file contains; only use it on
        # trusted data. If the lines are strict JSON, json.loads would be the
        # safer parser — confirm the file format before switching.
        d = eval(l)
        dataset.append(d)
        if len(dataset) >= 20000:
            break

In [6]:
# Accumulates the result for each question, keyed by question id ('Q1a', 'Q2', ...).
answers = {}

In [7]:
### Question 1

In [8]:
# Sanity check: number of records loaded.
len(dataset)

20000

In [9]:
# First 10000 records are used for training, the remainder for testing.
train_data, test_data = dataset[:10000], dataset[10000:]

In [10]:
# Characters stripped from review text before tokenizing (used by the cells below).
punctuation = set(string.punctuation)

In [11]:
def mostCommonUnigrams(train_data):
    """Return the 1000 most frequent words as (count, word) pairs.

    Each datum's 'review_text' is lower-cased, stripped of punctuation and
    split on whitespace. Pairs are ordered by descending count, ties broken
    by descending word.
    """
    strip = set(string.punctuation)
    tally = defaultdict(int)
    for datum in train_data:
        cleaned = ''.join(ch for ch in datum['review_text'].lower() if ch not in strip)
        for token in cleaned.split():
            tally[token] += 1
    # (count, word) tuples are unique, so one descending sort gives the full order.
    ranked = sorted(((n, token) for token, n in tally.items()), reverse=True)
    return ranked[:1000]

def mostCommonBigrams(train_data):
    """Return the 1000 most frequent adjacent word pairs as (count, bigram) pairs.

    Bigrams are built within a single review from its lower-cased,
    punctuation-free tokens and joined with a single space. Pairs are ordered
    by descending count, ties broken by descending bigram string.
    """
    strip = set(string.punctuation)
    tally = defaultdict(int)
    for datum in train_data:
        cleaned = ''.join(ch for ch in datum['review_text'].lower() if ch not in strip)
        tokens = cleaned.split()
        for left, right in zip(tokens, tokens[1:]):
            tally[left + ' ' + right] += 1
    ranked = sorted(((n, gram) for gram, n in tally.items()), reverse=True)
    return ranked[:1000]

def mostCommonBoth(train_data):
    """Return the 1000 most frequent entries among the top unigrams and top bigrams.

    The two (count, term) lists are concatenated (unigrams first), stably
    sorted by count only, then reversed, so entries with equal counts appear
    in the reverse of their concatenation order.
    """
    merged = mostCommonUnigrams(train_data) + mostCommonBigrams(train_data)
    merged.sort(key=lambda pair: pair[0])
    return merged[::-1][:1000]

In [12]:
def feature(datum, words):
    """Build a bag-of-words count vector for one review.

    Parameters
    ----------
    datum : mapping with a 'review_text' string.
    words : sequence of (count, term) pairs; terms may be unigrams or
        space-joined bigrams. Position in this sequence is the feature index.

    Returns
    -------
    list of int
        One count per term in `words`, followed by a constant 1 (offset term).
    """
    vocab = [w[1] for w in words]
    wordId = dict(zip(vocab, range(len(vocab))))
    feat = [0]*len(vocab)
    # Built locally (as the mostCommon* helpers do) so the function does not
    # depend on a notebook-global `punctuation` having been defined first.
    punct = set(string.punctuation)
    r = ''.join([c for c in datum['review_text'].lower() if not c in punct])
    ws = r.split()
    ws2 = [' '.join(x) for x in zip(ws[:-1], ws[1:])]
    for w in ws + ws2:
        # Dict lookup instead of scanning the vocabulary list for every token.
        idx = wordId.get(w)
        if idx is not None:
            feat[idx] += 1
    feat.append(1) #offset
    return feat

In [13]:
# For each vocabulary builder: featurize all reviews, fit a ridge regressor on
# the first 10000 rows and record the MSE on the rest, together with the five
# first and five last terms of the vocabulary.
question_vocab_builders = [
    ('Q1a', mostCommonUnigrams),
    ('Q1b', mostCommonBigrams),
    ('Q1c', mostCommonBoth),
]
for q, wList in question_vocab_builders:
    wordSort = wList(train_data)
    X = [feature(d, wordSort) for d in dataset]
    y = [d['rating'] for d in dataset]

    X_train, y_train = X[:10000], y[:10000]
    X_test, y_test = X[10000:], y[10000:]

    # Ridge with alpha = 1.0; the offset is supplied by the constant feature.
    clf = linear_model.Ridge(1.0, fit_intercept=False)
    clf.fit(X_train, y_train)

    y_preds = clf.predict(X_test)
    squared_errors = [(a-b)**2 for a,b in zip(y_preds, y_test)]
    mse = numpy.mean(squared_errors)

    first_terms = [x[1] for x in wordSort[:5]]
    last_terms = [x[1] for x in wordSort[-5:]]
    answers[q] = [float(mse), first_terms, last_terms]

In [14]:
# Display the Question 1 results collected so far.
answers

{'Q1a': [1.2271716427139716,
  ['the', 'and', 'a', 'of', 'to'],
  ['crime', 'towards', 'lack', 'information', 'hulk']],
 'Q1b': [1.2850254712593874,
  ['of the', 'in the', 'is a', 'the story', 'and the'],
  ['you do', 'well the', 'trust me', 'to date', 'throughout the']],
 'Q1c': [1.220884794454163,
  ['the', 'and', 'a', 'of', 'to'],
  ['otherwise', 'truth', 'went', 'the manga', 'way to']]}

In [15]:
# Each Q1 answer must be [mse, five strings, five strings].
for q in ('Q1a', 'Q1b', 'Q1c'):
    q_answer = answers[q]
    assert len(q_answer) == 3
    assertFloat(q_answer[0])
    for term_list in (q_answer[1], q_answer[2]):
        assert [type(x) for x in term_list] == [str]*5

In [None]:
### Question 2

In [21]:
# Vocabulary for Question 2: the most common training unigrams, without counts.
counts = mostCommonUnigrams(train_data)
words = [word for _count, word in counts]

In [24]:
## Document frequency
df = defaultdict(int)
for d in train_data:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in set(r.split()):
        df[w] += 1

In [27]:
# The query document: every other training review is compared against this one.
first_review = train_data[0]

In [32]:
## Term frequency
tf = defaultdict(int)
r = ''.join([c for c in first_review['review_text'].lower() if not c in punctuation])
for w in r.split():
    tf[w] = 1
    
tfidf = dict(zip(words,[tf[w] * math.log2(len(train_data) / df[w]) for w in words]))
firstTfidfQuery = [tf[w] * math.log2(len(dataset) / df[w]) for w in words]

In [31]:
def Cosine(x1,x2):
    """Cosine similarity of two equal-length numeric sequences.

    Returns 0 when either vector has zero norm (including empty input).
    Elements are paired with zip, so extra trailing entries in the longer
    sequence are ignored.
    """
    dot = 0
    sq_norm1 = 0
    sq_norm2 = 0
    for u, v in zip(x1, x2):
        dot += u*v
        sq_norm1 += u**2
        sq_norm2 += v**2
    denom_sq = sq_norm1*sq_norm2
    if not denom_sq:
        return 0
    return dot / math.sqrt(denom_sq)

In [34]:
# Cosine similarity between the first review's tf-idf vector and that of every
# other training review.
# The idf weights depend only on the vocabulary, so compute them once instead of
# once per review.
# NOTE(review): N here is len(dataset) although df was counted over train_data —
# kept as-is to stay consistent with firstTfidfQuery; confirm the intended N.
idf = [math.log2(len(dataset) / df[w]) for w in words]

sims = []
for d in train_data[1:]:
    # Binary term presence, matching how the query vector was built.
    tf = defaultdict(int)
    rev = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in rev.split():
        tf[w] = 1
    tfidfQuery = [tf[w] * weight for w, weight in zip(words, idf)]
    sim = Cosine(firstTfidfQuery, tfidfQuery)
    sims.append((sim, d['review_text']))

In [38]:
# Highest similarity first. Show only the top few matches rather than dumping
# the whole list of (similarity, review text) pairs into the cell output.
sims.sort(reverse=True)
sims[:5]

[(0.3332537766522742,
  "This book is part of a series of graphic accounts of significant modern philosophers and ideas. The original idea behind the series was that you could educate through a combination of image and crisp short summaries of the life and history of complicated people and concepts. \n This is both absurd and helpful. None of these books (largely produced in the post-modern fervour of the 1990s) can do more than skim the surface of a subject. Ideas can be so foreshortened that they are meaningless to the uneducated subject. The graphics are often crude but they serve their purpose, only rarely adding to the obscurities instead of enlightening us. \n On the other hand, they offer two hours (approximately) of comic book summary of the main tenets of a thinker or movement with valuable pointers to further reading or study. They are very useful and entertaining in that context. \n To a great extent, they have been superseded by the internet. Wikipedia and a basic Google se

In [39]:
# Best match: sims is sorted in descending order of similarity at this point.
sim, review = sims[0]

In [40]:
# Q2 answer: [highest cosine similarity, text of the corresponding review].
answers['Q2'] = [sim, review]

In [41]:
# Q2 answer must be [float-like similarity, review text].
q2_answer = answers['Q2']
assert len(q2_answer) == 2
assertFloat(q2_answer[0])
assert type(q2_answer[1]) == str

In [None]:
### Question 3

In [43]:
# user_id -> list of that user's reviews (filled in by the next cell).
reviewsPerUser = defaultdict(list)

In [44]:
# Start from an empty mapping so re-running this cell cannot append every
# review a second time.
reviewsPerUser = defaultdict(list)

# Store (timestamp, book_id) pairs so each user's history can be sorted
# chronologically. `dateutil.parser` is a submodule and must be imported
# explicitly (`import dateutil.parser` in the imports cell).
for d in dataset:
    added_at = dateutil.parser.parse(d['date_added'])
    reviewsPerUser[d['user_id']].append((added_at, d['book_id']))

In [58]:
# One "sentence" per user: that user's book ids ordered by the (timestamp, book_id) pairs.
reviewLists = []
for u in reviewsPerUser:
    rl = sorted(reviewsPerUser[u])
    reviewLists.append([entry[1] for entry in rl])

In [60]:
# Item2vec: skip-gram Word2Vec trained on the per-user book-id sequences.
model = Word2Vec(
    sentences=reviewLists,
    vector_size=10,  # Model dimensionality
    window=3,        # Window size
    min_count=1,     # Words/items with fewer instances are discarded
    sg=1,            # Skip-gram model
)

In [62]:
# Books whose embeddings are most similar to the first review's book.
model.wv.similar_by_word(dataset[0]['book_id'])

[('8497638', 0.9422543048858643),
 ('25032624', 0.9315677881240845),
 ('21519210', 0.8963538408279419),
 ('22752448', 0.8889612555503845),
 ('5497136', 0.8622767925262451),
 ('13352743', 0.8595312237739563),
 ('19064661', 0.8493546843528748),
 ('29495369', 0.8488900065422058),
 ('300946', 0.8472933769226074),
 ('8500719', 0.846130907535553)]

In [63]:
# Keep the five best (book_id, similarity) matches for the first review's book.
query_book_id = dataset[0]['book_id']
similarities = model.wv.similar_by_word(query_book_id)[:5]

In [64]:
answers['Q3'] = similarities # top-5 (book_id, similarity) pairs from `model`

In [65]:
# Q3 answer must be five (str book_id, float-like similarity) pairs.
q3_answer = answers['Q3']
assert len(q3_answer) == 5
assert [type(pair[0]) for pair in q3_answer] == [str]*5
assertFloatList([pair[1] for pair in q3_answer], 5)

In [None]:
### Question 4

In [71]:
# Global mean rating, used as the fallback prediction.
all_ratings = [d['rating'] for d in dataset]
ratingMean = sum(all_ratings) / len(all_ratings)

In [73]:
# itemAverages: book_id -> mean rating of that book.
# reviewsPerUser is rebuilt here as user_id -> list of full review records
# (replacing the earlier (timestamp, book_id) pairs).
itemAverages = defaultdict(list)
reviewsPerUser = defaultdict(list)

for d in dataset:
    u, i = d['user_id'], d['book_id']
    reviewsPerUser[u].append(d)
    itemAverages[i].append(d['rating'])

# Collapse each rating list to its mean; iterate over a snapshot of the items
# because the values are replaced in place.
for i, item_ratings in list(itemAverages.items()):
    itemAverages[i] = sum(item_ratings) / len(item_ratings)

In [89]:
def predictRating(user,item, model):
    """Predict `user`'s rating of `item` from the user's other reviews.

    The prediction is the item's average rating plus a similarity-weighted
    average of the user's rating deviations (rating minus item average) on
    their other items, using cosine similarity between `model.wv` embeddings.

    Returns the global `ratingMean` when `item` has no embedding or when the
    similarities to the user's other embedded items do not sum to a positive
    value.

    Reads the notebook globals `ratingMean`, `reviewsPerUser`, `itemAverages`
    and `Cosine`.
    """
    item_key = str(item)
    if item_key not in model.wv:
        return ratingMean
    item_vec = model.wv[item_key]
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['book_id']
        if i2 == item: continue
        i2_key = str(i2)
        # Skip items without an embedding *before* recording the rating, so the
        # two lists stay aligned: every rating is paired with its own similarity.
        if i2_key not in model.wv:
            continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Cosine(item_vec, model.wv[i2_key]))
    total_similarity = sum(similarities)
    if total_similarity > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / total_similarity
    return ratingMean

In [68]:
# Evaluation subset: the first 1000 reviews.
first_1000_data = dataset[:1000]

In [69]:
# Ground-truth ratings for the evaluation subset.
first_1000_rating = [d['rating'] for d in first_1000_data]

In [90]:
# Predicted rating for each (user, book) pair in the evaluation subset, using `model`.
first_1000_pred = [
    predictRating(d['user_id'], d['book_id'], model)
    for d in first_1000_data
]

In [91]:
# Mean squared error of the `model` predictions on the evaluation subset.
squared_errors_q4 = [(a-b)**2 for a, b in zip(first_1000_rating, first_1000_pred)]
mse4 = numpy.mean(squared_errors_q4)

In [92]:
# Q4 answer: MSE of the item2vec-based rating predictor.
answers['Q4'] = mse4

In [93]:
# The Q4 answer must be convertible to float.
assertFloat(answers['Q4'])

In [None]:
### Question 5

In [88]:
# Variant of the item2vec model that drops books seen fewer than 5 times.
model2 = Word2Vec(
    sentences=reviewLists,
    vector_size=10,  # Model dimensionality
    window=3,        # Window size
    min_count=5,     # Words/items with fewer instances are discarded
    sg=1,            # Skip-gram model
)

In [95]:
# Predicted rating for each (user, book) pair in the evaluation subset, using `model2`.
first_1000_pred_2 = [
    predictRating(d['user_id'], d['book_id'], model2)
    for d in first_1000_data
]

In [96]:
# Mean squared error of the `model2` predictions on the evaluation subset.
squared_errors_q5 = [(a-b)**2 for a, b in zip(first_1000_rating, first_1000_pred_2)]
mse5 = numpy.mean(squared_errors_q5)

In [99]:
# Q5 answer: [description of the change made to the model, resulting MSE].
answers['Q5'] = ["I changed the min_count variable from 1 to 5",
                 mse5]

In [100]:
# Q5 answer must be [description string, float-like MSE].
q5_answer = answers['Q5']
assert len(q5_answer) == 2
assert type(q5_answer[0]) == str
assertFloat(q5_answer[1])

In [101]:
# Persist all answers as a single line of text. The context manager closes the
# file even if the write fails.
with open("answers_hw4.txt", 'w') as f:
    f.write(str(answers) + '\n')