In [27]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
import numpy as np
import random
import gzip
import math

In [28]:
import warnings
warnings.filterwarnings("ignore")

In [29]:
def assertFloat(x):
    """Check that an answer can be converted to a float.

    Any exception raised by ``float(x)`` (e.g. for a non-numeric string)
    propagates to the caller.
    """
    converted = float(x)
    assert type(converted) == float

def assertFloatList(items, N):
    """Check that `items` has exactly N entries, each convertible to float.

    Raises AssertionError on a length mismatch; any exception raised by
    ``float(x)`` for an unconvertible entry propagates to the caller.
    """
    assert len(items) == N
    assert all(type(float(entry)) == float for entry in items)

In [30]:
# Load the reviews: the gzip file holds one JSON object per line.
# The context manager guarantees the file handle is closed even if a line
# fails to parse (the original left the handle open).
dataset = []
with gzip.open("young_adult_10000.json.gz") as f:
    for l in f:
        dataset.append(json.loads(l))

In [31]:
# Number of review records that were loaded.
len(dataset)

10000

In [32]:
# Collects the answer for each question under keys 'Q1', 'Q2', ...;
# the whole dict is written to a text file at the end of the notebook.
answers = {}

In [33]:
# Display the first record to see which fields are available.
dataset[0]

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '2767052',
 'review_id': '248c011811e945eca861b5c31a549291',
 'rating': 5,
 'review_text': "I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama of how that unfolds is alw

In [34]:
# Check the Python type that each parsed record has.
type(dataset[0])

dict

### Question 1

In [35]:
from typing import List


def feature(datum):
    """Fit a linear regression of rating on the number of '!' in the review.

    Parameters
    ----------
    datum : iterable of dict
        Records with a 'review_text' string and a 'rating' value.

    Returns
    -------
    The fitted ``linear_model.LinearRegression`` instance.
    """
    exclamation_counts = np.array([[r['review_text'].count('!')] for r in datum])
    ratings = np.array([r['rating'] for r in datum])

    # fit() returns the estimator itself, so it can be returned directly.
    return linear_model.LinearRegression().fit(exclamation_counts, ratings)

# Fit the single-feature model and pull out intercept / slope.
model = feature(dataset)
theta0, theta1 = model.intercept_, model.coef_
print(theta0, theta1)

# Rebuild the same feature matrix and targets used inside feature() so the
# model can be scored on the data it was trained on.
X = np.array([[r['review_text'].count('!')] for r in dataset])
y = np.array([r['rating'] for r in dataset])

# Imported here (not in the top cell); later cells rely on this name.
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)

# theta1 is a length-1 coefficient array; store its single entry.
answers['Q1'] = [theta0, theta1[0], mse]


3.6885330408320325 [0.07109019]


In [36]:
assertFloatList(answers['Q1'], 3) # Format check: Q1 must hold exactly three float-convertible values

### Question 2

In [37]:
def feature(datum):
    """Fit a linear regression of rating on [review length, '!' count].

    Parameters
    ----------
    datum : iterable of dict
        Records with a 'review_text' string and a 'rating' value.

    Returns
    -------
    The fitted ``linear_model.LinearRegression`` instance; its coefficients
    are ordered as (length, '!' count), matching the feature columns.
    """
    rows = [[len(r['review_text']), r['review_text'].count('!')] for r in datum]
    targets = [r['rating'] for r in datum]

    regressor = linear_model.LinearRegression()
    regressor.fit(np.array(rows), np.array(targets))
    return regressor
# Fit the two-feature model and pull out intercept / coefficients.
model = feature(dataset)
theta0, theta1 = model.intercept_, model.coef_
print(theta0, theta1)

# Same feature layout as inside feature(): column 0 is the review length,
# column 1 is the number of '!' characters.
X = np.array([[len(r['review_text']), r['review_text'].count('!')] for r in dataset])
y = np.array([r['rating'] for r in dataset])

from sklearn.metrics import mean_squared_error

# Score the model on the data it was trained on.
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)

answers['Q2'] = [theta0, theta1[0], theta1[1], mse]

3.7175128077972013 [-4.12150653e-05  7.52759173e-02]


In [38]:
# Format check: Q2 must hold exactly four float-convertible values.
assertFloatList(answers['Q2'], 4)

### Question 3

In [39]:
from sklearn.preprocessing import PolynomialFeatures
def feature(datum, deg):
    """Training MSE of polynomial regressions on the '!' count.

    For every degree from 1 up to and including `deg`, expands the '!' count
    with ``PolynomialFeatures``, fits a linear regression on all of `datum`,
    and scores it on that same data.

    Parameters
    ----------
    datum : iterable of dict
        Records with a 'review_text' string and a 'rating' value.
    deg : int
        Highest polynomial degree to evaluate.

    Returns
    -------
    list
        One MSE per degree, in increasing degree order.
    """
    counts = np.array([[r['review_text'].count('!')] for r in datum])
    ratings = np.array([r['rating'] for r in datum])

    errors = []
    degree = 1
    while degree <= deg:
        # Expand the raw count into polynomial terms of the current degree.
        expanded = PolynomialFeatures(degree = degree).fit_transform(counts)

        regressor = linear_model.LinearRegression()
        regressor.fit(expanded, ratings)

        # Error is measured on the training data itself.
        errors.append(mean_squared_error(ratings, regressor.predict(expanded)))
        degree += 1
    return errors

# Training MSE for polynomial degrees 1 through 5 on the full dataset.
mses = feature(dataset, 5)

# Store the five MSE values as the Q3 answer.
answers['Q3'] = mses



In [40]:
assertFloatList(answers['Q3'], 5)# List of length 5, each entry float-convertible

### Question 4

In [41]:
# Single feature: number of '!' characters in each review; target: rating.
X = np.array([[r['review_text'].count('!')] for r in dataset])
y = np.array([r['rating'] for r in dataset])

# Split by position: the first 5000 rows train, the remainder test.
X_train, X_test = X[:5000], X[5000:]
y_train, y_test = y[:5000], y[5000:]

def feature(X,y,X_test,y_test, deg):
    """Test-set MSE of polynomial regressions for degrees 1..deg.

    Parameters
    ----------
    X, y : array-like
        Training features and targets.
    X_test, y_test : array-like
        Held-out features and targets used for scoring.
    deg : int
        Highest polynomial degree to evaluate.

    Returns
    -------
    list
        One test MSE per degree, in increasing degree order.
    """
    mses = []
    for i in range(1, deg + 1):
        poly = PolynomialFeatures(degree = i)
        # Fit on the arguments actually passed in. The original read the
        # notebook globals X_train / y_train here and silently ignored X and y.
        X_poly = poly.fit_transform(X)

        #train model
        model = linear_model.LinearRegression()
        model.fit(X_poly, y)

        #predict
        # Re-use the transformer fitted on the training data instead of
        # re-fitting it on the test set.
        X_poly_test = poly.transform(X_test)
        y_pred = model.predict(X_poly_test)

        # mse
        mse = mean_squared_error(y_test, y_pred)
        mses.append(mse)
    return mses

# Test-set MSE for polynomial degrees 1 through 5.
mses = feature(X_train, y_train, X_test, y_test, 5)

In [42]:
# Store the five test-set MSE values as the Q4 answer.
answers['Q4'] = mses

In [43]:
# Format check: Q4 must hold exactly five float-convertible values.
assertFloatList(answers['Q4'], 5)

### Question 5

In [45]:
# Plain linear model on the train split (X_train / y_train come from the
# previous question's cell), evaluated on the test split.
model = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Compute the MAE: mean of the absolute residuals on the test split.
mae = np.abs(y_test - y_pred).mean()

answers['Q5'] = mae
assertFloat(answers['Q5'])

### Question 6

In [46]:
# Load the beer reviews, keeping only lines that mention 'user/gender'.
# The context manager guarantees the file is closed (the original never
# closed it).
# NOTE(review): eval() executes whatever expression each line contains, so
# this is only safe for a trusted data file. If the lines are plain literals,
# ast.literal_eval would be a safer parser -- confirm before switching.
dataset = []
with open("beer_50000.json") as f:
    for l in f:
        if 'user/gender' in l:
            dataset.append(eval(l))

In [47]:
# Number of beer reviews kept, i.e. lines that contained 'user/gender'.
len(dataset)

20403

In [48]:
# Print every field of the first kept review, one "key value" pair per line.
for k,v in dataset[0].items():
    print(k,v)


review/appearance 4.0
beer/style American Double / Imperial IPA
review/palate 4.0
review/taste 4.5
beer/name Cauldron DIPA
review/timeUnix 1293735206
user/gender Male
user/birthdayRaw Jun 16, 1901
beer/ABV 7.7
beer/beerId 64883
user/birthdayUnix -2163081600
beer/brewerId 1075
review/timeStruct {'isdst': 0, 'mday': 30, 'hour': 18, 'min': 53, 'sec': 26, 'mon': 12, 'year': 2010, 'yday': 364, 'wday': 3}
user/ageInSeconds 3581417047
review/overall 4.0
review/text According to the website, the style for the Caldera Cauldron changes every year. The current release is a DIPA, which frankly is the only cauldron I'm familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back). In any event... at the Horse Brass yesterday.		The beer pours an orange copper color with good head retention and lacing. The nose is all hoppy IPA goodness, showcasing a huge aroma of dry citrus, pine and sandlewood. The flavor profile replicates the nose pretty closely in thi

In [49]:
# Feature: number of '!' characters in the review text.
X = np.array([[r['review/text'].count('!')] for r in dataset])
# Label: 1 when the reviewer's gender field is exactly 'Female', otherwise 0.
y = np.array([int(r['user/gender'] == 'Female') for r in dataset])



def feature(X,y):
    """Fit a logistic regression with default settings on (X, y).

    Returns the fitted ``linear_model.LogisticRegression`` instance.
    """
    classifier = linear_model.LogisticRegression()
    return classifier.fit(X, y)

# Fit the classifier and predict on the same data it was trained on.
model = feature(X,y)
y_pred = model.predict(X)

from sklearn.metrics import confusion_matrix
# Unpack the 2x2 matrix row by row: first row is (TN, FP), second is (FN, TP).
(tn, fp), (fn, tp) = confusion_matrix(y, y_pred)

fpr = fp / (fp + tn)  # False Positive Rate
fnr = fn / (fn + tp)  # False Negative Rate
ber = (fpr + fnr) / 2  # Balanced Error Rate: mean of the two error rates

answers['Q6'] = [tp,tn,fp,fn,ber]
assertFloatList(answers['Q6'], 5)


### Question 7

In [50]:
# Retrain the regressor using the class_weight='balanced' option,
# and report the same error metrics as above.

def feature(X,y):
    """Fit a logistic regression with ``class_weight='balanced'`` on (X, y).

    Returns the fitted ``linear_model.LogisticRegression`` instance.
    """
    classifier = linear_model.LogisticRegression(class_weight='balanced')
    return classifier.fit(X, y)

# Fit the class-balanced classifier and predict on its own training data.
model = feature(X,y)
y_pred = model.predict(X)

from sklearn.metrics import confusion_matrix
# Flattened 2x2 confusion matrix, unpacked in the order TN, FP, FN, TP.
counts = confusion_matrix(y, y_pred).ravel()
tn, fp, fn, tp = counts

fpr = fp / (fp + tn)  # False Positive Rate
fnr = fn / (fn + tp)  # False Negative Rate
ber = (fpr + fnr) / 2  # Balanced Error Rate: mean of the two error rates

answers['Q7'] = [tp,tn,fp,fn,ber]
assertFloatList(answers['Q7'], 5)

### Question 8

In [51]:
#Report the precision@K of your balanced classifier for K ∈ [1, 10, 100, 1000, 10000] (your answer should
#be a list of five precision values).
# Score every example with the current `model` (the class-balanced classifier
# from the previous cell). The redundant `y_pred = model.predict(X)` that used
# to sit here was never read and duplicated the previous cell's value.
y_probs = model.predict_proba(X)[:, 1]  # p(y=1|x)

# Rank once, outside the loop: argsort sorts ascending, so the last k indices
# are the k highest-scoring examples.
# NOTE(review): examples with identical scores are ordered however argsort
# happens to place them, which can affect precision at small K.
ranking = np.argsort(y_probs)

# precision @ K
precision_at_k = []
ks = [1, 10, 100, 1000, 10000]

for k in ks:
    # top k predictions
    indices = ranking[-k:]
    true_positives = np.sum(y[indices] == 1)
    precision = true_positives / k  # precision@k
    precision_at_k.append(precision)

# precision@k
print(precision_at_k)

answers['Q8'] = precision_at_k
assertFloatList(answers['Q8'], 5)

[0.0, 0.0, 0.02, 0.025, 0.017]


In [52]:
# Write your answers to a file.
# The context manager closes (and flushes) the file even if the write raises.
with open("answers_hw1.txt", 'w') as f:
    f.write(str(answers) + '\n')

In [53]:
# Print each stored answer as "question value", one per line.
for k,v in answers.items():
    print(k,v)

Q1 [3.6885330408320325, 0.07109019019954116, 1.5231747404538287]
Q2 [3.7175128077972013, -4.1215065294879717e-05, 0.07527591733232616, 1.5214029246165832]
Q3 [1.5231747404538287, 1.5046686106250915, 1.4966845515179232, 1.490447730223069, 1.4896106953961648]
Q4 [1.5248743859866298, 1.4977199259322431, 1.4856632190311343, 1.4767337440077455, 1.4809577272113095]
Q5 0.9612280163687501
Q6 [0, 20095, 0, 308, 0.5]
Q7 [88, 16332, 3763, 220, 0.4507731134255145]
Q8 [0.0, 0.0, 0.02, 0.025, 0.017]
