In [56]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model

In [57]:
import warnings
warnings.filterwarnings("ignore")

In [58]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [59]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [60]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [61]:
answers = {}

In [62]:
# Some data structures that will be useful

In [63]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [64]:
len(allRatings)

200000

In [65]:
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))
    usersPerItem[b].add(u)
    itemsPerUser[u].add(b)


In [116]:
ratingsTrain[0]

('u93397390', 'b52690052', 3)

In [66]:
##################################################
# Read prediction                                #
##################################################

In [67]:
# Copied from baseline code
bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalRead/2: break

In [68]:
### Question 1

In [69]:
# 构建验证集
validPosData = ratingsValid # 正样本
validNegData = [] # 负样本

# 为每个正样本创建一个对应的负样本
for u,b,r in validPosData:
    # 获取该用户已读过的所有书
    readBooks = set(b for b,r in ratingsPerUser[u])
    
    # 随机选择一本用户没读过的书
    while True:
        # 从所有书中随机选择一本
        negBook = random.choice(list(bookCount.keys()))
        # 确保这本书用户没读过
        if negBook not in readBooks:
            validNegData.append((u, negBook, 0))
            break

# 合并正负样本
validWithNeg = validPosData + validNegData

# 评估基线模型性能
correct = 0
total = len(validWithNeg)

for u,b,r in validWithNeg:
    # Make prediction using return1 baseline model
    prediction = 1 if b in return1 else 0
    # Check if prediction is correct
    if prediction == (r > 0):
        correct += 1

acc1 = correct / total

In [112]:
acc1

0.7183

In [70]:
len(validWithNeg)

20000

In [71]:
answers['Q1'] = acc1

In [72]:
assertFloat(answers['Q1'])

In [73]:
### Question 2

In [74]:
best_threshold = 0
best_acc = 0
best_popular_set = None
# test 10% to 90%
for threshold_ratio in numpy.arange(0.1, 0.91, 0.05):
    popular_set = set()
    count = 0
    threshold = totalRead * threshold_ratio
    
    # 构建popular_set
    for ic, i in mostPopular:
        count += ic
        popular_set.add(i)
        if count > threshold:
            break
    
    # 在验证集上评估
    correct = 0
    for u,b,r in validWithNeg:
        prediction = 1 if b in popular_set else 0
        if prediction == (r > 0):
            correct += 1
    
    acc = correct / len(validWithNeg)
    
    # 更新最佳结果
    if acc > best_acc:
        best_acc = acc
        best_threshold = threshold_ratio
        best_popular_set = popular_set

# 保存最佳阈值和准确率
threshold = best_threshold
acc2 = best_acc

In [114]:
print(threshold, acc2)

0.10010000000000001 0.7456


In [75]:
prediction1 = []
for u,b,r in validWithNeg:
    prediction = 1 if b in best_popular_set else 0
    prediction1.append(prediction)

In [76]:
answers['Q2'] = [threshold, acc2]

In [77]:
assertFloat(answers['Q2'][0])
assertFloat(answers['Q2'][1])

In [78]:
### Question 3/4

In [107]:
def cosine_similarity(s1, s2):
    intersection = s1.intersection(s2)
    if not intersection:
        return 0.0
    return len(intersection) / (math.sqrt(len(s1)) * math.sqrt(len(s2)))

def pearson_similarity(s1, s2):
    intersection = s1.intersection(s2)
    n = len(intersection)
    if n == 0:
        return 0
    sum1 = sum(s1)
    sum2 = sum(s2)
    sum1_sq = sum([v**2 for v in s1])
    sum2_sq = sum([v**2 for v in s2])
    product_sum = sum([v1 * v2 for v1, v2 in zip(s1, s2)])
    numerator = product_sum - (sum1 * sum2 / n)
    denominator = math.sqrt((sum1_sq - sum1**2 / n) * (sum2_sq - sum2**2 / n))
    if denominator == 0:
        return 0
    return numerator / denominator

In [79]:
def Jaccard(s1, s2):
    return len(s1.intersection(s2)) / len(s1.union(s2))


In [115]:
best_threshold

0.0001

In [108]:
from sklearn.ensemble import GradientBoostingClassifier

# 构建特征矩阵 X 和标签 y
X = []
y = []
for u, b, r in ratingsTrain:
    # 构建特征向量
    feature_vector = [1 if b in return1 else 0]
    user_books = itemsPerUser[u]
    max_sim = max([Jaccard(usersPerItem[b], usersPerItem[read_book]) for read_book in user_books], default=0)
    feature_vector.append(1 if max_sim > best_threshold else 0)
    X.append(feature_vector)
    y.append(1 if r > 0 else 0)

# 训练梯度提升模型
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
gbm.fit(X, y)

# 在验证集上评估
X_valid = []
y_valid = []
for u, b, r in ratingsValid:
    feature_vector = [1 if b in return1 else 0]
    user_books = itemsPerUser[u]
    max_sim = max([Jaccard(usersPerItem[b], usersPerItem[read_book]) for read_book in user_books], default=0)
    feature_vector.append(1 if max_sim > best_threshold else 0)
    X_valid.append(feature_vector)
    y_valid.append(1 if r > 0 else 0)

acc = gbm.score(X_valid, y_valid)
print(f"梯度提升树的准确率: {acc}")

梯度提升树的准确率: 0.948


In [80]:

# 测试不同的阈值
thresholds = numpy.arange(0.0001, 0.2, 0.100)  # 更细致的阈值范围
best_predictions = []
best_acc = 0
from tqdm import tqdm
for threshold in tqdm(thresholds):
    correct = 0
    current_predictions = []  # 当前阈值下的预测结果
    
    for u, b, r in validWithNeg:
        # 获取用户已读的所有书
        user_books = itemsPerUser[u]
        
        # 计算最大 Jaccard 相似度
        max_sim = 0
        for read_book in user_books:
            sim = Jaccard(usersPerItem[b], usersPerItem[read_book])
            max_sim = max(max_sim, sim)
        
        # 基于阈值进行预测
        prediction = 1 if max_sim > threshold else 0
        current_predictions.append(prediction)
        if prediction == (r > 0):
            correct += 1

    acc = correct / len(validWithNeg)
    
    # 更新最佳结果
    if acc > best_acc:
        best_acc = acc
        best_threshold = threshold
        best_predictions = current_predictions.copy()  # 保存最佳预测结果

# 保存最佳准确率和预测结果
acc3 = best_acc
prediction3 = best_predictions  # 这个将用于Q4

print(f"Best threshold: {best_threshold}")
print(f"Best accuracy: {acc3}")
print(f"Number of predictions: {len(prediction3)}")

100%|██████████| 2/2 [00:03<00:00,  1.77s/it]

Best threshold: 0.0001
Best accuracy: 0.67385
Number of predictions: 20000





In [81]:
#Q4
X = numpy.array([(p1,p3) for p1,p3 in zip(prediction1, prediction3)])
y = numpy.array([(r > 0) for _,_,r in validWithNeg])

import sklearn
model = sklearn.linear_model.LogisticRegression(class_weight='balanced')
model.fit(X, y)

y_pred = model.predict(X)
acc4 = numpy.mean(y_pred == y)  


In [82]:
print(acc3, acc4)

0.67385 0.7456


In [83]:
answers['Q3'] = acc3
answers['Q4'] = acc4

In [84]:
assertFloat(answers['Q3'])
assertFloat(answers['Q4'])

In [111]:
best_threshold


0.0001

In [109]:
predictions = open("predictions_Read.csv", 'w')

for l in open("pairs_Read.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split(',')
    #gbt prediction
    prediction = gbm.predict([[1 if b in return1 else 0, 1 if max_sim > best_threshold else 0]])
    predictions.write(u + ',' + b + ',' + str(prediction) + '\n')

predictions.close()

In [110]:
answers['Q5'] = "I confirm that I have uploaded an assignment submission to gradescope"

In [87]:
assert type(answers['Q5']) == str

In [88]:
##################################################
# Rating prediction                              #
##################################################

In [89]:
users = list(set(u for u,_,_ in ratingsTrain))
items = list(set(b for _,b,_ in ratingsTrain))
len(users)


27914

In [90]:
from scipy.sparse import lil_matrix
from sklearn.linear_model import Ridge

def fit_bias_model_sklearn_sparse(train_data, valid_data, lambda_reg=1.0):
    # 构建用户和物品的索引映射
    users = list(set(u for u,_,_ in train_data))
    items = list(set(b for _,b,_ in train_data))
    user_to_idx = {u:i for i,u in enumerate(users)}
    item_to_idx = {b:i for i,b in enumerate(items)}
    
    n_users = len(users)
    n_items = len(items)
    
    # 使用稀疏矩阵构建训练数据
    X_train = lil_matrix((len(train_data), n_users + n_items))
    y_train = numpy.zeros(len(train_data))
    
    for i, (u,b,r) in enumerate(train_data):
        X_train[i, user_to_idx[u]] = 1  # 用户one-hot编码
        X_train[i, n_users + item_to_idx[b]] = 1  # 物品one-hot编码
        y_train[i] = r
    
    # 训练模型
    model = Ridge(alpha=lambda_reg, fit_intercept=True, solver='sag')
    model.fit(X_train, y_train)
    
    # 使用稀疏矩阵构建验证数据
    X_valid = lil_matrix((len(valid_data), n_users + n_items))
    y_valid = numpy.zeros(len(valid_data))
    
    for i, (u,b,r) in enumerate(valid_data):
        if u in user_to_idx and b in item_to_idx:
            X_valid[i, user_to_idx[u]] = 1
            X_valid[i, n_users + item_to_idx[b]] = 1
        y_valid[i] = r
    
    y_pred = model.predict(X_valid)
    valid_mse = numpy.mean((y_valid - y_pred) ** 2)
    
    return valid_mse, model.intercept_, model.coef_[:n_users], model.coef_[n_users:], model

# 运行模型
validMSE, alpha, beta_user, beta_item, model = fit_bias_model_sklearn_sparse(ratingsTrain, ratingsValid)

In [91]:
### Question 6

In [106]:
print(validMSE)

1.4105993861905997


In [92]:
answers['Q6'] = validMSE

In [93]:
assertFloat(answers['Q6'])

In [94]:
### Question 7

In [95]:
max_beta_user_idx = numpy.argmax(beta_user)
min_beta_user_idx = numpy.argmin(beta_user)

maxUser = users[max_beta_user_idx]
minUser = users[min_beta_user_idx]

maxBeta = float(beta_user[max_beta_user_idx])
minBeta = float(beta_user[min_beta_user_idx])

In [96]:
print(maxUser, minUser, maxBeta, minBeta)
print(type(maxUser), type(minUser), type(maxBeta), type(minBeta))

u18223169 u88024921 1.8198605120695182 -3.562425412210544
<class 'str'> <class 'str'> <class 'float'> <class 'float'>


In [97]:
answers['Q7'] = [maxUser, minUser, maxBeta, minBeta]

In [98]:
assert [type(x) for x in answers['Q7']] == [str, str, float, float]

In [99]:
### Question 8

In [100]:
est_lambda = None
best_mse = float('inf')

# 测试不同的λ值
lambda_values = numpy.arange(0.1, 10.1, 0.5)
lambda_reg = 4.6
validMSE, _, _, _, model = fit_bias_model_sklearn_sparse(ratingsTrain, ratingsValid, lambda_reg)

for lambda_reg in tqdm(lambda_values):
    validMSE, _, _, _, model = fit_bias_model_sklearn_sparse(ratingsTrain, ratingsValid, lambda_reg)
    if validMSE < best_mse:
        best_mse = validMSE
        best_lambda = lambda_reg
        best_model = model
lamb = best_lambda
validMSE = best_mse

100%|██████████| 20/20 [00:25<00:00,  1.29s/it]


In [101]:
print(lamb, validMSE)

4.6 1.4105993861905997


In [102]:
answers['Q8'] = (lamb, validMSE)

In [103]:
assertFloat(answers['Q8'][0])
assertFloat(answers['Q8'][1])

In [104]:
user_to_idx = {u:i for i,u in enumerate(users)}
item_to_idx = {b:i for i,b in enumerate(items)}
predictions = open("predictions_Rating.csv", 'w')
for l in open("pairs_Rating.csv"):
    if l.startswith("userID"): # header
        predictions.write(l)
        continue
    u,b = l.strip().split(',') # Read the user and item from the "pairs" file and write out your prediction
    
    # 构建特征向量
    user_idx = user_to_idx.get(u, -1)
    item_idx = item_to_idx.get(b, -1)
    
    if user_idx == -1 or item_idx == -1:
        # 如果用户或物品不在训练集中，使用默认预测值
        prediction = model.intercept_
    else:
        # 创建一个稀疏特征向量
        feature_vector = numpy.zeros((1, len(users) + len(items)))
        feature_vector[0, user_idx] = 1
        feature_vector[0, len(users) + item_idx] = 1
        
        # 使用模型进行预测
        prediction = best_model.predict(feature_vector)[0]

    predictions.write(u + ',' + b + ',' + str(int(prediction)) + '\n')

predictions.close()

In [105]:
f = open("answers_hw3.txt", 'w')
f.write(str(answers) + '\n')
f.close()