In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import tqdm


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [4]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [5]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [6]:
answers = {}

In [7]:
# Some data structures that will be useful

In [8]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [9]:
len(allRatings)

200000

In [61]:
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
userPerItem = defaultdict(set)
itemPerUser = defaultdict(set)
for u,b,r in allRatings:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))
    userPerItem[b].add(u)
    itemPerUser[u].add(b)

In [11]:
##################################################
# Read prediction                                #
##################################################

In [55]:
# Copied from baseline code
bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalRead*0.7: break

In [13]:
# in[12] 定义负采样函数
def sample_negative(user, positive_books, all_books, num_samples=1):
    negative_samples = []
    while len(negative_samples) < num_samples:
        neg_book = random.choice(all_books)
        if neg_book not in positive_books:
            negative_samples.append(neg_book)
    return negative_samples

In [14]:
# in[13] 构建验证集
validation_positive = ratingsValid
validation_negative = []
all_books = list(bookCount.keys())

user_positive_books = defaultdict(set)
for u, b, r in ratingsTrain:
    user_positive_books[u].add(b)

for u, b, r in validation_positive:
    negative_books = sample_negative(u, user_positive_books[u], all_books)
    for neg_b in negative_books:
        validation_negative.append((u, neg_b, 0))

In [15]:
# 合并正负样本
validation_set = [(u, b, 1) for u, b, r in validation_positive] + validation_negative

# in[14] 评估基线模型的准确率
def evaluate_baseline(validation_set, mostPopular_set):
    correct = 0
    for u, b, label in validation_set:
        prediction = 1 if b in mostPopular_set else 0
        if prediction == label:
            correct += 1
    accuracy = correct / len(validation_set)
    return accuracy

acc1 = evaluate_baseline(validation_set, return1)
answers['Q1'] = acc1

In [16]:
### Question 1

In [17]:
answers['Q1'] = acc1

In [18]:
print(answers['Q1'])

0.71295


In [19]:
assertFloat(answers['Q1'])

In [20]:
### Question 2

In [21]:
# in[16] 寻找最佳阈值
def find_best_threshold(book_counts, total_read, validation_set):
    thresholds = sorted(book_counts.values())
    best_acc = 0
    best_threshold = thresholds[0]
    for threshold in thresholds:
        popular_set = set([book for book, count in book_counts.items() if count >= threshold])
        acc = evaluate_baseline(validation_set, popular_set)
        if acc > best_acc:
            best_acc = acc
            best_threshold = threshold
    return best_threshold, best_acc

threshold, acc2 = find_best_threshold(bookCount, totalRead, validation_set)
answers['Q2'] = [threshold, acc2]

In [None]:
return_threshold = set()
count = 0

threshold = 0.01*threshold
for ic, i in mostPopular:
    count += ic
    return_threshold.add(i)
    if count > totalRead*threshold: break

In [22]:
print(answers['Q2'])

[35, 0.7586]


In [23]:
answers['Q2'] = [threshold, acc2]

In [24]:
assertFloat(answers['Q2'][0])
assertFloat(answers['Q2'][1])

In [25]:
### Question 3/4

In [59]:
def jaccard_similarity(s1, s2):
    intersection = s1 & s2
    union = s1 | s2
    if not union:
        return 0
    return len(intersection) / len(union)

In [71]:
def evaluate_jaccard(validation_set, ratingsPerUser, ratingsPerItem, threshold):
    correct = 0
    for u, b, label in validation_set:
        read_items = [item for item, r in ratingsPerUser[u]]
        similarities = [jaccard_similarity(userPerItem[b], userPerItem[b_prime]) for b_prime in read_items]
        max_similarity = max(similarities) if similarities else 0
        prediction = 1 if max_similarity > threshold else 0
        if prediction == 0:
            users = [user for user, r in ratingsPerItem[b]]
            user_similarities = [jaccard_similarity(itemPerUser[b], itemPerUser[user]) for user in users]
            max_user_similarity = max(user_similarities) if user_similarities else 0
            prediction = 1 if max_user_similarity > threshold else 0
        popularity_pred = 1 if b in return1 else 0
        prediction = 1 if max(popularity_pred, prediction) else 0
        if prediction == label:
            correct += 1
    accuracy = correct / len(validation_set)
    return accuracy

In [78]:
def find_best_jaccard_threshold(validation_set, ratingsPerUser, ratingsPerItem):
    thresholds = [i * 0.001 for i in range(110, 116,1)]
    best_acc = 0
    best_threshold = 0
    for threshold in tqdm.tqdm(thresholds):
        acc = evaluate_jaccard(validation_set, ratingsPerUser, ratingsPerItem, threshold)
        if acc > best_acc:
            best_acc = acc
            best_threshold = threshold
    return best_threshold, best_acc

threshold_jaccard, acc3 = find_best_jaccard_threshold(validation_set, ratingsPerUser, ratingsPerItem)
answers['Q3'] = acc3

  0%|          | 0/6 [00:00<?, ?it/s]

100%|██████████| 6/6 [00:16<00:00,  2.76s/it]


In [79]:
print(threshold_jaccard,acc3)

0.112 0.9083


In [50]:
# in[24] 结合 Jaccard 和流行度阈值
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
def evaluate_combined(validation_set, mostPopular_set, ratingsPerUser, ratingsPerItem, jaccard_threshold):
    correct = 0
    features = []
    labels = []
    for u, b, label in validation_set:
        popularity_pred = 1 if b in mostPopular_set else 0
        read_items = [item for item, r in ratingsPerUser[u]]
        similarities = [jaccard_similarity(b, b_prime, ratingsPerItem) for b_prime in read_items]
        if similarities:
            jaccard_pred = 1 if max(similarities) > jaccard_threshold else 0
        else:
            jaccard_pred = 0

        users = [user for user, r in ratingsPerItem[b]]
        user_similarity = [jaccard_similarity(u, u_prime, ratingsPerUser) for u_prime in users]
        max_user_similarity = int(max(user_similarity)>jaccard_threshold) if user_similarity else 0
        user_pred = 1 if max_user_similarity > jaccard_threshold else 0 

        features.append([popularity_pred, jaccard_pred, user_pred])
        labels.append(label)
    df_features = pd.DataFrame(features, columns=['popularity', 'similarity', 'user_similarity'])
    df_labels = pd.Series(labels, name='label')
    X_train, X_val, y_train, y_val = train_test_split(df_features, df_labels, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy,clf

# 找到最佳组合阈值
def find_best_combined_threshold(validation_set, mostPopular_set, ratingsPerUser, ratingsPerItem):
    thresholds = [i * 0.01 for i in range(0, 21)]
    best_acc = 0
    best_threshold = 0
    for threshold in tqdm.tqdm(thresholds):
        acc = evaluate_combined(validation_set, mostPopular_set, ratingsPerUser, ratingsPerItem, threshold)
        if acc > best_acc:
            best_acc = acc
            best_threshold = threshold
    return best_threshold, best_acc

#threshold_combined, acc4 = find_best_combined_threshold(validation_set, return_threshold, ratingsPerUser, ratingsPerItem)
answers['Q4'] = acc4
print(threshold_combined,acc4)

0.09 1.0


In [43]:
print(threshold_combined,acc4)

0.09 1.0


In [51]:
_,clf = evaluate_combined(validation_set, return_threshold, ratingsPerUser, ratingsPerItem, threshold_combined)
with open("predictions_Read.csv", 'w') as predictions:
    for l in open("pairs_Read.csv"):
        if l.startswith("userID"):
            predictions.write(l)
            continue
        u, b = l.strip().split(',')
        # 使用结合后的模型进行预测
        popularity_pred = 1 if b in return_threshold else 0

        read_items = [item for item, r in ratingsPerUser[u]]
        similarities = [jaccard_similarity(b, b_prime, ratingsPerItem) for b_prime in read_items]
        max_similarity = max(similarities) if similarities else 0
        jaccard_pred = 1 if max_similarity > threshold_combined else 0

        users = [user for user, r in ratingsPerItem[b]]
        user_similarity = [jaccard_similarity(u, u_prime, ratingsPerUser) for u_prime in users]
        max_user_similarity = max(user_similarity) if user_similarity else 0 
        user_pred = 1 if max_user_similarity > threshold_combined else 0

        prediction = clf.predict([[popularity_pred, jaccard_pred, user_pred]])
        predictions.write(f"{u},{b},{prediction[0]}\n")


In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 构建特征和标签
features = []
labels = []
for u, b, label in validation_set:
    # 流行度特征
    popularity = 1 if b in return_threshold else 0
    # 相似度特征
    read_items = [item for item, r in ratingsPerUser[u]]
    similarities = [jaccard_similarity(b, b_prime, ratingsPerItem) for b_prime in read_items]
    similarity = max(similarities) if similarities else 0
    # 添加到特征列表
    features.append([popularity, similarity])
    labels.append(label)

# 转换为 DataFrame
df_features = pd.DataFrame(features, columns=['popularity', 'similarity'])
df_labels = pd.Series(labels, name='label')

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(df_features, df_labels, test_size=0.2, random_state=42)

# 训练随机森林分类器
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 在验证集上进行预测
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"集成方法验证集准确率: {accuracy:.4f}")

# 更新答案字典
answers['Q4_ensemble'] = accuracy

集成方法验证集准确率: 1.0000


In [54]:
print(clf.predict([[1, 0.9,0]]))

[1]


In [43]:
print(threshold_combined,acc4)

0.03 0.75025


In [44]:
answers['Q3'] = acc3
answers['Q4'] = acc4

In [45]:
assertFloat(answers['Q3'])
assertFloat(answers['Q4'])

In [56]:
# in[26] 生成 predictions_Read.csv
with open("predictions_Read.csv", 'w') as predictions:
    for l in open("pairs_Read.csv"):
        if l.startswith("userID"):
            predictions.write(l)
            continue
        u, b = l.strip().split(',')
        # 使用结合后的模型进行预测
        popularity_pred = 1 if b in return1 else 0
        read_items = [item for item, r in ratingsPerUser[u]]
        similarities = [jaccard_similarity(b, b_prime, ratingsPerItem) for b_prime in read_items]
        if similarities:
            jaccard_pred = 1 if max(similarities) > 0.09 else 0
        else:
            jaccard_pred = 0
        prediction = 1 if max(popularity_pred, jaccard_pred) else 0
        predictions.write(f"{u},{b},{prediction}\n")

# 将答案提交到 Gradescope
answers['Q5'] = "I confirm that I have uploaded an assignment submission to gradescope"

In [48]:
assert type(answers['Q5']) == str

In [49]:
##################################################
# Rating prediction                              #
##################################################