In [1]:
import gzip
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import normalize, OneHotEncoder
from sklearn import svm
from sklearn.linear_model import Ridge
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import time
from scipy.sparse import lil_matrix, csr_matrix
from tqdm import tqdm

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        yield l.strip().split(',')

In [3]:
def matrix_factorization_rating_prediction(ratingsTrain, ratingsValid, n_factors=50, reg=0.1):
    # 构建用户和物品的索引映射
    users = list(set(u for u, _, _ in ratingsTrain))
    items = list(set(b for _, b, _ in ratingsTrain))
    user_to_idx = {u: i for i, u in enumerate(users)}
    item_to_idx = {b: i for i, b in enumerate(items)}
    
    n_users = len(users)
    n_items = len(items)
    
    # 构建评分矩阵
    R = np.zeros((n_users, n_items))
    for u, b, r in ratingsTrain:
        try:
            R[user_to_idx[u], item_to_idx[b]] = float(r)  # 转换为浮点数
        except ValueError:
            R[user_to_idx[u], item_to_idx[b]] = 0.0  # 处理无法转换的情况
    
    # 使用SVD进行矩阵分解
    svd = TruncatedSVD(n_components=n_factors, random_state=42)
    U = svd.fit_transform(R)
    Sigma = svd.singular_values_
    Vt = svd.components_
    
    # 预测评分
    R_pred = np.dot(U, np.dot(np.diag(Sigma), Vt))
    
    # 计算验证集的MSE
    y_true = []
    y_pred = []
    for u, b, r in ratingsValid:
        if u in user_to_idx and b in item_to_idx:
            pred = R_pred[user_to_idx[u], item_to_idx[b]]
        else:
            pred = np.mean(R[R > 0])  # 使用全局平均评分
        try:
            y_true.append(float(r))  # 转换为浮点数
        except ValueError:
            y_true.append(np.mean(R[R > 0]))  # 无法转换时使用全局平均评分
        y_pred.append(pred)
    
    mse = mean_squared_error(y_true, y_pred)
    print(f"Validation MSE: {mse}")
    return mse, R_pred


In [4]:
def feature_engineering_read_prediction(ratingsTrain, return1, ratingsPerUser, usersPerItem):
    # 构建特征和标签
    features = []
    labels = []
    user_avg_rating = defaultdict(float)
    item_count = defaultdict(int)
    user_interaction_count = defaultdict(int)
    
    # 首先计算每个用户的总评分和交互次数
    for u, b, r in ratingsTrain:
        try:
            user_avg_rating[u] += float(r)
            user_interaction_count[u] += 1
            item_count[b] += 1
        except (ValueError, TypeError):
            continue
    
    # 计算平均评分，避免除以零
    for u in user_avg_rating:
        if user_interaction_count[u] > 0:  # 只有当有交互时才计算平均值
            user_avg_rating[u] /= user_interaction_count[u]
        else:
            user_avg_rating[u] = 0.0  # 如果没有交互，设置默认值为0

    for u, b, r in tqdm(ratingsTrain):
        feature = []
        # 特征1：用户平均评分
        feature.append(user_avg_rating[u])
        
        # 特征2：物品被评分次数
        feature.append(item_count[b])
        
        # 特征3：用户-物品互动次数
        interaction_count = len([1 for _, item, _ in ratingsTrain if item == b and _ == u])
        feature.append(interaction_count)
        
        # 特征4：是否在热门物品中
        feature.append(1 if b in return1 else 0)
        
        # 特征5：用户与物品的Jaccard相似度
        user_books = ratingsPerUser[u]
        max_sim = max([jaccard_similarity(set(usersPerItem[b]), set(usersPerItem[read_book])) for read_book in user_books], default=0)
        feature.append(1 if max_sim > 0.01 else 0)
        
        features.append(feature)
        labels.append(1 if int(r) > 0 else 0)
    
    return np.array(features), np.array(labels)


In [5]:
def jaccard_similarity(s1, s2):
    intersection = len(s1 & s2)
    union = len(s1 | s2)
    return intersection / union if union != 0 else 0


In [14]:
def gradient_boosting_read_prediction(features, labels):
    X_train, X_valid, y_train, y_valid = train_test_split(features, labels, test_size=0.1, random_state=42)
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'seed': 42
    }
    
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=100,
                    valid_sets=[lgb_train, lgb_valid])
    
    y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
    y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]
    accuracy = accuracy_score(y_valid, y_pred_binary)
    print(f"Validation Accuracy: {accuracy}")
    return gbm


In [7]:
def fit_bias_model_sklearn_sparse(train_data, valid_data, lambda_reg=1.0):
    # 构建用户和物品的索引映射
    users = list(set(u for u,_,_ in train_data))
    items = list(set(b for _,b,_ in train_data))
    user_to_idx = {u:i for i,u in enumerate(users)}
    item_to_idx = {b:i for i,b in enumerate(items)}
    
    n_users = len(users)
    n_items = len(items)
    
    # 使用稀疏矩阵构建训练数据
    X_train = lil_matrix((len(train_data), n_users + n_items))
    y_train = np.zeros(len(train_data))
    
    for i, (u,b,r) in enumerate(train_data):
        X_train[i, user_to_idx[u]] = 1  # 用户one-hot编码
        X_train[i, n_users + item_to_idx[b]] = 1  # 物品one-hot编码
        y_train[i] = r
    
    # 训练模型
    model = Ridge(alpha=lambda_reg, fit_intercept=True, solver='sag')
    model.fit(X_train, y_train)
    
    # 使用稀疏矩阵构建验证数据
    X_valid = lil_matrix((len(valid_data), n_users + n_items))
    y_valid = np.zeros(len(valid_data))
    
    for i, (u,b,r) in enumerate(valid_data):
        if u in user_to_idx and b in item_to_idx:
            X_valid[i, user_to_idx[u]] = 1
            X_valid[i, n_users + item_to_idx[b]] = 1
        y_valid[i] = r
    
    y_pred = model.predict(X_valid)
    valid_mse = mean_squared_error(y_valid, y_pred)
    
    return valid_mse, model.intercept_, model.coef_[:n_users], model.coef_[n_users:], model


In [8]:

start_time = time.time()
print("ratings start")
allRatings = []
userRatings = defaultdict(list)

for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)

for u, b, r in ratingsTrain:
    ratingsPerUser[u].append(b)
    ratingsPerItem[b].append((u, int(r)))
    usersPerItem[b].add(u)
    itemsPerUser[u].add(b)

# 计算最受欢迎的书籍
bookCount = defaultdict(int)
totalRead = 0

for user, book, _ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

mostPopular = sorted([(count, book) for book, count in bookCount.items()], reverse=True)

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalRead * 0.5:
        break

# 评分预测
print("开始评分预测模型训练...")
#mse, R_pred = matrix_factorization_rating_prediction(ratingsTrain, ratingsValid)


ratings start
开始评分预测模型训练...


In [9]:

# 阅读预测
print("开始阅读预测特征工程...")
features, labels = feature_engineering_read_prediction(ratingsTrain, return1, ratingsPerUser, usersPerItem)



开始阅读预测特征工程...


100%|██████████| 190000/190000 [16:06<00:00, 196.64it/s]


In [15]:
print("开始训练梯度提升决策树模型...")
gbm = gradient_boosting_read_prediction(features, labels)

开始训练梯度提升决策树模型...
[LightGBM] [Info] Number of positive: 161795, number of negative: 9205
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 460
[LightGBM] [Info] Number of data points in the train set: 171000, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.946170 -> initscore=2.866583
[LightGBM] [Info] Start training from score 2.866583
Validation Accuracy: 0.9552631578947368


In [16]:

# 评分预测示例
print("开始进行评分预测...")
users = list(set(u for u, _, _ in ratingsTrain))
items = list(set(b for _, b, _ in ratingsTrain))
user_to_idx = {u:i for i, u in enumerate(users)}
item_to_idx = {b:i for i, b in enumerate(items)}

encoder = OneHotEncoder()
encoder.fit([[u,b] for u, b, _ in ratingsTrain])

model_rating = Ridge(alpha=4.6, fit_intercept=True, solver='sag')
X_train_rating = encoder.transform([[u, b] for u, b, _ in ratingsTrain])
y_train_rating = np.array([r for _, _, r in ratingsTrain])
model_rating.fit(X_train_rating, y_train_rating)

with open("predictions_Rating.csv", 'w') as predictions_rating:
    for l in open("pairs_Rating.csv"):
        if l.startswith("userID"):
            predictions_rating.write(l)
            continue
        u, b = l.strip().split(',')
        
        try:
            # 使用与训练时相同的encoder进行转换
            feature_vector = encoder.transform([[u, b]])
            prediction = model_rating.predict(feature_vector)[0]
        except:
            # 处理未知用户或书籍的情况
            prediction = 3  # 使用默认评分
        
        # 确保评分在合理范围内
        prediction = max(1, min(5, prediction))
        
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")


开始进行评分预测...
评分预测完成，结果已保存到 predictions_Rating.csv


In [30]:
from itertools import groupby


print("开始进行评分预测...")

# 1. 预计算统计信息
print("计算统计信息...")
user_stats = {}
item_stats = {}
all_ratings = []

# 首先收集所有有效评分
for u, b, r in tqdm(allRatings, desc="Collecting ratings"):
    try:
        rating = float(r)
        if not np.isnan(rating):  # 确保评分是有效的数字
            all_ratings.append((u, b, rating))
    except (ValueError, TypeError):
        continue

# 转换为numpy数组
ratings_array = np.array(all_ratings)
users = ratings_array[:, 0]
items = ratings_array[:, 1]
ratings = ratings_array[:, 2].astype(float)  # 确保评分是float类型

global_mean = np.mean(ratings)

# 计算用户统计信息
for u, g in tqdm(groupby(sorted(zip(users, ratings)), key=lambda x: x[0]), desc="Computing user stats"):
    user_ratings = np.array([float(r) for _, r in g])
    user_stats[u] = {
        'mean': np.mean(user_ratings),
        'std': np.std(user_ratings) if len(user_ratings) > 1 else 0,
        'count': len(user_ratings)
    }

# 计算物品统计信息
for b, g in tqdm(groupby(sorted(zip(items, ratings)), key=lambda x: x[0]), desc="Computing item stats"):
    item_ratings = np.array([float(r) for _, r in g])
    item_stats[b] = {
        'mean': np.mean(item_ratings),
        'std': np.std(item_ratings) if len(item_ratings) > 1 else 0,
        'count': len(item_ratings)
    }

# 2. 基线预测模型优化
def baseline_predictor(user, item):
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        
        # 动态权重调整
        u_weight = 1 / (1 + np.exp(-u_stats['count']/10))  # sigmoid函数
        i_weight = 1 / (1 + np.exp(-i_stats['count']/10))
        
        # 考虑评分标准差
        u_reliability = 1 / (1 + u_stats['std'])
        i_reliability = 1 / (1 + i_stats['std'])
        
        # 组合预测
        weights_sum = u_weight * u_reliability + i_weight * i_reliability
        if weights_sum > 0:
            base_pred = (u_weight * u_reliability * u_stats['mean'] + 
                        i_weight * i_reliability * i_stats['mean']) / weights_sum
        else:
            base_pred = global_mean
    elif user in user_stats:
        base_pred = 0.7 * user_stats[user]['mean'] + 0.3 * global_mean
    elif item in item_stats:
        base_pred = 0.7 * item_stats[item]['mean'] + 0.3 * global_mean
    else:
        base_pred = global_mean
    return base_pred

# 3. 增强特征工程
def create_features(user, item):
    u_stats = user_stats.get(user, {'mean': global_mean, 'std': 0, 'count': 0})
    i_stats = item_stats.get(item, {'mean': global_mean, 'std': 0, 'count': 0})
    baseline = baseline_predictor(user, item)
    
    # 基础特征
    features = [
        u_stats['mean'] - global_mean,     # 用户评分偏差
        i_stats['mean'] - global_mean,     # 物品评分偏差
        np.log1p(u_stats['count']),        # 用户评分数量（对数）
        np.log1p(i_stats['count']),        # 物品评分数量（对数）
        baseline - global_mean,            # 基线预测偏差
        u_stats['std'],                    # 用户标准差
        i_stats['std'],                    # 物品标准差
    ]
    
    # 交互特征
    features.extend([
        (u_stats['mean'] * i_stats['mean']) / 25,  # 均值交互
        abs(u_stats['mean'] - i_stats['mean']),    # 均值差异
        u_stats['std'] * i_stats['std'],           # 标准差交互
        np.log1p(u_stats['count']) * np.log1p(i_stats['count']), # 评分数量交互
        1 / (1 + u_stats['std']),         # 用户可靠性
        1 / (1 + i_stats['std']),         # 物品可靠性
    ])
    
    return features

# 4. 模型训练（使用交叉验证选择最佳参数）
print("准备训练数据...")
X_train = np.array([create_features(u, b) for u, b, r in tqdm(zip(users, items, ratings), desc="Creating features")])
y_train = ratings - global_mean

# 标准化特征
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# 使用ElasticNet而不是Ridge
print("训练模型...")
from sklearn.linear_model import ElasticNet
model = ElasticNet(
    alpha=0.001,  # 较小的正则化参数
    l1_ratio=0.1, # 主要使用L2正则化
    max_iter=1000,
    random_state=42
)
model.fit(X_train_scaled, y_train)

# 5. 预测
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,rating\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs, desc="Predicting ratings"):
        try:
            features = create_features(u, b)
            features_scaled = scaler.transform([features])
            pred_diff = model.predict(features_scaled)[0]
            prediction = global_mean + pred_diff
            
            # 智能范围限制
            u_stats = user_stats.get(u, {'std': 1.0, 'mean': global_mean})
            i_stats = item_stats.get(b, {'std': 1.0, 'mean': global_mean})
            
            # 自适应预测范围
            base = baseline_predictor(u, b)
            confidence = 1 / (1 + u_stats['std'] + i_stats['std'])  # 可信度
            allowed_diff = (2.0 - confidence) * 1.5  # 可信度越高，允许的差异越小
            
            # 加权组合
            w = 0.7  # 模型预测的权重
            prediction = w * prediction + (1-w) * base
            prediction = max(base - allowed_diff, min(base + allowed_diff, prediction))
            
        except:
            prediction = baseline_predictor(u, b)
        
        prediction = max(1, min(5, prediction))
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")

开始进行评分预测...
计算统计信息...


Collecting ratings:  88%|████████▊ | 176404/200000 [00:00<00:00, 885551.99it/s]

Collecting ratings: 100%|██████████| 200000/200000 [00:00<00:00, 894411.82it/s]
Computing user stats: 27943it [00:00, 40503.13it/s]
Computing item stats: 6688it [00:00, 22381.17it/s]


准备训练数据...


Creating features: 200000it [00:02, 82573.52it/s]


训练模型...
开始预测...


Predicting ratings: 100%|██████████| 10000/10000 [00:01<00:00, 6847.63it/s]

评分预测完成，结果已保存到 predictions_Rating.csv





In [31]:
print("开始进行评分预测...")

# 1. 预计算统计信息
print("计算统计信息...")
user_stats = {}
item_stats = {}
all_ratings = []

# 首先收集所有有效评分
for u, b, r in tqdm(allRatings, desc="Collecting ratings"):
    try:
        rating = float(r)
        if not np.isnan(rating):  # 确保评分是有效的数字
            all_ratings.append((u, b, rating))
    except (ValueError, TypeError):
        continue

# 转换为numpy数组
ratings_array = np.array(all_ratings)
users = ratings_array[:, 0]
items = ratings_array[:, 1]
ratings = ratings_array[:, 2].astype(float)  # 确保评分是float类型

global_mean = np.mean(ratings)

# 计算用户统计信息
for u, g in tqdm(groupby(sorted(zip(users, ratings)), key=lambda x: x[0]), desc="Computing user stats"):
    user_ratings = np.array([float(r) for _, r in g])
    user_stats[u] = {
        'mean': np.mean(user_ratings),
        'std': np.std(user_ratings) if len(user_ratings) > 1 else 0,
        'count': len(user_ratings)
    }

# 计算物品统计信息
for b, g in tqdm(groupby(sorted(zip(items, ratings)), key=lambda x: x[0]), desc="Computing item stats"):
    item_ratings = np.array([float(r) for _, r in g])
    item_stats[b] = {
        'mean': np.mean(item_ratings),
        'std': np.std(item_ratings) if len(item_ratings) > 1 else 0,
        'count': len(item_ratings)
    }
# 2. 构建稀疏评分矩阵
print("构建评分矩阵...")
from scipy.sparse import csr_matrix

# 创建用户和物品的索引映射
unique_users = sorted(list(user_stats.keys()))
unique_items = sorted(list(item_stats.keys()))
user_to_idx = {u: i for i, u in enumerate(unique_users)}
item_to_idx = {i: j for j, i in enumerate(unique_items)}

# 构建稀疏矩阵
row = []
col = []
data = []
for u, b, r in zip(users, items, ratings):
    if u in user_to_idx and b in item_to_idx:
        row.append(user_to_idx[u])
        col.append(item_to_idx[b])
        # 去除全局和用户/物品偏差
        adj_rating = float(r) - global_mean
        adj_rating -= (user_stats[u]['mean'] - global_mean)
        adj_rating -= (item_stats[b]['mean'] - global_mean)
        data.append(adj_rating)

R = csr_matrix((data, (row, col)), shape=(len(unique_users), len(unique_items)))

# 3. 使用截断SVD学习隐含特征
print("训练SVD模型...")
from sklearn.decomposition import TruncatedSVD
n_factors = 30  # 隐含特征数量
svd = TruncatedSVD(n_components=n_factors, random_state=42)
user_factors = svd.fit_transform(R)
item_factors = svd.components_

# 4. 基线预测模型
def baseline_predictor(user, item):
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        
        # 计算可靠性权重
        u_conf = np.tanh(u_stats['count'] / 10)  # 使用tanh函数
        i_conf = np.tanh(i_stats['count'] / 10)
        
        # 计算偏差
        u_bias = (u_stats['mean'] - global_mean) * u_conf
        i_bias = (i_stats['mean'] - global_mean) * i_conf
        
        # 加权预测
        pred = global_mean + 0.7 * u_bias + 0.7 * i_bias
        
        # 如果可以，添加矩阵分解预测
        try:
            u_idx = user_to_idx.get(user)
            i_idx = item_to_idx.get(item)
            if u_idx is not None and i_idx is not None:
                mf_pred = np.dot(user_factors[u_idx], item_factors[:, i_idx])
                # 根据用户和物品的评分数量调整矩阵分解的权重
                mf_weight = min(0.3, (u_conf + i_conf) / 4)
                pred = (1 - mf_weight) * pred + mf_weight * (mf_pred + global_mean)
        except:
            pass
        
        return pred
    elif user in user_stats:
        return 0.8 * user_stats[user]['mean'] + 0.2 * global_mean
    elif item in item_stats:
        return 0.8 * item_stats[item]['mean'] + 0.2 * global_mean
    return global_mean

# 5. 预测
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,rating\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs, desc="Predicting ratings"):
        try:
            # 获取基线预测
            base_pred = baseline_predictor(u, b)
            
            # 获取用户和物品的统计信息
            u_stats = user_stats.get(u, {'std': 1.0, 'mean': global_mean, 'count': 0})
            i_stats = item_stats.get(b, {'std': 1.0, 'mean': global_mean, 'count': 0})
            
            # 计算可信度
            u_conf = np.tanh(u_stats['count'] / 10)
            i_conf = np.tanh(i_stats['count'] / 10)
            reliability = (u_conf + i_conf) / 2
            
            # 根据可信度调整预测范围
            allowed_diff = 2.0 * (1 - reliability)
            prediction = base_pred
            
            # 确保预测在合理范围内
            prediction = max(base_pred - allowed_diff, min(base_pred + allowed_diff, prediction))
            prediction = max(1, min(5, prediction))
            
        except:
            prediction = global_mean
            prediction = max(1, min(5, prediction))
        
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")

开始进行评分预测...
计算统计信息...


Collecting ratings: 100%|██████████| 200000/200000 [00:00<00:00, 784379.38it/s]
Computing user stats: 27943it [00:00, 38952.26it/s]
Computing item stats: 6688it [00:00, 22944.85it/s]


构建评分矩阵...
训练SVD模型...
开始预测...


Predicting ratings: 100%|██████████| 10000/10000 [00:00<00:00, 62799.61it/s]

评分预测完成，结果已保存到 predictions_Rating.csv





In [17]:

# 阅读预测示例
print("开始进行阅读预测...")
# 构建测试集特征
test_features = []
test_users = []
test_books = []

with open("pairs_Read.csv", 'r') as test_file:
    for l in test_file:
        if l.startswith("userID"):
            continue
        u, b = l.strip().split(',')
        feature = []
        # 特征1：用户平均评分
        user_avg = np.mean([r for _, _, r in ratingsTrain if _ == u]) if u in ratingsPerUser else 0
        feature.append(user_avg)
        
        # 特征2：物品被评分次数
        item_cnt = len(ratingsPerItem[b]) if b in ratingsPerItem else 0
        feature.append(item_cnt)
        
        # 特征3：用户-物品互动次数
        interaction_cnt = len([1 for _, item, _ in ratingsTrain if item == b and _ == u]) if u in ratingsPerUser else 0
        feature.append(interaction_cnt)
        
        # 特征4：是否在热门物品中
        feature.append(1 if b in return1 else 0)
        
        # 特征5：用户与物品的Jaccard相似度
        if u in ratingsPerUser and b in usersPerItem:
            user_books = set(ratingsPerUser[u])
            sim = max([jaccard_similarity(usersPerItem[b], usersPerItem[read_book]) for read_book in user_books], default=0)
            feature.append(1 if sim > 0.01 else 0)
        else:
            feature.append(0)
        
        test_features.append(feature)
        test_users.append(u)
        test_books.append(b)

test_features = np.array(test_features)
read_predictions = gbm.predict(test_features)
read_predictions_binary = [1 if pred > 0.5 else 0 for pred in read_predictions]

with open("predictions_Read.csv", 'w') as predictions_read:
    predictions_read.write("userID,bookID,prediction\n")
    for u, b, pred in zip(test_users, test_books, read_predictions_binary):
        predictions_read.write(f"{u},{b},{pred}\n")

print("阅读预测完成，结果已保存到 predictions_Read.csv")

开始进行阅读预测...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


阅读预测完成，结果已保存到 predictions_Read.csv


In [32]:
# 2. 多个基线预测器
def baseline1(user, item):
    """基于加权平均的基线预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        w_u = np.tanh(u_stats['count'] / 10)
        w_i = np.tanh(i_stats['count'] / 10)
        pred = global_mean + \
               w_u * (u_stats['mean'] - global_mean) + \
               w_i * (i_stats['mean'] - global_mean)
        return pred
    elif user in user_stats:
        return 0.8 * user_stats[user]['mean'] + 0.2 * global_mean
    elif item in item_stats:
        return 0.8 * item_stats[item]['mean'] + 0.2 * global_mean
    return global_mean

def baseline2(user, item):
    """基于可靠性的基线预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        u_reliability = 1 / (1 + u_stats['std'])
        i_reliability = 1 / (1 + i_stats['std'])
        weights_sum = u_reliability + i_reliability
        if weights_sum > 0:
            pred = (u_reliability * u_stats['mean'] + 
                   i_reliability * i_stats['mean']) / weights_sum
            return 0.8 * pred + 0.2 * global_mean
    return global_mean

def baseline3(user, item):
    """基于评分数量的指数衰减预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        w_u = 1 - np.exp(-u_stats['count'] / 20)
        w_i = 1 - np.exp(-i_stats['count'] / 20)
        pred = ((w_u * u_stats['mean'] + w_i * i_stats['mean']) / 
                (w_u + w_i) if (w_u + w_i) > 0 else global_mean)
        return pred
    return global_mean

# 3. 预测集成
def ensemble_predict(user, item):
    # 获取各个基线预测
    pred1 = baseline1(user, item)
    pred2 = baseline2(user, item)
    pred3 = baseline3(user, item)
    
    # 获取用户和物品的统计信息
    u_stats = user_stats.get(user, {'std': 1.0, 'mean': global_mean, 'count': 0})
    i_stats = item_stats.get(item, {'std': 1.0, 'mean': global_mean, 'count': 0})
    
    # 计算可信度
    u_conf = np.tanh(u_stats['count'] / 10)
    i_conf = np.tanh(i_stats['count'] / 10)
    reliability = (u_conf + i_conf) / 2
    
    # 根据可信度调整权重
    if reliability > 0.8:
        # 高可信度时更信任baseline1
        weights = [0.5, 0.25, 0.25]
    elif reliability > 0.5:
        # 中等可信度时平均权重
        weights = [0.4, 0.3, 0.3]
    else:
        # 低可信度时更信任baseline2和baseline3
        weights = [0.2, 0.4, 0.4]
    
    # 加权平均
    prediction = (weights[0] * pred1 + 
                 weights[1] * pred2 + 
                 weights[2] * pred3)
    
    # 自适应范围限制
    allowed_diff = 1.5 * (1 - reliability)
    base = (u_stats['mean'] + i_stats['mean']) / 2
    prediction = max(base - allowed_diff, min(base + allowed_diff, prediction))
    
    return prediction

# 4. 预测
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,rating\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs, desc="Predicting ratings"):
        try:
            prediction = ensemble_predict(u, b)
            
            # 最终范围限制
            prediction = max(1, min(5, prediction))
            
        except:
            # 使用最保守的预测
            prediction = global_mean
            prediction = max(1, min(5, prediction))
        
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")

开始预测...


Predicting ratings: 100%|██████████| 10000/10000 [00:00<00:00, 74126.05it/s]

评分预测完成，结果已保存到 predictions_Rating.csv





In [36]:
print("计算扩展统计信息...")
rating_distribution = {}
for r in ratings:
    r = int(round(r))
    rating_distribution[r] = rating_distribution.get(r, 0) + 1

total_ratings = sum(rating_distribution.values())
rating_probs = {r: count/total_ratings for r, count in rating_distribution.items()}

# 计算每个用户的评分分布
user_rating_dist = defaultdict(lambda: defaultdict(int))
for u, r in zip(users, ratings):
    r = int(round(float(r)))
    user_rating_dist[u][r] += 1

def baseline1(user, item):
    """基于加权平均的基线预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        w_u = np.tanh(u_stats['count'] / 10)
        w_i = np.tanh(i_stats['count'] / 10)
        pred = global_mean + \
               w_u * (u_stats['mean'] - global_mean) + \
               w_i * (i_stats['mean'] - global_mean)
        return pred
    elif user in user_stats:
        return 0.85 * user_stats[user]['mean'] + 0.15 * global_mean
    elif item in item_stats:
        return 0.85 * item_stats[item]['mean'] + 0.15 * global_mean
    return global_mean

def baseline2(user, item):
    """基于标准差的可靠性预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        u_reliability = 1 / (1 + u_stats['std'])
        i_reliability = 1 / (1 + i_stats['std'])
        weights_sum = u_reliability + i_reliability
        if weights_sum > 0:
            pred = (u_reliability * u_stats['mean'] + 
                   i_reliability * i_stats['mean']) / weights_sum
            return 0.85 * pred + 0.15 * global_mean
    return global_mean

def baseline3(user, item):
    """基于评分数量的指数衰减预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        w_u = 1 - np.exp(-u_stats['count'] / 15)
        w_i = 1 - np.exp(-i_stats['count'] / 15)
        pred = ((w_u * u_stats['mean'] + w_i * i_stats['mean']) / 
                (w_u + w_i) if (w_u + w_i) > 0 else global_mean)
        return pred
    return global_mean

def baseline4(user, item):
    """保守预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        # 向全局均值收缩
        shrinkage = 0.85
        u_pred = shrinkage * u_stats['mean'] + (1 - shrinkage) * global_mean
        i_pred = shrinkage * i_stats['mean'] + (1 - shrinkage) * global_mean
        return (u_pred + i_pred) / 2
    return global_mean

def ensemble_predict(user, item):
    # 获取所有基线预测
    preds = [
        baseline1(user, item),
        baseline2(user, item),
        baseline3(user, item),
        baseline4(user, item)
    ]
    
    # 获取统计信息
    u_stats = user_stats.get(user, {'std': 1.0, 'mean': global_mean, 'count': 0})
    i_stats = item_stats.get(item, {'std': 1.0, 'mean': global_mean, 'count': 0})
    
    # 计算可信度
    rating_confidence = np.tanh((u_stats['count'] + i_stats['count']) / 20)
    std_confidence = 1 / (1 + u_stats['std'] + i_stats['std'])
    
    # 综合可信度
    reliability = (rating_confidence + std_confidence) / 2
    
    # 根据可信度分配权重
    if reliability > 0.8:
        weights = [0.35, 0.25, 0.2, 0.2]  # 高可信度时更信任baseline1
    elif reliability > 0.5:
        weights = [0.3, 0.3, 0.2, 0.2]    # 中等可信度时平均权重
    else:
        weights = [0.2, 0.3, 0.25, 0.25]  # 低可信度时更信任其他预测器
    
    # 加权平均
    prediction = sum(w * p for w, p in zip(weights, preds))
    
    # 自适应范围限制
    allowed_diff = 1.2 * (1 - reliability)
    base = (u_stats['mean'] + i_stats['mean']) / 2
    prediction = max(base - allowed_diff, min(base + allowed_diff, prediction))
    
    return max(1, min(5, prediction))

# 4. 预测（保持不变）
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,prediction\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs, desc="Predicting ratings"):
        try:
            prediction = ensemble_predict(u, b)
        except:
            prediction = global_mean
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")

计算扩展统计信息...
开始预测...


Predicting ratings: 100%|██████████| 10000/10000 [00:00<00:00, 68694.78it/s]

评分预测完成，结果已保存到 predictions_Rating.csv





In [41]:
def baseline1(user, item):
    """基于加权平均的基线预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        w_u = np.tanh(u_stats['count'] / 9)  # 微调阈值
        w_i = np.tanh(i_stats['count'] / 11)
        pred = global_mean + \
               w_u * (u_stats['mean'] - global_mean) + \
               w_i * (i_stats['mean'] - global_mean)
        return pred
    elif user in user_stats:
        return 0.87 * user_stats[user]['mean'] + 0.13 * global_mean
    elif item in item_stats:
        return 0.83 * item_stats[item]['mean'] + 0.17 * global_mean
    return global_mean

def baseline2(user, item):
    """基于标准差的可靠性预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        u_reliability = 1 / (1 + 0.9 * u_stats['std'])
        i_reliability = 1 / (1 + 0.9 * i_stats['std'])
        weights_sum = u_reliability + i_reliability
        if weights_sum > 0:
            pred = (u_reliability * u_stats['mean'] + 
                   i_reliability * i_stats['mean']) / weights_sum
            return 0.86 * pred + 0.14 * global_mean
    return global_mean

def baseline3(user, item):
    """基于评分数量的指数衰减预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        w_u = 1 - np.exp(-u_stats['count'] / 14)
        w_i = 1 - np.exp(-i_stats['count'] / 16)
        pred = ((w_u * u_stats['mean'] + w_i * i_stats['mean']) / 
                (w_u + w_i) if (w_u + w_i) > 0 else global_mean)
        return pred
    return global_mean

def baseline4(user, item):
    """保守预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        shrinkage = 0.84  # 微调收缩率
        u_pred = shrinkage * u_stats['mean'] + (1 - shrinkage) * global_mean
        i_pred = shrinkage * i_stats['mean'] + (1 - shrinkage) * global_mean
        return (u_pred + i_pred) / 2
    return global_mean

# ... (保持其他预测器不变)

def ensemble_predict(user, item):
    preds = [
        baseline1(user, item),
        baseline2(user, item),
        baseline3(user, item),
        baseline4(user, item)
    ]
    
    u_stats = user_stats.get(user, {'std': 1.0, 'mean': global_mean, 'count': 0})
    i_stats = item_stats.get(item, {'std': 1.0, 'mean': global_mean, 'count': 0})
    
    # 调整可信度计算
    rating_confidence = np.tanh((u_stats['count'] + i_stats['count']) / 19)
    std_confidence = 1 / (1 + 0.95 * (u_stats['std'] + i_stats['std']))
    
    # 添加均值差异的影响，但权重很小
    mean_diff = abs(u_stats['mean'] - i_stats['mean']) / 5
    mean_confidence = 1 - mean_diff
    
    # 综合可信度计算
    reliability = (0.5 * rating_confidence + 0.4 * std_confidence + 0.1 * mean_confidence)
    
    # 更细致的权重分配
    if reliability > 0.85:  # 极高可信度
        weights = [0.4, 0.25, 0.175, 0.175]
    elif reliability > 0.7:  # 高可信度
        weights = [0.35, 0.25, 0.2, 0.2]
    elif reliability > 0.5:  # 中等可信度
        weights = [0.3, 0.3, 0.2, 0.2]
    else:  # 低可信度
        weights = [0.2, 0.3, 0.25, 0.25]
    
    prediction = sum(w * p for w, p in zip(weights, preds))
    
    # 根据可信度调整范围限制
    if reliability > 0.85:
        allowed_diff = 1.0 * (1 - reliability)  # 极高可信度时更严格的限制
    else:
        allowed_diff = 1.15 * (1 - reliability)
    
    base = (u_stats['mean'] + i_stats['mean']) / 2
    prediction = max(base - allowed_diff, min(base + allowed_diff, prediction))
    
    return max(1, min(5, prediction))

In [42]:
# 4. 预测（保持不变）
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,prediction\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs, desc="Predicting ratings"):
        try:
            prediction = ensemble_predict(u, b)
        except:
            prediction = global_mean
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")

开始预测...


Predicting ratings: 100%|██████████| 10000/10000 [00:00<00:00, 68965.30it/s]

评分预测完成，结果已保存到 predictions_Rating.csv





In [43]:
import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
from tqdm import tqdm

print("开始进行评分预测...")

# 1. 数据预处理
print("预处理数据...")
user_ratings = defaultdict(list)
item_ratings = defaultdict(list)
rating_matrix_data = []

# 收集评分数据
for u, b, r in tqdm(allRatings):
    try:
        rating = float(r)
        user_ratings[u].append(rating)
        item_ratings[b].append(rating)
        rating_matrix_data.append((u, b, rating))
    except:
        continue

# 2. 计算统计特征
print("计算统计特征...")
global_mean = np.mean([r for _, _, r in rating_matrix_data])

user_stats = {}
item_stats = {}

for u, ratings in user_ratings.items():
    user_stats[u] = {
        'mean': np.mean(ratings),
        'std': np.std(ratings) if len(ratings) > 1 else 0,
        'count': len(ratings),
        'median': np.median(ratings)
    }

for b, ratings in item_ratings.items():
    item_stats[b] = {
        'mean': np.mean(ratings),
        'std': np.std(ratings) if len(ratings) > 1 else 0,
        'count': len(ratings),
        'median': np.median(ratings)
    }

# 3. 构建评分矩阵
print("构建评分矩阵...")
users = sorted(list(user_ratings.keys()))
items = sorted(list(item_ratings.keys()))
user_to_idx = {u: i for i, u in enumerate(users)}
item_to_idx = {i: j for j, i in enumerate(items)}

rows, cols, data = [], [], []
for u, b, r in rating_matrix_data:
    if u in user_to_idx and b in item_to_idx:
        rows.append(user_to_idx[u])
        cols.append(item_to_idx[b])
        data.append(float(r))

rating_matrix = csr_matrix((data, (rows, cols)), shape=(len(users), len(items)))

# 4. 矩阵分解
print("执行矩阵分解...")
n_components = 50
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_features = svd.fit_transform(rating_matrix)
item_features = svd.components_.T

# 5. 特征工程
def create_features(user, item, rating=None):
    features = {}
    
    # 用户特征
    u_stats = user_stats.get(user, {'mean': global_mean, 'std': 0, 'count': 0, 'median': global_mean})
    features['user_mean'] = u_stats['mean']
    features['user_std'] = u_stats['std']
    features['user_count'] = np.log1p(u_stats['count'])
    features['user_median'] = u_stats['median']
    
    # 物品特征
    i_stats = item_stats.get(item, {'mean': global_mean, 'std': 0, 'count': 0, 'median': global_mean})
    features['item_mean'] = i_stats['mean']
    features['item_std'] = i_stats['std']
    features['item_count'] = np.log1p(i_stats['count'])
    features['item_median'] = i_stats['median']
    
    # 交互特征
    features['mean_diff'] = abs(u_stats['mean'] - i_stats['mean'])
    features['std_sum'] = u_stats['std'] + i_stats['std']
    features['count_interaction'] = np.log1p(u_stats['count'] * i_stats['count'])
    
    # 矩阵分解特征
    try:
        u_idx = user_to_idx.get(user)
        i_idx = item_to_idx.get(item)
        if u_idx is not None and i_idx is not None:
            mf_pred = np.dot(user_features[u_idx], item_features[i_idx])
            features['mf_pred'] = mf_pred
            
            # 添加部分潜在因子作为特征
            for i in range(5):  # 使用前5个潜在因子
                features[f'user_factor_{i}'] = user_features[u_idx][i]
                features[f'item_factor_{i}'] = item_features[i_idx][i]
    except:
        features['mf_pred'] = global_mean
        for i in range(5):
            features[f'user_factor_{i}'] = 0
            features[f'item_factor_{i}'] = 0
    
    return features

# 6. 准备训练数据
print("准备训练数据...")
train_features = []
train_labels = []

for u, b, r in tqdm(rating_matrix_data):
    features = create_features(u, b, r)
    train_features.append(features)
    train_labels.append(float(r))

# 转换为DataFrame
import pandas as pd
train_df = pd.DataFrame(train_features)
train_labels = np.array(train_labels)

# 7. 训练模型
print("训练模型...")
params = {
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

model = lgb.LGBMRegressor(**params)
model.fit(train_df, train_labels)

# 8. 预测
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,prediction\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs):
        try:
            features = create_features(u, b)
            features_df = pd.DataFrame([features])
            prediction = model.predict(features_df)[0]
            
            # 智能范围限制
            u_stats = user_stats.get(u, {'mean': global_mean, 'std': 1.0})
            i_stats = item_stats.get(b, {'mean': global_mean, 'std': 1.0})
            
            base = (u_stats['mean'] + i_stats['mean']) / 2
            allowed_diff = 1.0 + 0.5 * (u_stats['std'] + i_stats['std']) / 2
            
            prediction = max(base - allowed_diff, min(base + allowed_diff, prediction))
            prediction = max(1, min(5, prediction))
            
        except:
            prediction = global_mean
            prediction = max(1, min(5, prediction))
        
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")

开始进行评分预测...
预处理数据...


100%|██████████| 190000/190000 [00:00<00:00, 370754.44it/s]


计算统计特征...
构建评分矩阵...
执行矩阵分解...
准备训练数据...


100%|██████████| 190000/190000 [00:02<00:00, 78556.83it/s]


训练模型...
开始预测...


100%|██████████| 10000/10000 [00:09<00:00, 1016.98it/s]

评分预测完成，结果已保存到 predictions_Rating.csv





In [45]:
import numpy as np
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
from tqdm import tqdm

print("开始进行评分预测...")

# 1. 数据预处理（保持不变）
print("预处理数据...")
user_ratings = defaultdict(list)
item_ratings = defaultdict(list)
rating_matrix_data = []

for u, b, r in tqdm(allRatings):
    try:
        rating = float(r)
        user_ratings[u].append(rating)
        item_ratings[b].append(rating)
        rating_matrix_data.append((u, b, rating))
    except:
        continue

# 2. 计算统计特征
print("计算统计特征...")
global_mean = np.mean([r for _, _, r in rating_matrix_data])

user_stats = {}
item_stats = {}

for u, ratings in user_ratings.items():
    ratings_array = np.array(ratings)
    user_stats[u] = {
        'mean': np.mean(ratings),
        'std': np.std(ratings) if len(ratings) > 1 else 0,
        'count': len(ratings),
        'median': np.median(ratings),
        'min': np.min(ratings),
        'max': np.max(ratings),
        'q25': np.percentile(ratings, 25),
        'q75': np.percentile(ratings, 75)
    }

for b, ratings in item_ratings.items():
    ratings_array = np.array(ratings)
    item_stats[b] = {
        'mean': np.mean(ratings),
        'std': np.std(ratings) if len(ratings) > 1 else 0,
        'count': len(ratings),
        'median': np.median(ratings),
        'min': np.min(ratings),
        'max': np.max(ratings),
        'q25': np.percentile(ratings, 25),
        'q75': np.percentile(ratings, 75)
    }

# 3. 构建评分矩阵
print("构建评分矩阵...")
users = sorted(list(user_ratings.keys()))
items = sorted(list(item_ratings.keys()))
user_to_idx = {u: i for i, u in enumerate(users)}
item_to_idx = {i: j for j, i in enumerate(items)}

rows, cols, data = [], [], []
for u, b, r in rating_matrix_data:
    if u in user_to_idx and b in item_to_idx:
        rows.append(user_to_idx[u])
        cols.append(item_to_idx[b])
        data.append(float(r) - global_mean)  # 中心化

rating_matrix = csr_matrix((data, (rows, cols)), shape=(len(users), len(items)))

# 4. 矩阵分解
print("执行矩阵分解...")
n_components = 30  # 减少组件数量
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_features = svd.fit_transform(rating_matrix)
item_features = svd.components_.T

# 5. 基线预测器
def baseline_predictor(user, item):
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        
        # 计算可靠性权重
        u_weight = np.tanh(u_stats['count'] / 10)
        i_weight = np.tanh(i_stats['count'] / 10)
        
        # 考虑评分范围
        u_range = u_stats['max'] - u_stats['min']
        i_range = i_stats['max'] - i_stats['min']
        range_weight = 1 / (1 + 0.1 * (u_range + i_range))
        
        pred = global_mean + \
               u_weight * (u_stats['mean'] - global_mean) + \
               i_weight * (i_stats['mean'] - global_mean)
        
        return pred * range_weight + global_mean * (1 - range_weight)
    elif user in user_stats:
        return 0.8 * user_stats[user]['mean'] + 0.2 * global_mean
    elif item in item_stats:
        return 0.8 * item_stats[item]['mean'] + 0.2 * global_mean
    return global_mean

# 6. 特征工程
def create_features(user, item, rating=None):
    features = {}
    
    # 基线预测
    features['baseline_pred'] = baseline_predictor(user, item)
    
    # 用户特征
    u_stats = user_stats.get(user, {
        'mean': global_mean, 'std': 0, 'count': 0, 'median': global_mean,
        'min': global_mean, 'max': global_mean, 'q25': global_mean, 'q75': global_mean
    })
    
    features['user_mean'] = u_stats['mean']
    features['user_std'] = u_stats['std']
    features['user_count'] = np.log1p(u_stats['count'])
    features['user_range'] = u_stats['max'] - u_stats['min']
    features['user_iqr'] = u_stats['q75'] - u_stats['q25']
    
    # 物品特征
    i_stats = item_stats.get(item, {
        'mean': global_mean, 'std': 0, 'count': 0, 'median': global_mean,
        'min': global_mean, 'max': global_mean, 'q25': global_mean, 'q75': global_mean
    })
    
    features['item_mean'] = i_stats['mean']
    features['item_std'] = i_stats['std']
    features['item_count'] = np.log1p(i_stats['count'])
    features['item_range'] = i_stats['max'] - i_stats['min']
    features['item_iqr'] = i_stats['q75'] - i_stats['q25']
    
    # 矩阵分解特征
    try:
        u_idx = user_to_idx.get(user)
        i_idx = item_to_idx.get(item)
        if u_idx is not None and i_idx is not None:
            mf_pred = np.dot(user_features[u_idx], item_features[i_idx]) + global_mean
            features['mf_pred'] = mf_pred
    except:
        features['mf_pred'] = global_mean
    
    return features

# 7. 训练模型
print("准备训练数据...")
train_features = []
train_labels = []

for u, b, r in tqdm(rating_matrix_data):
    features = create_features(u, b, r)
    train_features.append(features)
    train_labels.append(float(r))

import pandas as pd
train_df = pd.DataFrame(train_features)
train_labels = np.array(train_labels)

print("训练模型...")
params = {
    'objective': 'regression',
    'metric': 'mse',
    'num_leaves': 16,  # 减少叶子节点
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'min_child_samples': 20,  # 增加最小样本数
    'reg_alpha': 0.1,  # L1正则化
    'reg_lambda': 0.1,  # L2正则化
    'verbose': -1
}

model = lgb.LGBMRegressor(**params, n_estimators=100)
model.fit(train_df, train_labels)

# 8. 预测
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,rating\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs):
        try:
            features = create_features(u, b)
            features_df = pd.DataFrame([features])
            
            # 获取基线和模型预测
            baseline = features['baseline_pred']
            model_pred = model.predict(features_df)[0]
            
            # 加权组合预测
            w_model = 0.3  # 降低模型权重
            prediction = w_model * model_pred + (1 - w_model) * baseline
            
            # 范围限制
            u_stats = user_stats.get(u, {'mean': global_mean, 'std': 1.0})
            i_stats = item_stats.get(b, {'mean': global_mean, 'std': 1.0})
            
            base = (u_stats['mean'] + i_stats['mean']) / 2
            allowed_diff = 0.8  # 收紧范围限制
            
            prediction = max(base - allowed_diff, min(base + allowed_diff, prediction))
            prediction = max(1, min(5, prediction))
            
        except:
            prediction = global_mean
            prediction = max(1, min(5, prediction))
        
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")

开始进行评分预测...
预处理数据...


100%|██████████| 200000/200000 [00:00<00:00, 891702.18it/s]


计算统计特征...


构建评分矩阵...
执行矩阵分解...
准备训练数据...


100%|██████████| 200000/200000 [00:02<00:00, 84533.03it/s]


训练模型...
开始预测...


100%|██████████| 10000/10000 [00:09<00:00, 1038.23it/s]

评分预测完成，结果已保存到 predictions_Rating.csv





In [49]:
import numpy as np
from collections import defaultdict
from surprise import Dataset, Reader, SVDpp, SVD, KNNWithMeans, SlopeOne
from surprise.model_selection import GridSearchCV
from tqdm import tqdm
import pandas as pd

print("开始进行评分预测...")

# 1. 数据预处理
print("预处理数据...")
training_data = []
for u, b, r in tqdm(ratingsTrain):
    try:
        training_data.append([u, b, float(r)])
    except:
        continue

# 2. 计算统计信息
user_ratings = defaultdict(list)
item_ratings = defaultdict(list)
global_mean = np.mean([r for _, _, r in training_data])

for u, b, r in training_data:
    user_ratings[u].append(r)
    item_ratings[b].append(r)

user_stats = {}
for u, ratings in user_ratings.items():
    ratings = np.array(ratings)
    user_stats[u] = {
        'mean': np.mean(ratings),
        'std': np.std(ratings) if len(ratings) > 1 else 0,
        'count': len(ratings),
        'median': np.median(ratings),
        'q25': np.percentile(ratings, 25),
        'q75': np.percentile(ratings, 75)
    }

item_stats = {}
for b, ratings in item_ratings.items():
    ratings = np.array(ratings)
    item_stats[b] = {
        'mean': np.mean(ratings),
        'std': np.std(ratings) if len(ratings) > 1 else 0,
        'count': len(ratings),
        'median': np.median(ratings),
        'q25': np.percentile(ratings, 25),
        'q75': np.percentile(ratings, 75)
    }

# 3. 基线预测器
def baseline1(user, item):
    """基于加权平均的预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        w_u = np.tanh(u_stats['count'] / 10)
        w_i = np.tanh(i_stats['count'] / 10)
        pred = global_mean + \
               w_u * (u_stats['mean'] - global_mean) + \
               w_i * (i_stats['mean'] - global_mean)
        return pred
    elif user in user_stats:
        return 0.85 * user_stats[user]['mean'] + 0.15 * global_mean
    elif item in item_stats:
        return 0.85 * item_stats[item]['mean'] + 0.15 * global_mean
    return global_mean

def baseline2(user, item):
    """基于四分位数的预测器"""
    if user in user_stats and item in item_stats:
        u_stats = user_stats[user]
        i_stats = item_stats[item]
        u_iqr = u_stats['q75'] - u_stats['q25']
        i_iqr = i_stats['q75'] - i_stats['q25']
        reliability = 1 / (1 + 0.5 * (u_iqr + i_iqr))
        pred = reliability * (u_stats['median'] + i_stats['median']) / 2 + \
               (1 - reliability) * global_mean
        return pred
    return global_mean

# 4. 创建Surprise数据格式
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(pd.DataFrame(training_data, columns=['user', 'item', 'rating']), reader)
trainset = data.build_full_trainset()

# 5. 训练多个模型
print("训练模型...")
# SVD++
svdpp = SVDpp(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02)
svdpp.fit(trainset)

# 普通SVD
svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
svd.fit(trainset)

# KNN
knn = KNNWithMeans(k=40, min_k=1, sim_options={'name': 'pearson_baseline', 'user_based': False})
knn.fit(trainset)

# SlopeOne
slopeone = SlopeOne()
slopeone.fit(trainset)

# 6. 预测函数
def get_prediction(user, item):
    try:
        # 获取各个模型的预测
        pred_svdpp = svdpp.predict(user, item).est
        pred_svd = svd.predict(user, item).est
        pred_knn = knn.predict(user, item).est
        pred_slopeone = slopeone.predict(user, item).est
        pred_base1 = baseline1(user, item)
        pred_base2 = baseline2(user, item)
        
        # 获取统计信息
        u_stats = user_stats.get(user, {'mean': global_mean, 'std': 1.0, 'count': 0})
        i_stats = item_stats.get(item, {'mean': global_mean, 'std': 1.0, 'count': 0})
        
        # 计算可信度
        rating_conf = np.tanh((u_stats['count'] + i_stats['count']) / 20)
        std_conf = 1 / (1 + u_stats['std'] + i_stats['std'])
        reliability = (rating_conf + std_conf) / 2
        
        # 根据可信度分配权重
        if reliability > 0.8:
            weights = [0.3, 0.2, 0.1, 0.1, 0.2, 0.1]  # 高可信度时更信任SVD++
        elif reliability > 0.5:
            weights = [0.25, 0.15, 0.15, 0.15, 0.15, 0.15]  # 中等可信度时平均分配
        else:
            weights = [0.2, 0.1, 0.1, 0.1, 0.25, 0.25]  # 低可信度时更信任基线预测器
        
        # 加权平均
        predictions = [pred_svdpp, pred_svd, pred_knn, pred_slopeone, pred_base1, pred_base2]
        pred = sum(w * p for w, p in zip(weights, predictions))
        
        # 智能范围限制
        base = (u_stats['mean'] + i_stats['mean']) / 2
        allowed_diff = 0.8 + 0.2 * (1 / (1 + u_stats['std'] + i_stats['std']))
        pred = max(base - allowed_diff, min(base + allowed_diff, pred))
        
        return max(1, min(5, pred))
    
    except:
        return global_mean

# 7. 生成预测
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,prediction\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs):
        try:
            prediction = get_prediction(u, b)
        except:
            prediction = global_mean
        
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")

开始进行评分预测...
预处理数据...


  0%|          | 0/190000 [00:00<?, ?it/s]

100%|██████████| 190000/190000 [00:00<00:00, 490925.72it/s]


训练模型...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
开始预测...


100%|██████████| 10000/10000 [00:00<00:00, 17905.71it/s]

评分预测完成，结果已保存到 predictions_Rating.csv





In [None]:
import numpy as np
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import GridSearchCV
from tqdm import tqdm
import pandas as pd

print("开始进行评分预测...")

# 1. 数据预处理
print("预处理数据...")
training_data = []
for u, b, r in tqdm(ratingsTrain):
    try:
        training_data.append([u, b, float(r)])
    except:
        continue

# 2. 创建Surprise数据格式
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(pd.DataFrame(training_data, columns=['user', 'item', 'rating']), reader)

# 3. 网格搜索最佳参数
print("搜索最佳参数...")
param_grid = {
    'n_factors': [50, 100, 150],  # 潜在因子数量
    'n_epochs': [30, 40],         # 训练轮数
    'lr_all': [0.005, 0.007],     # 学习率
    'reg_all': [0.02, 0.1],       # 正则化参数
    'init_mean': [0],             # 初始化均值
    'init_std_dev': [0.1]         # 初始化标准差
}

gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], 
                 cv=3, n_jobs=-1, joblib_verbose=0)
gs.fit(data)

print(f"最佳参数: {gs.best_params['rmse']}")
print(f"最佳RMSE: {gs.best_score['rmse']}")

# 4. 使用最佳参数训练最终模型
print("训练最终模型...")
best_params = gs.best_params['rmse']
model = SVDpp(
    n_factors=best_params['n_factors'],
    n_epochs=best_params['n_epochs'],
    lr_all=best_params['lr_all'],
    reg_all=best_params['reg_all'],
    init_mean=best_params['init_mean'],
    init_std_dev=best_params['init_std_dev'],
    random_state=42,
    verbose=False
)

# 在完整训练集上训练
trainset = data.build_full_trainset()
model.fit(trainset)

# 5. 预测函数
def get_prediction(user, item):
    try:
        prediction = model.predict(user, item).est
        return max(1, min(5, prediction))  # 确保预测在1-5范围内
    except:
        return 3.0  # 如果预测失败，返回中间值

# 6. 生成预测
print("开始预测...")
with open("predictions_Rating.csv", 'w') as predictions_rating:
    predictions_rating.write("userID,bookID,rating\n")
    
    test_pairs = []
    with open("pairs_Rating.csv") as f:
        next(f)
        for l in f:
            test_pairs.append(l.strip().split(','))
    
    for u, b in tqdm(test_pairs):
        prediction = get_prediction(u, b)
        predictions_rating.write(f"{u},{b},{int(round(prediction))}\n")

print("评分预测完成，结果已保存到 predictions_Rating.csv")