In [13]:
# 1  导入相关的库（4分）
from surprise import Dataset, Reader
from surprise import KNNBasic
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn as sk
# Use the SimHei font family for figure text so Chinese labels can be drawn
# (assumes SimHei is installed on this machine — TODO confirm).
plt.rcParams['font.family'] = 'SimHei'
# Turn off the Unicode minus glyph so negative tick labels use a plain hyphen.
plt.rcParams['axes.unicode_minus'] = False
from sklearn.model_selection import train_test_split

# 2  自行定义基于物品或者用户推荐的函数（4分）
def user_base_func(user_id, x_train, y_train, testset, k=10):
    """Recommend the top-``k`` items for one user with user-based KNN.

    Args:
        user_id: Id of the user to recommend for; compared against the
            ``uid`` field of every prediction.
        x_train: Surprise trainset the ``KNNBasic`` model is fitted on.
        y_train: Unused. Kept only so existing positional callers keep
            working; the trainset passed as ``x_train`` already carries the
            ratings.
        testset: Collection of entries to predict, passed unchanged to
            ``model.test`` (for example an anti-testset built from the
            trainset).
        k: Maximum number of item ids to return.

    Returns:
        A tuple ``(top_items, predictions)``: the item ids of the user's
        ``k`` highest estimated ratings (best first) and the full list of
        predictions returned by ``model.test``.
    """
    # user_based=True: similarities are computed between users.
    model = KNNBasic(sim_options={'user_based': True})
    # Surprise's fit() accepts the trainset as its only argument; the former
    # fit(x_train, y_train) call raised TypeError before any training happened.
    model.fit(x_train)
    predictions = model.test(testset)
    # Keep only this user's predictions, highest estimated rating first.
    user_predictions = sorted(
        (pred for pred in predictions if pred.uid == user_id),
        key=lambda pred: pred.est,
        reverse=True,
    )
    return [pred.iid for pred in user_predictions[:k]], predictions


# 基于物品推荐的函数
def item_base_func(item_id, x_train, y_train, testset, k=10):
    """Find the top-``k`` users for one item with item-based KNN.

    Args:
        item_id: Id of the item of interest; compared against the ``iid``
            field of every prediction.
        x_train: Surprise trainset the ``KNNBasic`` model is fitted on.
        y_train: Unused. Kept only so existing positional callers keep
            working; the trainset passed as ``x_train`` already carries the
            ratings.
        testset: Collection of entries to predict, passed unchanged to
            ``model.test``.
        k: Maximum number of user ids to return.

    Returns:
        A tuple ``(top_users, predictions)``: the ids of the ``k`` users with
        the highest estimated rating for ``item_id`` (best first) and the full
        list of predictions returned by ``model.test``.
    """
    # user_based=False is the key switch: similarities are computed between
    # items instead of between users.
    model = KNNBasic(sim_options={'user_based': False})
    # Surprise's fit() accepts the trainset as its only argument; the former
    # fit(x_train, y_train) call raised TypeError before any training happened.
    model.fit(x_train)
    predictions = model.test(testset)
    # Keep only predictions for this item, highest estimated rating first.
    item_predictions = sorted(
        (pred for pred in predictions if pred.iid == item_id),
        key=lambda pred: pred.est,
        reverse=True,
    )
    return [pred.uid for pred in item_predictions[:k]], predictions

In [18]:
import pandas as pd

# Read the ratings file. Fields are separated by '::' (a multi-character
# separator, hence the Python parsing engine) and the four column names
# (user, movie, rating, rating time) are supplied explicitly.
a1 = pd.read_csv(
    "E:///python代码/lx-yk/data-yk/yk2/sample_movielens_ratings.txt",
    sep='::',
    engine='python',
    names=['用户', '电影', '打分', '打分时间'],
)
print(a1)

      用户  电影  打分        打分时间
0      0   2   3  1424380312
1      0   3   1  1424380312
2      0   5   2  1424380312
3      0   9   4  1424380312
4      0  11   1  1424380312
...   ..  ..  ..         ...
1496  29  90   4  1424380312
1497  29  93   1  1424380312
1498  29  94   4  1424380312
1499  29  97   1  1424380312
1500  29  99   1  1424380312

[1501 rows x 4 columns]


In [24]:
# Build a user x movie table of ratings: one row per '用户' (user), one column
# per '电影' (movie), cell values taken from '打分' (rating).
rating_matrix = a1.pivot_table(index='用户', columns='电影', values='打分')

# NOTE(review): the result of this expression is discarded (it is neither
# assigned nor the cell's last line), so the missing-value counts are not shown.
rating_matrix.isnull().sum()
# Replace every missing rating with 0. From here on the matrix holds no NaN,
# so a 0 entry may simply mean "this user did not rate this movie".
rating_matrix=rating_matrix.fillna(0)
print(rating_matrix)

电影   0    1    2    3    4    5    6    7    8    9   ...   90   91   92   93  \
用户                                                    ...                       
0   0.0  0.0  3.0  1.0  0.0  2.0  0.0  0.0  0.0  4.0  ...  0.0  3.0  4.0  0.0   
1   0.0  0.0  2.0  1.0  2.0  0.0  1.0  0.0  0.0  3.0  ...  0.0  1.0  2.0  1.0   
2   0.0  0.0  0.0  0.0  3.0  0.0  1.0  0.0  5.0  1.0  ...  1.0  0.0  4.0  5.0   
3   1.0  1.0  1.0  0.0  0.0  0.0  0.0  3.0  3.0  1.0  ...  0.0  1.0  0.0  0.0   
4   0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  1.0  ...  0.0  0.0  0.0  0.0   
5   1.0  1.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0  3.0  ...  4.0  2.0  0.0  0.0   
6   1.0  1.0  3.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0  ...  0.0  2.0  0.0  0.0   
7   0.0  1.0  2.0  1.0  1.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0  2.0  0.0   
8   1.0  0.0  4.0  2.0  2.0  1.0  0.0  1.0  0.0  1.0  ...  1.0  0.0  2.0  0.0   
9   0.0  0.0  3.0  1.0  1.0  1.0  1.0  5.0  0.0  1.0  ...  3.0  0.0  0.0  0.0   
10  3.0  0.0  4.0  0.0  3.0 

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between movies: transposing puts one movie per row, so
# entry [i, j] compares the rating columns at positions i and j.
item_similarity_matrix = cosine_similarity(rating_matrix.T)
print(item_similarity_matrix)

[[1.         0.38301617 0.70156076 ... 0.36109269 0.43519414 0.39774756]
 [0.38301617 1.         0.33071891 ... 0.46423835 0.43082022 0.20833333]
 [0.70156076 0.33071891 1.         ... 0.45621056 0.37219368 0.37796447]
 ...
 [0.36109269 0.46423835 0.45621056 ... 1.         0.22857516 0.37139068]
 [0.43519414 0.43082022 0.37219368 ... 0.22857516 1.         0.46159309]
 [0.39774756 0.20833333 0.37796447 ... 0.37139068 0.46159309 1.        ]]


In [25]:
# Cosine similarity between users: entry [i, j] compares the rating rows at
# positions i and j of rating_matrix.
user_similarity_matrix = cosine_similarity(rating_matrix)
print(user_similarity_matrix)

[[1.         0.45528831 0.38560539 0.2720037  0.31983015 0.3683129
  0.43566659 0.36335261 0.41485479 0.34899904 0.27724886 0.31945277
  0.39451833 0.3028326  0.32850708 0.39386318 0.3717176  0.3149187
  0.27974891 0.33389486 0.38510531 0.40528187 0.35972321 0.29492753
  0.37991988 0.44381607 0.236439   0.3511042  0.4314555  0.27106466]
 [0.45528831 1.         0.323326   0.34451443 0.42592367 0.41879187
  0.4154972  0.42293442 0.48331136 0.37037555 0.37420681 0.2757359
  0.32593471 0.42350675 0.41160649 0.26490647 0.38257837 0.31098316
  0.33263102 0.34235128 0.4262088  0.34641202 0.44637218 0.27707964
  0.34653853 0.36483827 0.49854439 0.26625622 0.39901194 0.31753027]
 [0.38560539 0.323326   1.         0.36360512 0.35058105 0.2502096
  0.24569325 0.31404128 0.27728175 0.26003301 0.37061666 0.31031093
  0.22149894 0.41962647 0.31872838 0.24410577 0.18433358 0.21762133
  0.4159336  0.22316954 0.3038137  0.28730055 0.23434585 0.26909052
  0.28523853 0.32018175 0.22756569 0.41302266 0.41

In [26]:
def predict_item_based(user_id, item_id):
    """Predict a user's rating of a movie from the movies they already rated.

    The prediction is the similarity-weighted average of the user's own
    ratings, weighted by each rated movie's similarity to ``item_id``.

    Reads the module-level ``rating_matrix`` (users x movies) and
    ``item_similarity_matrix`` (movie x movie, same order as the columns).

    Args:
        user_id: Row label of the user in ``rating_matrix``.
        item_id: Column label of the movie in ``rating_matrix``.

    Returns:
        The predicted rating as a float, or ``None`` when the user has no
        other rated movie or the similarity weights sum to zero.
    """
    user_ratings = rating_matrix.loc[user_id]
    target_pos = rating_matrix.columns.get_loc(item_id)

    # A movie counts as rated only if its entry is neither NaN nor the 0 fill
    # value; otherwise unrated movies would pull every prediction towards 0.
    rated_mask = np.array(user_ratings.notna() & (user_ratings != 0), dtype=bool)
    # Never predict a movie from its own rating (its self-similarity is 1).
    rated_mask[target_pos] = False
    if not rated_mask.any():
        return None

    similarities = np.asarray(item_similarity_matrix, dtype=float)[target_pos][rated_mask]
    denominator = similarities.sum()
    if denominator == 0:
        return None
    ratings = user_ratings.to_numpy(dtype=float)[rated_mask]
    return float((ratings * similarities).sum() / denominator)

In [27]:
def predict_user_based(user_id, item_id):
    """Predict a user's rating of a movie from other users' ratings of it.

    The prediction is the similarity-weighted average of the ratings that
    other users gave ``item_id``, weighted by their similarity to ``user_id``.

    Reads the module-level ``rating_matrix`` (users x movies) and
    ``user_similarity_matrix`` (user x user, same order as the rows).

    Args:
        user_id: Row label of the user in ``rating_matrix``.
        item_id: Column label of the movie in ``rating_matrix``.

    Returns:
        The predicted rating as a float, or ``None`` when no other user rated
        the movie or the similarity weights sum to zero.
    """
    # The similarity matrix is a plain array ordered like rating_matrix.index,
    # so translate the user label into a row position before indexing it.
    user_pos = rating_matrix.index.get_loc(user_id)
    similarities = np.asarray(user_similarity_matrix, dtype=float)[user_pos]

    item_ratings = rating_matrix[item_id]
    # Only neighbours with a real rating contribute: NaN and the 0 fill value
    # both mean "not rated".
    usable = np.array(item_ratings.notna() & (item_ratings != 0), dtype=bool)
    # The user's own rating must not feed their own prediction.
    usable[user_pos] = False
    # Ignore neighbours whose similarity is undefined.
    usable &= ~np.isnan(similarities)
    if not usable.any():
        return None

    weights = similarities[usable]
    denominator = weights.sum()
    if denominator == 0:
        return None
    ratings = item_ratings.to_numpy(dtype=float)[usable]
    return float((ratings * weights).sum() / denominator)

In [28]:
def recommend_item_based(user_id, top_n=10):
    """Recommend unrated movies to a user using item-based predictions.

    Reads the module-level ``rating_matrix`` and scores every movie the user
    has not rated with ``predict_item_based``.

    Args:
        user_id: Row label of the user in ``rating_matrix``.
        top_n: Maximum number of movie ids to return (defaults to 10).

    Returns:
        Movie ids sorted by predicted rating, best first. Movies whose
        prediction is ``None`` are left out.
    """
    user_ratings = rating_matrix.loc[user_id]
    # "Unrated" means NaN or the 0 fill value. Checking isna() alone finds
    # nothing once the matrix has been zero-filled, which made the result
    # always empty.
    unrated_mask = (user_ratings.isna() | (user_ratings == 0)).to_numpy()
    unrated_items = rating_matrix.columns[unrated_mask]

    scored = []
    for item_id in unrated_items:
        prediction = predict_item_based(user_id, item_id)
        if prediction is not None:
            scored.append((item_id, prediction))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [item_id for item_id, _ in scored[:top_n]]

In [29]:
def recommend_user_based(user_id, top_n=10):
    """Recommend unrated movies to a user using user-based predictions.

    Reads the module-level ``rating_matrix`` and scores every movie the user
    has not rated with ``predict_user_based``.

    Args:
        user_id: Row label of the user in ``rating_matrix``.
        top_n: Maximum number of movie ids to return (defaults to 10).

    Returns:
        Movie ids sorted by predicted rating, best first. Movies whose
        prediction is ``None`` are left out.
    """
    user_ratings = rating_matrix.loc[user_id]
    # "Unrated" means NaN or the 0 fill value. Checking isna() alone finds
    # nothing once the matrix has been zero-filled, which made the result
    # always empty.
    unrated_mask = (user_ratings.isna() | (user_ratings == 0)).to_numpy()
    unrated_items = rating_matrix.columns[unrated_mask]

    scored = []
    for item_id in unrated_items:
        prediction = predict_user_based(user_id, item_id)
        if prediction is not None:
            scored.append((item_id, prediction))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [item_id for item_id, _ in scored[:top_n]]

In [30]:
from sklearn.model_selection import train_test_split
from math import sqrt

# Hold out 20% of the rating rows for evaluation. A fixed random_state makes
# the split, and therefore any error computed on test_data, reproducible
# across kernel restarts.
train_data, test_data = train_test_split(a1, test_size=0.2, random_state=42)
# User x movie rating tables for each split. NOTE(review): neither table is
# referenced by the code visible in this notebook section.
train_matrix = train_data.pivot_table(index='用户', columns='电影', values='打分')
test_matrix = test_data.pivot_table(index='用户', columns='电影', values='打分')

def evaluate_item_based():
    """Root-mean-square error of ``predict_item_based`` over ``test_data``.

    Each row of the module-level ``test_data`` supplies a user ('用户'), a
    movie ('电影') and the actual rating ('打分'). Rows whose prediction is
    ``None`` are skipped.

    Returns:
        The RMSE as a float, or ``None`` if no row produced a prediction.
    """
    scored_pairs = (
        (row['打分'], predict_item_based(row['用户'], row['电影']))
        for _, row in test_data.iterrows()
    )
    squared_errors = [
        (actual - predicted) ** 2
        for actual, predicted in scored_pairs
        if predicted is not None
    ]
    if not squared_errors:
        return None
    return sqrt(sum(squared_errors) / len(squared_errors))

In [31]:
def evaluate_user_based():
    """Root-mean-square error of ``predict_user_based`` over ``test_data``.

    Each row of the module-level ``test_data`` supplies a user ('用户'), a
    movie ('电影') and the actual rating ('打分'). Rows whose prediction is
    ``None`` are skipped.

    Returns:
        The RMSE as a float, or ``None`` if no row produced a prediction.
    """
    squared_error_total = 0
    scored_rows = 0
    for _, record in test_data.iterrows():
        observed = record['打分']
        estimate = predict_user_based(record['用户'], record['电影'])
        if estimate is None:
            # No prediction for this pair: it does not count towards the error.
            continue
        squared_error_total += (observed - estimate) ** 2
        scored_rows += 1
    if scored_rows == 0:
        return None
    return sqrt(squared_error_total / scored_rows)

In [32]:
import numpy as np

def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid of ``z``: a float scalar for scalar input,
        otherwise an array of the same shape.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in [0, 1], so it cannot overflow the way exp(-z)
    # does for large negative z.
    decay = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook form; for z < 0 it is the algebraically
    # equal exp(z) / (1 + exp(z)).
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

def cost_function(X, y, theta, eps=1e-15):
    """Mean binary cross-entropy of a logistic model.

    Args:
        X: Feature matrix; ``X.dot(theta)`` yields one score per sample.
        y: Array of 0/1 labels, one per sample.
        theta: Parameter vector.
        eps: Predicted probabilities are clipped to ``[eps, 1 - eps]`` before
            the logarithm is taken.

    Returns:
        The average cross-entropy over the ``len(y)`` samples.
    """
    m = len(y)
    # A saturated sigmoid returns exactly 0 or 1; log(0) would then turn the
    # whole cost into inf/nan, so keep the probabilities strictly inside (0, 1).
    h = np.clip(sigmoid(X.dot(theta)), eps, 1 - eps)
    cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
    return cost

def gradient_descent(X, y, theta, alpha, iterations):
    """Fit logistic-regression parameters with batch gradient descent.

    Args:
        X: Feature matrix.
        y: Label vector, one entry per row of ``X``.
        theta: Starting parameter vector; it is not modified in place.
        alpha: Learning rate.
        iterations: Number of update steps to run.

    Returns:
        A tuple ``(theta, costs)``: the parameters after the last step and the
        cost recorded after each step.
    """
    sample_count = len(y)
    cost_history = []
    for _ in range(iterations):
        # Residual between the predicted probabilities and the labels.
        residual = sigmoid(X.dot(theta)) - y
        step = X.T.dot(residual) / sample_count
        # Rebinding instead of updating in place leaves the caller's array untouched.
        theta = theta - alpha * step
        cost_history.append(cost_function(X, y, theta))
    return theta, cost_history

In [33]:
# Toy example: X is the feature matrix (one row per sample) and y the label vector.
X = np.array([
    [1, 2],
    [2, 3],
    [3, 4],
])
y = np.array([0, 1, 1])

# Start from all-zero parameters, one per feature column.
theta = np.zeros(X.shape[1])
alpha = 0.01
iterations = 1000

theta, costs = gradient_descent(X, y, theta, alpha=alpha, iterations=iterations)
print("最终的参数：", theta)

最终的参数： [ 0.8106436  -0.12011231]
