In [7]:
import random

## 读取数据

In [17]:
max_user_id,max_item_it = 0,0
records = []
#训练集比例
train_ratio = 0.9
with open('ratings.dat',encoding='ISO-8859-1') as f:
    for line in f:
        tks = line.strip().split('::')
        max_user_id = max(max_user_id,int(tks[0]))
        max_item_it = max(max_item_it,int(tks[1]))
        #以tuple的形式存储每一条评分记录
        records.append((int(tks[0])-1,int(tks[1])-1,int(tks[2])))
print("最大用户ID:{0}，最大Item ID:{1}. 总记录数：{2}".format(max_user_id,max_item_it,len(records)))
#打乱数据集顺序
random.shuffle(records)
#按一定比例划分测试集和训练集
train_list = records[0:int(len(records)* train_ratio)]
test_list = records[int(len(records)* train_ratio):]

最大用户ID:6040，最大Item ID:3952. 总记录数：1000209


## 把数据包装为torch数据集类型

In [14]:
import torch
import numpy as np
from torch.utils import data

In [15]:
class Dataset(data.Dataset):
    #输入是rating list : [(user,item,rate),...]
    def __init__(self,rating_list,n_user,n_item,user_based = True):
        self.data = rating_list
        self.user_based = user_based
        self.n_user = n_user
        self.n_item = n_item
        #构建共现矩阵
        self.x_mat = np.zeros((n_user, n_item))
        #mask矩阵，有评分记为1否则为0
        self.mask = np.zeros_like(self.x_mat)
        #三元组列表存到矩阵中
        for u,i,o in self.data:
            self.x_mat[u][i] = o
            self.mask[u][i] = 1
        #转为torch.Tensor
        self.x_mat = torch.from_numpy(self.x_mat).float() 
        self.mask = torch.from_numpy(self.mask).float() 
        #如果使用item-based协同过滤
        if not user_based:
            self.x_mat = self.x_mat.t()
            self.mask = self.mask.t()
    #重写getitem,len方法：
        #作用：当实例对象P做P[key]运算时，会自动调用__getitem__方法
    def __getitem__(self,idx):
        return self.x_mat[idx],self.mask[idx]
    def __len__(self):
        if self.user_based: return self.n_user
        return self.n_item
    def get_matrix(self):
        return self.x_mat, self.mask, self.user_based

In [173]:
train_data = Dataset(train_list,max_user_id,max_item_it,False)

In [174]:
train_data.get_matrix()[0].size()

torch.Size([3952, 6040])

In [85]:
max_user_id

6040

## 构建AutoEncoder模块

In [169]:
from torch import nn,optim
import torch.nn.functional as F

class AutoEncoder(nn.Module):
    def __init__(self,input_size,hidden_size,drop_out = 0.1):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size,hidden_size), 
            nn.Dropout(drop_out),
            nn.Sigmoid()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_size,input_size), 
            nn.Dropout(drop_out),
            nn.Sigmoid()
        )
    #前向传播,此处将评分归一化
    def forward(self,X):
        X = (X-1)/4
        X = self.decoder(self.encoder(X))
        X = torch.clamp(X,0,1.0)
        return X*4.0 + 1

In [192]:
batch_size = 128
#输入是max_used_id * max_item_id大小的Tensor
net = AutoEncoder( max_user_id, 300)
net.train()

if(train_data.user_based):
    feature_size = max_item_it
else:
    feature_size = max_user_id
    
def train(net, optim, train_loader, epoch):
    features = torch.zeros(batch_size, feature_size)
    masks = torch.zeros(batch_size, feature_size)
    
    for _, (feature, mask) in enumerate(train_loader):
        #这里是为了利用batch_size不是128的数据
        if mask.shape[0] == batch_size:
            features.copy_(feature)
            masks.copy_(mask)
        else:
            features = feature
            masks = mask
    #mask记录共现矩阵中哪里是1（即有评分的位置），用矩阵相乘可以加速计算
    optim.zero_grad()
    #AE恢复出的数据
    output = net(features)
    loss = F.mse_loss(output * masks, features * masks)
    loss.backward()
    optim.step()

def test(net,train_data,test_data):
    #输入全量矩阵
    x_mat = train_data.get_matrix()[0]
    #输出恢复的矩阵
    xc  = net(x_mat)
    if not train_data.user_based:
        xc = xc.t()
    xc = xc.detach().numpy()
    #用训练集恢复的数据看和测试集的差距
    rmse = 0.0
    for i,j,k in test_data:
        rmse += (xc[i][j] - k)**2
    rmse = np.sqrt(rmse/len(test_data))
    print('rmse:', rmse.item())
    
def run(net, train_data,test_data, num_epoch, batch_size = batch_size):
    #首先对net进行初始化，采用xavier_uniform
    def init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)
    #优化器：带动量的SGD
    optimizer = optim.SGD(net.parameters(), 0.2, momentum= 0.9, weight_decay= 1e-4)
    print("模型初始化参数完毕，开始训练")
    for epoch in range(1, num_epoch + 1):
        print('epoch: ' + str(epoch) + '/ ' +  str(num_epoch))
        train_loader = data.DataLoader(train_data, batch_size, shuffle=True)
        train(net, optimizer, train_loader, epoch)
        test(net,train_data, test_data)
        

In [194]:
run(net,train_data=j,test_data=test_list,num_epoch=50)

模型初始化参数完毕，开始训练
epoch: 1/ 50
rmse: 1.2692345392809132
epoch: 2/ 50
rmse: 1.2691351272014193
epoch: 3/ 50
rmse: 1.2690953027890337
epoch: 4/ 50
rmse: 1.2688918430179572
epoch: 5/ 50
rmse: 1.2685372216717046
epoch: 6/ 50
rmse: 1.268306313346322
epoch: 7/ 50
rmse: 1.2681253249475997
epoch: 8/ 50
rmse: 1.2677526955403762
epoch: 9/ 50
rmse: 1.2673948085114728
epoch: 10/ 50
rmse: 1.2671293250611437
epoch: 11/ 50
rmse: 1.2666144894085214
epoch: 12/ 50
rmse: 1.266371010036745
epoch: 13/ 50
rmse: 1.2655811019113627
epoch: 14/ 50
rmse: 1.2650658245782285
epoch: 15/ 50
rmse: 1.264842017368138
epoch: 16/ 50
rmse: 1.2641258012301526
epoch: 17/ 50
rmse: 1.2635871454219045
epoch: 18/ 50
rmse: 1.2633759129735416
epoch: 19/ 50
rmse: 1.2628902132123394
epoch: 20/ 50
rmse: 1.2623090766659821
epoch: 21/ 50
rmse: 1.261911394258746
epoch: 22/ 50
rmse: 1.2614363215996913
epoch: 23/ 50
rmse: 1.260931955739916
epoch: 24/ 50
rmse: 1.2602818556509503
epoch: 25/ 50
rmse: 1.2599431507353347
epoch: 26/ 50
rmse: 1.25