In [125]:
import os
import sys

import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils import data

## Data Loader

In [126]:
# Read the three pre-split rating files (train / validation / test) into
# DataFrames, in that order.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/UserRec/rating_train.csv',
        '../Data/UserRec/rating_validation.csv',
        '../Data/UserRec/rating_test.csv',
    )
)

In [127]:
# Reshape each long-format rating frame into a wide table with one row per
# userId and one column per movieId; cells hold the rating (NaN where the
# pair does not occur in that split).
train_table, valid_table, test_table = (
    ratings.pivot_table(index='userId', columns='movieId', values='rating')
    for ratings in (train, valid, test)
)

In [128]:
def load_data(data_path, train_ratio=0.9):
    """Read a ``::``-separated ratings file and split it into train/test lists.

    Every non-blank line must contain at least three integer fields,
    ``user::item::rating``. Blank lines (for example a trailing newline at the
    end of the file) are skipped instead of crashing the integer parser.

    NOTE(review): the first non-blank line only seeds the running maxima and
    is *not* added to the records. Confirm that this line is meant to be a
    header-like line rather than a real rating that is being dropped.

    Args:
        data_path: Path of the ratings file (decoded as ISO-8859-1).
        train_ratio: Fraction of the shuffled records put in the train list.

    Returns:
        ``(train_list, test_list, max_uid, max_vid)`` where both lists hold
        ``(user - 1, item - 1, rating)`` tuples (IDs shifted to 0-based) and
        the maxima are the largest raw user / item IDs seen in the file.

    Raises:
        SystemExit: if ``data_path`` does not exist (after printing a message).
    """
    if not os.path.exists(data_path):
        print('[Error] File %s not found!' % data_path)
        sys.exit(-1)

    max_uid = 0
    max_vid = 0
    records = []
    first_line_flag = True

    with open(data_path, encoding="ISO-8859-1") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse; int('') would raise ValueError.
                continue
            tks = line.split('::')  # split the line into a list of fields
            uid = int(tks[0])
            vid = int(tks[1])
            if first_line_flag:
                max_uid = uid
                max_vid = vid
                first_line_flag = False
                continue
            max_uid = max(max_uid, uid)
            max_vid = max(max_vid, vid)
            records.append((uid - 1, vid - 1, int(tks[2])))

    print("Max user ID {0}. Max item ID {1}. In total {2} ratings.".format(
        max_uid, max_vid, len(records)))

    # Shuffle in place, then cut once at the train/test boundary.
    np.random.shuffle(records)
    split_at = int(len(records) * train_ratio)
    train_list = records[:split_at]
    test_list = records[split_at:]
    return train_list, test_list, max_uid, max_vid

## Set Data

In [129]:
# class Dataset(data.Dataset):
#     def __init__(self, rating_list, n_user, n_item, user_based=True):
#         self.data = rating_list.to_numpy()
#         self.user_based = user_based
#         self.n_user = n_user
#         self.n_item = n_item
#         self.x_mat = rating_list.pivot_table(index='userId', columns='movieId', values='rating').to_numpy()
#         self.mask = rating_list.pivot_table(index='userId', columns='movieId', values='rating').notnull().astype("int").to_numpy()
#         self.x_mat = torch.from_numpy(self.x_mat).float()
#         self.mask = torch.from_numpy(self.mask).float()
#         if not self.user_based:
#             self.x_mat = self.x_mat.t()
#             self.mask = self.mask.t()

#     def __getitem__(self, index):
#         return self.x_mat[index], self.mask[index]

#     def __len__(self):
#         if self.user_based:
#             return self.n_user
#         return self.n_item

#     def get_mat(self):
#         return self.x_mat, self.mask, self.user_based


import numpy as np
import torch
from torch.utils import data

class Dataset(data.Dataset):
    """Dense rating matrix and observation mask, served one row at a time.

    The constructor scatters ``(user, item, rating)`` records into an
    ``n_user x n_item`` matrix (unrated cells stay 0) and builds a matching
    0/1 mask marking which cells were filled. When ``user_based`` is False
    both tensors are transposed so that each sample is an item row.

    Args:
        rating_list: Iterable of records whose first three fields are
            ``user index, item index, rating``. Any further fields (for
            example a timestamp column) are ignored. Indices are used as-is
            to address the matrix, so they must be smaller than
            ``n_user`` / ``n_item``.
        n_user: Number of rows of the user-by-item matrix.
        n_item: Number of columns of the user-by-item matrix.
        user_based: If True, samples are user rows; otherwise item rows.
    """

    def __init__(self, rating_list, n_user, n_item, user_based=True):
        self.data = rating_list
        self.user_based = user_based
        self.n_user = n_user
        self.n_item = n_item

        # Allocate as float32 from the start: the tensors end up float32
        # anyway, and staging them in float64 doubles the peak memory of
        # these dense matrices for no change in the stored values.
        ratings = np.zeros((n_user, n_item), dtype=np.float32)
        observed = np.zeros((n_user, n_item), dtype=np.float32)
        for record in self.data:
            u, v, r = record[:3]
            ratings[int(u), int(v)] = r
            observed[int(u), int(v)] = 1

        self.x_mat = torch.from_numpy(ratings)
        self.mask = torch.from_numpy(observed)
        if not self.user_based:
            self.x_mat = self.x_mat.t()
            self.mask = self.mask.t()

    def __getitem__(self, index):
        """Return the ``(ratings, mask)`` pair for row ``index``."""
        return self.x_mat[index], self.mask[index]

    def __len__(self):
        """Number of samples: users if user-based, items otherwise."""
        if self.user_based:
            return self.n_user
        return self.n_item

    def get_mat(self):
        """Return ``(x_mat, mask, user_based)`` for whole-matrix evaluation."""
        return self.x_mat, self.mask, self.user_based



## Network

In [130]:

from collections import OrderedDict

# AutoEncoder
class AutoEncoder(nn.Module):
    """Symmetric fully connected auto-encoder over rating vectors.

    ``hidden`` lists the layer widths from the input outwards. The encoder
    stacks ``Linear -> Dropout -> ReLU`` for each consecutive pair of widths;
    the decoder mirrors those pairs in reverse with
    ``Linear -> Dropout -> Sigmoid``.

    Args:
        hidden: Sequence of layer sizes; ``hidden[0]`` is the input width.
        dropout: Drop probability used by every ``nn.Dropout`` layer.
    """

    def __init__(self, hidden, dropout=0.1):
        super().__init__()

        # Encoder: widths hidden[0] -> hidden[1] -> ... -> hidden[-1].
        enc_layers = OrderedDict()
        for idx, (n_in, n_out) in enumerate(zip(hidden[:-1], hidden[1:])):
            # nn.Linear includes a bias term by default.
            enc_layers['enc_linear' + str(idx)] = nn.Linear(n_in, n_out)
            # (disabled) enc_layers['enc_bn' + str(idx)] = nn.BatchNorm1d(n_out)
            enc_layers['enc_drop' + str(idx)] = nn.Dropout(dropout)
            enc_layers['enc_relu' + str(idx)] = nn.ReLU()
        self.encoder = nn.Sequential(enc_layers)

        # Decoder: the same widths walked backwards. The OrderedDict keeps
        # insertion order, which is the order nn.Sequential applies layers.
        dec_layers = OrderedDict()
        for idx in reversed(range(1, len(hidden))):
            dec_layers['dec_linear' + str(idx)] = nn.Linear(hidden[idx], hidden[idx - 1])
            # (disabled) dec_layers['dec_bn' + str(idx)] = nn.BatchNorm1d(hidden[idx - 1])
            # Drop units with probability `dropout` to limit over-fitting.
            dec_layers['dec_drop' + str(idx)] = nn.Dropout(dropout)
            # The key says "relu" but the activation is a Sigmoid; the key is
            # kept because it is part of the module's state_dict naming.
            dec_layers['dec_relu' + str(idx)] = nn.Sigmoid()
        self.decoder = nn.Sequential(dec_layers)

    def forward(self, x):
        """Reconstruct ``x``; the returned values lie in ``[1, 5]``."""
        # A form of normalisation: (x - 1) / 4 maps ratings 1..5 onto 0..1.
        scaled = (x - 1) / 4.0
        reconstructed = self.decoder(self.encoder(scaled))
        # torch.clamp(input, min, max): keep the reconstruction inside [0, 1].
        reconstructed = torch.clamp(reconstructed, 0, 1.0)
        # Undo the normalisation so the output is back on the rating scale.
        return reconstructed * 4.0 + 1

## Model

In [131]:
import math
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch import optim, nn
import torch.nn.functional as F


class Model:
    """Training / evaluation wrapper around an ``AutoEncoder``.

    Args:
        hidden: Layer widths passed to ``AutoEncoder``; ``hidden[0]`` is the
            length of one input row (n_user or n_item).
        learning_rate: Learning rate of the SGD optimiser.
        batch_size: Number of rows per training mini-batch.
    """

    def __init__(self, hidden, learning_rate, batch_size):
        self.batch_size = batch_size
        self.net = AutoEncoder(hidden)
        # Alternative optimiser kept for reference:
        # self.opt = optim.Adam(self.net.parameters(), learning_rate)
        self.opt = optim.SGD(self.net.parameters(), learning_rate, momentum=0.9, weight_decay=1e-4)
        self.feature_size = hidden[0]  # n_user/n_item

    def run(self, trainset, trainlist, testlist, num_epoch):
        """Train for ``num_epoch`` epochs, printing train/test RMSE after each.

        Args:
            trainset: Dataset yielding ``(features, mask)`` rows and exposing
                ``get_mat()``.
            trainlist: Rating records used to report the training RMSE.
            testlist: Rating records used to report the test RMSE.
            num_epoch: Number of passes over ``trainset``.
        """
        # The loader is epoch-invariant (shuffle=True reshuffles on every
        # iteration), so build it once. Pinned memory only helps host-to-GPU
        # copies; requesting it without CUDA just triggers a warning.
        train_loader = DataLoader(trainset, self.batch_size, shuffle=True,
                                  pin_memory=torch.cuda.is_available())
        for epoch in range(1, num_epoch + 1):
            self.train(train_loader, epoch)
            self.test(trainset, trainlist, False)
            self.test(trainset, testlist, True)

    def train(self, train_loader, epoch):
        """Run one epoch of mini-batch training over ``train_loader``."""
        print ("Epoch %d:" % epoch)

        self.net.train()
        for feature, mask in train_loader:
            # Batches are used directly; .float() is a no-op for float32 input.
            features = feature.float()
            masks = mask.float()
            self.opt.zero_grad()
            output = self.net(features)
            # Masking both sides zeroes the error on unobserved cells.
            # NOTE(review): mse_loss still averages over *all* cells, so the
            # loss scale depends on how sparse the batch is.
            loss = F.mse_loss(output * masks, features * masks)
            #loss = F.mse_loss(output, features)
            loss.backward()
            self.opt.step()

    def test(self, trainset, testlist, test=True):
        """Print and return the RMSE of the reconstruction on ``testlist``.

        The network reconstructs the full matrix from ``trainset.get_mat()``
        and each record's first three fields ``(row, col, rating)`` are
        compared against the reconstructed cell. Extra fields are ignored.

        Args:
            trainset: Dataset exposing ``get_mat()``.
            testlist: Rating records to score.
            test: Selects the printed label ("Test" vs. "Train").

        Returns:
            The RMSE as a float, or ``nan`` when ``testlist`` is empty.
        """
        self.net.eval()
        x_mat, mask, user_based = trainset.get_mat()
        # Inference only: skip building an autograd graph for the full matrix.
        with torch.no_grad():
            xc = self.net(x_mat)
        if not user_based:
            # Item-based input is transposed; flip back to user x item.
            xc = xc.t()
        xc = xc.cpu().numpy()

        squared_error = 0.0
        for record in testlist:
            i, j, r = record[:3]
            diff = xc[int(i)][int(j)] - r
            squared_error += diff * diff
        n_records = len(testlist)
        rmse = math.sqrt(squared_error / n_records) if n_records else float('nan')

        if test:
            print ("Test RMSE = %f" % rmse)
        else:
            print ("Train RMSE = %f" % rmse, end='   ')
        return rmse

    @staticmethod
    def rmse(test, pred):
        """RMSE between two aligned pandas objects, ignoring missing cells.

        Declared static because it uses no instance state; without the
        decorator it could not be called on a ``Model`` instance.
        """
        error_df = test - pred
        # count() skips NaN, so only cells present in both inputs are counted.
        count = np.sum(error_df.count())
        return np.sqrt(np.sum(np.sum((error_df)**2)) / count)

In [132]:
def rmse(test, pred):
    """Root-mean-square error between two aligned pandas objects.

    The element-wise difference ``test - pred`` is squared and summed, then
    divided by the number of non-missing differences (``count()`` skips NaN),
    so cells absent from either input do not contribute.
    """
    residuals = test - pred
    n_valid = np.sum(residuals.count())
    squared_total = np.sum(np.sum(residuals ** 2))
    return np.sqrt(squared_total / n_valid)

In [133]:
import sys
sys.path.append('../../') 

from datetime import datetime
#from torch.utils import data

# parameters
# Hyper-parameters of the run.
rank = 100            # the single hidden layer is rank * 3 units wide
batch_size = 128
user_based = False    # False: each training sample is an item row

# Only referenced by the disabled load_data() call further down.
rating_dataset = '../OfflineFiles/MovieLens/ml-latest-small/ratings.csv'

# NOTE(review): hard-coded matrix dimensions; presumably chosen to exceed the
# largest userId / movieId so raw IDs can index the matrix directly - confirm
# against the CSV files.
n_user = 611
n_item = 197594

start = datetime.now()
# train_list, test_list, n_user, n_item = load_data(rating_dataset)
trainset = Dataset(train.to_numpy(), n_user, n_item, user_based)

# The network input width is the length of one sample row.
h = n_item if user_based else n_user

mod = Model(hidden=[h, rank * 3], learning_rate=0.2, batch_size=batch_size)
mod.run(trainset, train.to_numpy(), test.to_numpy(), num_epoch=50)

end = datetime.now()
print ("Total time: %s" % str(end-start))

Epoch 1:
Train RMSE = 1.153282   Test RMSE = 1.157134
Epoch 2:
Train RMSE = 1.141274   Test RMSE = 1.144400
Epoch 3:
Train RMSE = 1.130567   Test RMSE = 1.133083
Epoch 4:
Train RMSE = 1.120282   Test RMSE = 1.122205
Epoch 5:
Train RMSE = 1.110785   Test RMSE = 1.112342
Epoch 6:
Train RMSE = 1.101756   Test RMSE = 1.103027
Epoch 7:
Train RMSE = 1.094563   Test RMSE = 1.095685
Epoch 8:
Train RMSE = 1.088433   Test RMSE = 1.089449
Epoch 9:
Train RMSE = 1.083418   Test RMSE = 1.084396
Epoch 10:
Train RMSE = 1.080041   Test RMSE = 1.080963
Epoch 11:
Train RMSE = 1.077119   Test RMSE = 1.077989
Epoch 12:
Train RMSE = 1.074866   Test RMSE = 1.075787
Epoch 13:
Train RMSE = 1.073316   Test RMSE = 1.074275
Epoch 14:
Train RMSE = 1.071914   Test RMSE = 1.072875
Epoch 15:
Train RMSE = 1.070309   Test RMSE = 1.071339
Epoch 16:
Train RMSE = 1.069932   Test RMSE = 1.070953
Epoch 17:
Train RMSE = 1.069361   Test RMSE = 1.070446
Epoch 18:
Train RMSE = 1.069117   Test RMSE = 1.070123
Epoch 19:
Train RMS