In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import scipy

import math
from livelossplot import PlotLosses

from sklearn.metrics import roc_curve, auc


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU execution, use: device = torch.device("cpu"))
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Explore Data

In [30]:
# Load the ratings table; pandas takes the column names from the first row by default.
rating_df = pd.read_csv('./book-data/ratings.csv')
rating_df.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [14]:
# Load the book metadata table; pandas takes the column names from the first row by default.
books_df = pd.read_csv('./book-data/books.csv')
books_df.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [20]:
# Size of the ratings table as (rows, columns).
rating_df.shape

(981756, 3)

# Data Processing

In [26]:
class Dataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over (user, item) index pairs and their ratings."""

    def __init__(self, user_item_pairs, ratings):
        """Keep references to the index pairs (samples) and the ratings (labels)."""
        self.samples = user_item_pairs
        self.labels = ratings

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, index):
        """Return ``(X, y)`` for the sample at ``index``.

        ``X`` is the sample cast to integers and flattened, followed by 64 zeros
        that stand in for social embeddings (placeholder to be replaced later).
        ``y`` is the label stored at the same position.
        """
        pair = self.samples[index].astype('long')
        # Placeholder for the user's social embedding: 64 integer zeros.
        social_placeholder = np.zeros(64).astype('long')
        # axis=None flattens both inputs before joining them into one 1-D vector.
        features = np.concatenate((pair, social_placeholder), axis=None)
        return features, self.labels[index]

In [93]:
def pre_process_rating(df, count):
    """Filter ratings down to active users and package them for training.

    Args:
        df: DataFrame with at least ``user_id``, ``book_id`` and ``rating`` columns.
        count: Threshold; only users with strictly more than ``count`` ratings
            are kept.

    Returns:
        dict with the keys
        ``"rating_df"``: the filtered frame (columns user_id, book_id, rating)
            holding the original, unshifted user ids;
        ``"dataset"``: a ``Dataset`` whose samples are ``[user_id - user_bias, book_id]``
            rows and whose labels are the matching ``[rating]`` rows;
        ``"num_users"`` / ``"num_items"``: width of the kept id ranges
            (``max - min + 1``);
        ``"user_bias"``: the smallest kept user id, i.e. the shift applied to the
            dataset's user column.

    Raises:
        ValueError: if no user has more than ``count`` ratings.
    """
    # Per-row count of how many ratings that row's user has in total.
    ratings_per_user = df.groupby('user_id')['user_id'].transform('count')
    keep = (ratings_per_user > count).to_numpy()
    cdf = df.reset_index(drop=True).loc[keep, ['user_id', 'book_id', 'rating']]
    if cdf.empty:
        raise ValueError(
            "no user has more than {} ratings; nothing left to train on".format(count)
        )

    user_bias = cdf['user_id'].min()
    num_users = int(cdf['user_id'].max() - user_bias + 1)
    num_items = int(cdf['book_id'].max() - cdf['book_id'].min() + 1)

    # Work on a copy of the values so the returned frame keeps the raw user ids.
    total_ratings = np.array(cdf.values)
    # Only the user column is shifted to start at 0; book ids are left as-is.
    total_ratings[:, 0:1] -= user_bias
    user_item_pairs = total_ratings[:, 0:2]
    ratings = total_ratings[:, 2:3]
    dataset = Dataset(user_item_pairs, ratings)

    return {
        "rating_df": cdf,
        "dataset": dataset,
        "num_users": num_users,
        "num_items": num_items,
        "user_bias": user_bias,
    }

# Models


## Generalized Matrix Factorization 

A generalization of the traditional matrix factorization (MF) method: the product of the embedding layers gives the rating each user gives to each item.
If only one dimension is used, it is the traditional MF model, but the neural network expands that structure, which allows it to capture features that the traditional MF model cannot.

element-wise product


In [24]:
class GMF(torch.nn.Module):
    """Generalized Matrix Factorization.

    User and item indices are looked up in separate embedding tables, the two
    embeddings are multiplied element-wise, and a single linear layer maps the
    product to one score per (user, item) pair.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        latent_dim: Width of both embedding tables.
    """

    def __init__(self, num_users, num_items, latent_dim=8):
        # Zero-argument super() avoids naming the class (the old call named a
        # class that does not exist, so construction failed with NameError).
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.latent_dim = latent_dim

        self.embedding_user = torch.nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.latent_dim, sparse=True
        )
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim, sparse=True
        )
        self.fc = nn.Linear(in_features=self.latent_dim, out_features=1, bias=True)

    def forward(self, user_indices, item_indices):
        """Score each (user, item) pair.

        Both embeddings and the final score pass through ReLU, so the returned
        values are never negative. The result has a trailing dimension of size 1.
        """
        user_embedding = F.relu(self.embedding_user(user_indices))
        item_embedding = F.relu(self.embedding_item(item_indices))
        return F.relu(self.fc(user_embedding * item_embedding))

## Multi-Layer Perceptron 

concatenation

GMF uses only one layer, whereas the MLP uses multiple layers.

In [None]:
class MLP(torch.nn.Module):
    """Multi-layer perceptron recommender.

    The user and item embeddings are concatenated and passed through a stack
    of Linear + ReLU layers; a final linear layer produces one raw score per
    (user, item) pair (no sigmoid is applied).

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        latent_dim: Width of each embedding table.
        layers: Layer widths of the perceptron. ``layers[0]`` is the input
            width and must equal ``2 * latent_dim`` (the concatenated
            embeddings); each consecutive pair defines one Linear layer.

    Raises:
        ValueError: if ``layers`` is empty or ``layers[0] != 2 * latent_dim``.
    """

    def __init__(self, num_users, num_items, latent_dim=8, layers=(16, 32, 16, 8)):
        super().__init__()
        layers = list(layers)
        # Fail at construction time rather than with a shape error in forward().
        if not layers or layers[0] != 2 * latent_dim:
            raise ValueError(
                "layers[0] must equal 2 * latent_dim ({}), got layers={}".format(
                    2 * latent_dim, layers
                )
            )
        self.num_users = num_users
        self.num_items = num_items
        self.latent_dim = latent_dim

        self.embedding_user = torch.nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.latent_dim, sparse=True
        )
        self.embedding_item = torch.nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim, sparse=True
        )

        # One Linear layer per consecutive (in, out) pair of widths.
        self.fc_layers = torch.nn.ModuleList(
            torch.nn.Linear(in_size, out_size)
            for in_size, out_size in zip(layers[:-1], layers[1:])
        )
        self.affine_output = torch.nn.Linear(in_features=layers[-1], out_features=1)

    def forward(self, user_indices, item_indices):
        """Return the raw score for each (user, item) pair (trailing dim of size 1)."""
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)

        # Concatenated latent vector fed to the perceptron.
        vector = torch.cat([user_embedding, item_embedding], dim=-1)
        for fc_layer in self.fc_layers:
            vector = F.relu(fc_layer(vector))
        return self.affine_output(vector)

## Neural Matrix Factorization 

In [85]:
class NeuMF(torch.nn.Module):
    """Neural Matrix Factorization: an MLP branch fused with an MF branch.

    Each branch has its own user and item embedding tables. The MLP branch
    concatenates its embeddings and runs them through Linear + ReLU layers;
    the MF branch multiplies its embeddings element-wise. The two results are
    concatenated and mapped to one raw score by a final linear layer
    (no sigmoid is applied).

    Args:
        num_users: Number of rows in each user embedding table.
        num_items: Number of rows in each item embedding table.
        latent_dim_mf: Embedding width of the MF branch.
        latent_dim_mlp: Embedding width of the MLP branch.
        layers: Layer widths of the MLP branch. ``layers[0]`` must equal
            ``2 * latent_dim_mlp`` (the concatenated MLP embeddings).

    Raises:
        ValueError: if ``layers`` is empty or ``layers[0] != 2 * latent_dim_mlp``.
    """

    def __init__(self, num_users, num_items, latent_dim_mf=8, latent_dim_mlp=8,
                 layers=(16, 32, 16, 8)):
        super().__init__()
        layers = list(layers)
        # Fail at construction time rather than with a shape error in forward().
        if not layers or layers[0] != 2 * latent_dim_mlp:
            raise ValueError(
                "layers[0] must equal 2 * latent_dim_mlp ({}), got layers={}".format(
                    2 * latent_dim_mlp, layers
                )
            )
        self.num_users = num_users
        self.num_items = num_items
        self.latent_dim_mf = latent_dim_mf
        self.latent_dim_mlp = latent_dim_mlp

        self.embedding_user_mlp = torch.nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.latent_dim_mlp, sparse=True
        )
        self.embedding_item_mlp = torch.nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim_mlp, sparse=True
        )
        self.embedding_user_mf = torch.nn.Embedding(
            num_embeddings=self.num_users, embedding_dim=self.latent_dim_mf, sparse=True
        )
        self.embedding_item_mf = torch.nn.Embedding(
            num_embeddings=self.num_items, embedding_dim=self.latent_dim_mf, sparse=True
        )

        # One Linear layer per consecutive (in, out) pair of widths.
        self.fc_layers = torch.nn.ModuleList(
            torch.nn.Linear(in_size, out_size)
            for in_size, out_size in zip(layers[:-1], layers[1:])
        )
        # The output layer sees the MLP result followed by the MF product.
        self.affine_output = torch.nn.Linear(
            in_features=layers[-1] + latent_dim_mf, out_features=1
        )

    def forward(self, user_indices, item_indices):
        """Return the raw score for each (user, item) pair (trailing dim of size 1)."""
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)
        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)

        # MLP branch: concatenated latent vector through the Linear + ReLU stack.
        mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1)
        for fc_layer in self.fc_layers:
            mlp_vector = F.relu(fc_layer(mlp_vector))

        # MF branch: element-wise product of the two embeddings.
        mf_vector = torch.mul(user_embedding_mf, item_embedding_mf)

        return self.affine_output(torch.cat([mlp_vector, mf_vector], dim=-1))

In [134]:
def predict_eval(model, generator):
    """Run ``model`` over every batch of ``generator`` and gather the results.

    The model is switched to eval mode and called without gradient tracking.
    Tensors are moved to the module-level ``device``.

    Args:
        model: Callable module taking ``(user_indices, item_indices)``.
        generator: Iterable of ``(batch, labels)`` pairs. Column 0 of ``batch``
            is used as the user index and column 1 as the item index;
            ``labels`` must be a tensor.

    Returns:
        Tuple ``(predictions, labels, pairs)`` concatenated over all batches:
        the model outputs, the labels cast to float, and the first two columns
        of every batch as long integers. All three are empty tensors when the
        generator yields nothing.
    """
    model.eval()
    preds_chunks, label_chunks, pair_chunks = [], [], []

    with torch.no_grad():
        for local_batch, local_labels in generator:
            # as_tensor avoids the copy (and warning) of torch.tensor(tensor).
            local_batch = torch.as_tensor(local_batch).to(device=device, dtype=torch.long)
            local_labels = local_labels.to(device=device, dtype=torch.float)

            preds_chunks.append(model(local_batch[:, 0], local_batch[:, 1]))
            label_chunks.append(local_labels)
            pair_chunks.append(local_batch[:, 0:2])

    if not preds_chunks:
        return (
            torch.empty(0, device=device),
            torch.empty(0, device=device),
            torch.empty(0, dtype=torch.long, device=device),
        )

    # Concatenate once at the end instead of re-copying the running result per batch.
    return torch.cat(preds_chunks), torch.cat(label_chunks), torch.cat(pair_chunks)


In [132]:
def predict(user_id):
    """Score every item index ``0 .. num_items - 1`` for a single user.

    Relies on the module-level ``num_items``, ``device`` and ``NeuMF_model``.

    Args:
        user_id: User index as the model expects it (already shifted by the
            caller if needed). Truncated to an integer.

    Returns:
        The model output for the pairs ``(user_id, 0) .. (user_id, num_items - 1)``,
        computed without gradient tracking and left on ``device``.
    """
    # Build the index tensors directly on the target device instead of going
    # through a Python list of float/int NumPy arrays.
    user_indices = torch.full((num_items,), int(user_id), dtype=torch.long, device=device)
    item_indices = torch.arange(num_items, dtype=torch.long, device=device)
    with torch.no_grad():
        y_preds = NeuMF_model(user_indices, item_indices)
    return y_preds

# TEST Train

In [66]:
def epoch_run(model, generator, opt, criterion, liveloss, mode="train"):
    """Run one pass over ``generator`` and return the mean per-sample loss.

    Tensors are moved to the module-level ``device``.

    Args:
        model: Module called as ``model(user_indices, item_indices)``.
        generator: Iterable of ``(batch, labels)`` pairs. Column 0 of ``batch``
            is the user index, column 1 the item index; ``labels`` must be a
            tensor.
        opt: Optimizer; stepped once per batch in train mode, unused otherwise.
        criterion: Loss function called as ``criterion(predictions, labels)``.
        liveloss: Object with ``update(dict)`` and ``draw()``; updated with the
            batch loss under the key ``'mse'`` and redrawn after every training
            batch. Not touched outside train mode.
        mode: ``"train"`` to optimize; any other value evaluates only.

    Returns:
        Sum of ``batch_loss * batch_size`` divided by the number of samples
        seen, or NaN when the generator yields no samples.
    """
    is_train = mode == "train"
    model.train(is_train)  # train(False) is the same as eval()

    running_loss = 0.0
    num_samples = 0
    # Skip building the autograd graph when only evaluating.
    with torch.set_grad_enabled(is_train):
        for local_batch, local_labels in generator:
            # as_tensor avoids the copy (and warning) of torch.tensor(tensor).
            local_batch = torch.as_tensor(local_batch).to(device=device, dtype=torch.long)
            local_labels = local_labels.to(device=device, dtype=torch.float)

            y_preds = model(local_batch[:, 0], local_batch[:, 1])
            loss = criterion(y_preds, local_labels)

            batch_loss = loss.item()
            batch_size = local_labels.size(0)
            running_loss += batch_loss * batch_size
            num_samples += batch_size

            if is_train:
                opt.zero_grad()
                loss.backward()
                opt.step()
                liveloss.update({
                    'mse': batch_loss
                })
                liveloss.draw()

    return running_loss / num_samples if num_samples else float("nan")

In [94]:
# Preprocess: keep users with more than 50 ratings, then unpack the pieces.
pre_processed = pre_process_rating(rating_df, 50)
train_df, train_dataset = pre_processed["rating_df"], pre_processed["dataset"]
num_users, num_items = pre_processed["num_users"], pre_processed["num_items"]
user_bias = pre_processed["user_bias"]

In [75]:
# Shuffled mini-batches of 1024 samples, loaded in the main process (no workers).
sample_params = dict(batch_size=1024, shuffle=True, num_workers=0)
train_generator = torch.utils.data.DataLoader(train_dataset, **sample_params)

In [89]:
# Define the model (each embedding table gets one row more than the id range),
# the mean-squared-error objective and a plain SGD optimizer.
NeuMF_model = NeuMF(num_users + 1, num_items + 1).to(device)
NeuMF_criterion = torch.nn.MSELoss()
NeuMF_opt = optim.SGD(NeuMF_model.parameters(), lr=0.001)

# To resume from a saved checkpoint instead of starting from scratch:
# NeuMF_model.load_state_dict(torch.load("./md_checkpoint/neumf_1.pkg"))

In [90]:
# Live loss plotters: liveloss1 is passed to the training loop; liveloss2 is
# created alongside it but is not used by the cells shown in this notebook.
liveloss1 = PlotLosses()
liveloss2 = PlotLosses()

In [158]:
# Write the model's current state_dict to disk.
# NOTE(review): assumes the ./md_checkpoint directory already exists — confirm.
torch.save(NeuMF_model.state_dict(), "./md_checkpoint/neumf_1.pkg")

In [159]:
# Train for 10 epochs and write a checkpoint on every 5th epoch index (i = 0, 5).
for i in range(10):
    epoch_run(NeuMF_model, train_generator, NeuMF_opt, NeuMF_criterion, liveloss1, "train")
    # `i % 5 == 0` selects the multiples of 5; a bare `i % 5` selects everything else.
    if i % 5 == 0:
        # The epoch index must be formatted into the name; str + int raises TypeError.
        torch.save(NeuMF_model.state_dict(), "./md_checkpoint/neumf_{}.pkg".format(i))

# Eval and Test

In [153]:
# Peek at the first five raw ratings before comparing them with predictions.
rating_df.head(5)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [154]:
# Compare one user's actual ratings with the model's predictions.
USERID = 439
# predict() expects the shifted user index, so remove the same bias used for the dataset.
predict_y = predict(USERID - user_bias)
# Copy to host memory first: Tensor.numpy() raises for tensors on a CUDA device.
predict_df = pd.DataFrame(predict_y.cpu().numpy()).reset_index()
# Row position ("index") is the item index scored by predict(); match it to book_id.
rating_df[rating_df['user_id'] == USERID].merge(predict_df, left_on="book_id", right_on="index")

Unnamed: 0,book_id,user_id,rating,index,0
0,1,439,3,1,3.570163
1,4,439,5,4,3.711751
2,7,439,3,7,3.536054
3,9,439,3,9,3.527329
4,11,439,4,11,3.575893
5,17,439,5,17,3.856813
6,18,439,4,18,4.007541
7,20,439,5,20,3.984340
8,22,439,3,22,3.791241
9,23,439,2,23,3.657006
