In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from sklearn.utils import shuffle
from torch.autograd import Variable

In [2]:
df=pd.read_csv('ratings.csv',names=['userID','movieID','rating','time'])
df.drop('time',axis=1,inplace=True)
users=[k for k,v in df['userID'].value_counts().iteritems() if v>2]
movies=[k for k,v in df['movieID'].value_counts().iteritems() if v>10]
df=df[(df['userID'].isin(users)) & (df['movieID'].isin(movies))]
#df=df.sample(frac=1).reset_index(drop=True)
df=df.pivot(index='userID',columns='movieID',values='rating')
df.reset_index(drop=True,inplace=True)
print ('Dataframe size: {}'.format(df.shape))

Dataframe size: (240447, 14277)


In [3]:
train_df=(df.loc[:220000-1])
print ('Train dataframe size: {}'.format(train_df.shape))
test_df=df.loc[230000:].reset_index(drop=True)
print ('Test dataframe size: {}'.format(test_df.shape))
val_df=df.loc[220000:230000-1].reset_index(drop=True)
print ('Validation dataframe size: {}'.format(val_df.shape))

Train dataframe size: (220000, 14277)
Test dataframe size: (10447, 14277)
Validation dataframe size: (10000, 14277)


In [4]:
class Autorec(nn.Module):
    def __init__(self, hidden_size_1, hidden_size_2, dropout, input_size):
        super(Autorec, self).__init__()
        self.input_size=input_size
        self.hidden_size_1=hidden_size_1
        self.hidden_size_2=hidden_size_2
        
        self.encoder_l1=nn.Linear(self.input_size, self.hidden_size_1)
        self.encoder_l2=nn.Linear(self.hidden_size_1, self.hidden_size_2)
        self.decoder_l1=nn.Linear(self.hidden_size_2, self.hidden_size_1)
        self.decoder_l2=nn.Linear(self.hidden_size_1, self.input_size)
        self.drop = nn.Dropout(dropout)
        #self.sigmoid=nn.LogSigmoid()
        
        
    def forward(self, input_ratings):
        #input_ratings=F.normalize(input_ratings)
        enc_out = F.relu(self.encoder_l2(F.relu(self.encoder_l1(input_ratings))))
        enc_out=self.drop(enc_out)
        dec_out = self.decoder_l2(F.relu(self.decoder_l1(enc_out)))
        return dec_out

In [5]:
def train_minibatch(input_ratings, autorec, optimizer,criterion):
    optimizer.zero_grad()
    input_ratings=input_ratings.type(torch.cuda.FloatTensor)
    mask=input_ratings!=0
    mask=mask.type(torch.cuda.FloatTensor)
    output_ratings=autorec(input_ratings)*mask
    #loss=torch.mean(torch.sum((output_ratings-input_ratings)**2,-1)/torch.sum(mask,-1))
    loss=criterion(input_ratings, output_ratings)
    loss.backward()
    optimizer.step()
    return torch.sqrt(loss)

In [6]:
def validation(input_ratings, autorec):
    with torch.no_grad():
        input_ratings=input_ratings.type(torch.cuda.FloatTensor)
        mask=input_ratings!=0
        mask=mask.type(torch.cuda.FloatTensor)
        output_ratings=autorec(input_ratings)*mask
        loss=0
        for i in range(output_ratings.size(0)):
            indices=torch.nonzero(mask[i])
            l=0
            for idx in indices:
                l+=(input_ratings[i][idx]-output_ratings[i][idx])**2
            loss+=l/indices.size(0)
        #loss=torch.mean(torch.sum((output_ratings-input_ratings)**2,-1)/torch.sum(mask,-1))
        return (torch.sqrt(loss/mask.size(0))).item()

In [7]:
autorec=Autorec(hidden_size_1=1024, hidden_size_2=512, dropout=0.5, input_size=train_df.shape[1])
optimizer=optim.Adam(autorec.parameters())
device=torch.device('cuda')
criterion=nn.MSELoss()
autorec=autorec.to(device)

In [8]:
#val_mask=val_df.copy()
#val_mask[~val_mask.isnull()] = 1  # not nan
#val_mask[val_mask.isnull()] = 0   # nan
val_df[val_df.isnull()] = 0   # nan

In [9]:
input_users_val=torch.from_numpy(val_df.values).to(device).detach()
#mask_val=torch.from_numpy(val_mask.values).to(device).detach()

In [12]:
checkpoint = torch.load('model_autorec.pth')
autorec.load_state_dict(checkpoint['autorec_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
vl=checkpoint['val_loss']
tl=checkpoint['training_loss']
autorec.eval()

Autorec(
  (encoder_l1): Linear(in_features=14277, out_features=1024, bias=True)
  (encoder_l2): Linear(in_features=1024, out_features=512, bias=True)
  (decoder_l1): Linear(in_features=512, out_features=1024, bias=True)
  (decoder_l2): Linear(in_features=1024, out_features=14277, bias=True)
  (drop): Dropout(p=0.5)
)

In [11]:
num_batches=10
#val_benchmark=2
#vl=[]
#tl=[]
counter=0

for batch in range(0,num_batches):
    running_loss=0
    counter=0
    #train_df = shuffle(train_df)
    while counter <train_df.shape[0]:
        tdf=(train_df.loc[counter:counter+99].copy())
        #train_mask=tdf.copy()
        #train_mask[~train_mask.isnull()] = 1  # not nan
        #train_mask[train_mask.isnull()] = 0   # nan
        tdf[tdf.isnull()] = 0
        
        assert tdf.shape[0]==100
        input_users=Variable(torch.from_numpy(tdf.values)).to(device)
        #input_mask=torch.from_numpy(train_mask.values).to(device)
        loss=train_minibatch(input_users, autorec, optimizer,criterion)
        running_loss+=loss.item()
        tl.append(loss.item())
        
        if (counter+100)%44000==0:
            val_rmse=validation(input_users_val, autorec)
            vl.append(val_rmse)
            print ('Batch: {} | Step: {}/{} | Training Loss: {} | Validation RMSE: {}'.format(batch+1,int((counter+100)/44000),5,running_loss,round(val_rmse,4) ))
            running_loss=0
            if(val_rmse<val_benchmark):
                print ('%---Saving the model---%')
                torch.save({
                    'step':counter+1,
                    'autorec_state_dict': autorec.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'batch':batch,
                    'val_loss':vl,
                    'training_loss':tl
                   },'model_autorec.pth')
                val_benchmark=val_rmse
        counter=counter+100

Batch: 1 | Step: 1/5 | Training Loss: 7206.790283203125 | Validation RMSE: 0.9783
Batch: 1 | Step: 2/5 | Training Loss: 7233.668427467346 | Validation RMSE: 0.977
Batch: 1 | Step: 3/5 | Training Loss: 7211.009561538696 | Validation RMSE: 0.9774
Batch: 1 | Step: 4/5 | Training Loss: 7329.833559036255 | Validation RMSE: 0.9752
Batch: 1 | Step: 5/5 | Training Loss: 7434.099720954895 | Validation RMSE: 0.9821
Batch: 2 | Step: 1/5 | Training Loss: 7198.856888771057 | Validation RMSE: 0.9837
Batch: 2 | Step: 2/5 | Training Loss: 7515.976016044617 | Validation RMSE: 0.9769
Batch: 2 | Step: 3/5 | Training Loss: 7147.465096473694 | Validation RMSE: 0.9813
Batch: 2 | Step: 4/5 | Training Loss: 8828.26476097107 | Validation RMSE: 0.9796
Batch: 2 | Step: 5/5 | Training Loss: 7288.644400596619 | Validation RMSE: 0.9862
Batch: 3 | Step: 1/5 | Training Loss: 7147.755010604858 | Validation RMSE: 0.9845
Batch: 3 | Step: 2/5 | Training Loss: 8041.410876274109 | Validation RMSE: 0.9781
Batch: 3 | Step: 3

KeyboardInterrupt: 

In [13]:
#test_mask=test_df.copy()
#test_mask[~test_mask.isnull()] = 1  # not nan
#test_mask[test_mask.isnull()] = 0   # nan
test_df[test_df.isnull()] = 0   # nan

In [14]:
input_users_test=torch.from_numpy(test_df.values).to(device).detach()
#mask_test=torch.from_numpy(test_mask.values).to(device).detach()
#print ('RMSE: {}, Precision@10: {}'.format(validation(input_users_test, mask_test, autorec)))

In [15]:
rmse=validation(input_users_test, autorec)
print ('RMSE: {}'.format(rmse))

RMSE: 0.9428205490112305
