Unsupervised Learning: Bolztman Machine


In [1]:
#Develope Recommend System using Boltzman Machine
#We will here develope 2 recommend systems using Boltzman machine and autoencoders
#1. If user is going to like the movie (yes or no)
#2. Rating by the user for the movie (on a scale 1 to 5)
#Dataset: MovieLens data set (website: grouplens.org/datasets/movielens/) 

#STEP 1: Data Preprocessing will be same for both Boltzman Machine as well as Autoencoders


In [2]:
# 1. Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn    #To implement Neural Network
import torch.nn.parallel #Parallel Computations
import torch.optim as optim #Optimizer
import torch.utils.data  
from torch.autograd import Variable  #Schocastic gradient descent

In [3]:
# 2. Importing Dataset
#Dataset is not that simple so that some new technique to be used
#Here data is not seperated by comas as movie name itself can contain coma, thus in dataset it is seperated by '::'
#Movie name also might contain special characters and thus encoding is to specified
#Movie names and their features are in movies.dat(Movie ID, Movie Name, Genre)
#User information is in users.dat(user ID, Gender, Age, User's job code and Visit Code)
#Ratings.dat contains (User ID, Movie ID, Movie Rating, Time when rating given(not useful))
movies  = pd.read_csv('ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1') #header means coloumn name(here not available thus None)
users   = pd.read_csv('ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')
ratings = pd.read_csv('ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [4]:
# 3. Preparing Training and Testing Set
# We already have 5 training and test set available in another folder ml-100k
# 1 of the 5 will be used here
# First import it as pandas dataframe and then covert to array as we work on array
# Here the data is seperated using tab and thus mention it in delimiter
training_set = pd.read_csv('ml-100k/u1.base', delimiter = '\t')  #80 %
test_set = pd.read_csv('ml-100k/u1.test', delimiter = '\t')      #20 %

#Convert it into arrays to work with pytorch
training_set = np.array(training_set, dtype = 'int')
test_set = np.array(test_set, dtype = 'int')

In [5]:
# 4. Getting the number of users and movies
# We need to extract this because we need to create matrices with same number of users and same number of movies later
# Such that User --> Movie --> Rating such 3 coloumns will be there
# If user has not rated any of the movie then put 0 as rating
# int is used to avoid getting array
# We dont know maximum ID is in training or test set , thus both are to be checked
nb_users  = int(max(max(training_set[:,0]), max(test_set[:,0])))  #User numbers are nothing but highest number in User ID coloumn
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))
print(nb_users)

943


In [6]:
# 5. Converts the data into an array with users in lines and movies in columns
# That is nothing but we have to create matrix with obervations (users) in rows and features (movies) in coloumns as required for NN
# Create function for this.
# Instead of creating 2D numpy array, we will create list of list so as to apply torch
# That is we will have nb_users number of lists 
# [] initializes list
new_data = []

def convert(data):
    for id_users in range(1, nb_users+1):
        id_movies = data[:, 1][data[:, 0] == id_users]  #This will give movie IDs which user id_users has rated
        id_ratings = data[:, 2][data[:, 0] == id_users] #This will give all the ratings of that user
        ratings = np.zeros(nb_movies)                 #Create list with all movies rated zeros 
        ratings[id_movies - 1] = id_ratings           #Add ratings for the movies given by users
        new_data.append(list(ratings))                #Create list of list using append
    return new_data

#Apply the above defined convert function on training and testing set
training_set = convert(training_set)
test_set = convert(test_set)

In [7]:
# 6. Converting the data into Torch Tensors
# Tensors are multidimensional array with same data type
training_set = torch.FloatTensor(training_set)
test_set = torch.FloatTensor(test_set)

In [8]:
# STEP 2: RESTRICTED BOLTZMAN MACHINE
# Further steps are now specific to restricted boltzman machine
# 7. Converting the ratings into binary ratings (1 liked or 0 not-liked)
# Replace all the zeros by -1
training_set[training_set == 0] = -1

# Assume the ratings 1 and 2 represents user didn't liked the movie 
# Replace all the 1 and 2 ratings with 0
training_set[training_set == 1] = 0
training_set[training_set == 2] = 0

# Replace all retings higher than 2 with 1
training_set[training_set >= 3] = 1

# Apply same for test set
test_set[test_set == 0] = -1
test_set[test_set == 1] = 0
test_set[test_set == 2] = 0
test_set[test_set >= 0] = 1

In [9]:
# 8. Creating the architecture of the Neural Network (Restricted Boltzman Machine)
# Define the classs and 3 function (Initialization, )
class RBM():
    #First Function
    def __init__(self, nv, nh):  # nv-->Number of visible nodes and nh--> Number of hidden nodes & self is by default
        # Randomly Initialize all the parameters required for RBM i.e., weights and bias
        # Weights W are nothing but probabilities of visible node given hidden nodes thus W is matrix of size nh*nv
        # Bias are in both direction hidden <--> visible
        # One bias for each hidden node
        # Torch library doesn't accept a single vector and thus whenever required defined as a matrix with one fake size = 1
        self.W = torch.randn(nh, nv) #The tensor with random values according to normal distribution with mean 0 and var 1
        self.a = torch.randn(1, nh)  #Bias for probabilities of hidden nodes given visible node
        self.b = torch.randn(1, nv)  #Bias for probabilities of visible nodes given hidden node
    
    # Sampling hidden nodes according to probabilities P(h when given v)
    # Gibs sampling is used 
    def sample_h(self, x):        # x corresponds to visible neurons v in the probabilities Ph given v
        # 1. Probability of h given v, i.e., probability that hidden neuron equals one given values of visible neurons
        # Visible neurons are nothing but the inout vector of observations with all the ratings
        # This probability of h given v is nothing but sigmoid activation function applied to W times x plus bias (a)
        wx = torch.mm(x, self.W.t())
        activation  = wx + self.a.expand_as(wx) #expand_as() is applied to ensure bias is applied toe ach line of the batch
        p_h_given_v = torch.sigmoid(activation) #This gives probability of the hidden node activated when visible node given
        # We will return these probabilities as well as the vector speciying which hidden is activateed and which is not
        # We will deicide it using some threshold probability value and assign 0 or 1 accordingly using bernouli function
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    
    # Similar function for visible node activation probability
    # Each visible node is nothing but one movie
    def sample_v(self, y):         # y corresponds to values of hidden nodes
        wy = torch.mm(y, self.W)                # no need of transpose here ; plz check matrix size
        activation  = wy + self.b.expand_as(wy) # a is replaced by b
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    
    # Last function: Contrastive Divergence
    # Here we approximate the RBM log-likelihood gradient
    # The RBM is Energy Based Model and also can be seen as probabilistic model and here we need to maximize the liklihood
    # To calculate maximum likelihood we need to compute the gradient
    # Gradientcomputation here is very heavy and thus we try to approximate it
    def train(self, v0, vk, ph0, phk):
        #v0 = Input vector (ratings of all the movies by one user)
        #vk = Visible nodes obtained after k samplings (that is k iterations and k conrastive divergences)  
        #pho = vectors of probabilities that at 1st iteration the hidden nodes equal to 1 given v0
        #phk = vectors of probabilities that at kth iteration the hidden nodes equal to 1 given vk
        
        # 1. Weights Update
        self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        
        # 2. Bias b Update
        self.b += torch.sum((v0 - vk), 0)
        
        # 3. Bias a Update
        self.a += torch.sum((ph0 - phk), 0)
        
# Now our RBM class is ready and now we can train many models with different parameters        

In [10]:
# 9. Create 1st RBM object

# nv is nothing but number of visible nodes that is here the number of movies
nv = len(training_set[0])

# nh is the tunable parameter that we can use anything for improvement of the model
# Actually number of hidden nodes are the number of feaetures we want to detect
nh = 100

# Batch size is also the tunable parameter which is nothing but after how many obervations the weights to be updated
batch_size = 100

# Create RBM object
rbm = RBM(nv, nh)

# Thus now our RBM model is created, now we want to train it

In [11]:
# 10. Training the RBM
# We have to input other parameters required by RBM class

# Number of epochs is tunable parameter
nb_epoch = 10
for epoch in range(1, nb_epoch + 1):
    # 1. Loss Function:
    train_loss = 0   #initialize to zero
    s = 0.           # Counter for normalize the train loss for dividing it
    
    # Get batches of users for training as we want to update weights after each batch
    for id_user in range(0, nb_users - batch_size, batch_size):  # Batches like 0-99, 100-199,.......,799-843
        vk = training_set[id_user:id_user + batch_size]          # Target
        v0 = training_set[id_user:id_user + batch_size]          # Input
        ph0,_ = rbm.sample_h(v0)                                 # Initial Probabilities givevn the visible node
        for k in range(10):                                      # K times contactive divergence (Gibbs Sampling)
            # In begining, we start with initial ratings for the batch users
            # Then sample first hidden nodes using bernouli's sampling using PH given v0 distribution(sample_h)
            _,hk = rbm.sample_h(vk)                              # hk = sampled hidden nodes obtained at Kth step of contrastive divergence
            _,vk = rbm.sample_v(hk)                                              # vk = sampled visible nodes "                  "
            # Now approximate the gradient using updated vk
            # Before that, remember the fact that we dont want to learn where ther is no rating (user hasnt given rating, i.e., -1 rating in v-0)
            vk[v0<0] = v0[v0<0]                                  # So we have given these nodes value again equal to -1
         
        phk,_ = rbm.sample_h(vk)
        
        # Update the weights sing train function defined above
        rbm.train(v0, vk, ph0, phk)
        
        # Update the train loss(difference between predicted and original raatings)
        train_loss += torch.mean(torch.abs(v0[v0>0] - vk[v0>0]))
        s += 1.                                                   # Counter
    print('epoch: '+str(epoch) + ' loss: '+str(train_loss/s))    
        
            

epoch: 1 loss: tensor(0.3460)
epoch: 2 loss: tensor(0.1634)
epoch: 3 loss: tensor(0.1441)
epoch: 4 loss: tensor(0.1549)
epoch: 5 loss: tensor(0.1516)
epoch: 6 loss: tensor(0.1476)
epoch: 7 loss: tensor(0.1540)
epoch: 8 loss: tensor(0.1494)
epoch: 9 loss: tensor(0.1508)
epoch: 10 loss: tensor(0.1470)


In [12]:
# 11. Testing the RBM using above trained trainined model
test_loss = 0
s = 0.
for id_user in range (nb_users):
    v  = training_set[id_user: id_user + 1]           #This will remain a training set for predicting the test results
    vt = test_set[id_user:id_user+1]     #Actual Target
    if len(vt[vt>0]) > 0:
        _, h = rbm.sample_h(v)
        _, v = rbm.sample_v(h)
    test_loss += torch.mean(torch.abs(vt[vt>0] - v[vt>0]))
    s += 1.
print('test_loss: '+str(test_loss/s))    

test_loss: tensor(0.2016)
