# Boltzmann Machine

## Downloading the dataset

### ML-100K

In [1]:
# !wget "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
# !unzip ml-100k.zip
# !ls

### ML-1M

In [2]:
# !wget "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
# !unzip ml-1m.zip
# !ls

## Importing the libraries

In [15]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## Importing the dataset


* sep='::' specifies the separator between the values in the file. Here it is a double colon '::', because the dataset is not a standard CSV (Comma-Separated Values) file.

* header=None indicates that there is no header row in the dataset, and the first row should be treated as data, not column names.

* engine='python' specifies the parsing engine to use. In this case, it's set to 'python', which is used when non-standard separators like '::' are used.

* encoding='latin-1' specifies the character encoding used to decode the file.
'latin-1' is a commonly used encoding for text data. We use it here because the data contains special characters or symbols, and reading the file as 'latin-1' ensures that those characters are decoded correctly.



In [16]:
# All three ML-1M files are read with the same options: '::' as separator
# (which requires the python parsing engine), no header row, latin-1 decoding.
ml_1m_read_options = dict(sep='::', header=None, engine='python', encoding='latin-1')

movies = pd.read_csv('ml-1m/movies.dat', **ml_1m_read_options)
users = pd.read_csv('ml-1m/users.dat', **ml_1m_read_options)
ratings = pd.read_csv('ml-1m/ratings.dat', **ml_1m_read_options)


## Preparing the training set and the test set


In [17]:
# u1.base is tab-separated and has no header row, so header=None is required:
# without it pandas uses the first rating as column names and silently drops it.
# Column 0 = user id, column 1 = movie id, column 2 = rating; column 3 is not needed.
training_set = pd.read_csv('ml-100k/u1.base', sep='\t', header=None)
training_set = np.array(training_set, dtype='int')

In [18]:
# The test set holds the same users as the training set, but the movies
# associated with each user are different from those in the training set.
# u1.test has no header row either, so header=None keeps pandas from
# consuming (and dropping) the first rating as column names.
test_set = pd.read_csv('ml-100k/u1.test', sep='\t', header=None)
test_set = np.array(test_set, dtype='int')

## Getting the number of users and movies


In [19]:
# The training and test sets are later turned into matrices with one row per
# user and one column per movie, where a cell holds the rating (0 when the user
# did not rate the movie). Both matrices must cover every user and every movie.
#
# The split is random, so the highest user id or movie id may appear in either
# set. Taking the maximum over both sets gives the full counts, and keeps this
# step valid for any of the u1..u5 cross-validation splits.
user_ids_train, user_ids_test = training_set[:, 0], test_set[:, 0]    # column 0: user ids
movie_ids_train, movie_ids_test = training_set[:, 1], test_set[:, 1]  # column 1: movie ids

nb_users = int(max(user_ids_train.max(), user_ids_test.max()))
nb_movies = int(max(movie_ids_train.max(), movie_ids_test.max()))

In [20]:
# Report the two matrix dimensions computed above
for label, count in (('max number of users:', nb_users),
                     ('max numebr of movies:', nb_movies)):
  print(label, count)

max number of users: 943
max numebr of movies: 1682


## Converting the data into an array with users in lines and movies in columns

* [data[:, 0] == id_user] creates a boolean mask by comparing the values in the first column (index 0) of the data array to a specific value id_user. This mask will have True where the condition is met and False where it's not.

In [21]:
# we will create a matrix with one row per user (nb_users rows) and one column per movie (nb_movies columns)

def convert(data, n_users=None, n_movies=None):
  '''Convert a ratings array into a list of per-user rating rows.

  data: 2D integer array whose column 0 holds user ids, column 1 movie ids
        and column 2 ratings (ids start at 1).
  n_users: number of rows to build (user ids 1..n_users). Defaults to the
           notebook-level nb_users.
  n_movies: length of every row (movie ids 1..n_movies). Defaults to the
            notebook-level nb_movies.

  Returns a list of n_users lists, each of length n_movies, holding the
  user's rating for every movie and 0 for movies the user did not rate.
  '''
  # Fall back to the notebook globals so existing calls convert(data) keep working
  if n_users is None:
    n_users = nb_users
  if n_movies is None:
    n_movies = nb_movies

  new_data = []

  # user ids start at 1 and range() excludes its upper bound, hence the +1
  for id_user in range(1, n_users + 1):
    # boolean mask selecting this user's rows; computed once and reused
    user_rows = data[:, 0] == id_user
    id_movies = data[:, 1][user_rows]   # ids of the movies rated by this user
    id_ratings = data[:, 2][user_rows]  # the matching ratings

    ratings = np.zeros(n_movies)  # 0 marks a movie the user did not rate

    # movie ids start at 1 but array positions start at 0, so shift by one
    ratings[id_movies - 1] = id_ratings

    # the result is a list of lists, so the array is converted to a list
    new_data.append(list(ratings))

  return new_data

In [22]:
# Both sets become matrices (lists of rows) with one row per user and one
# column per movie, holding the ratings for all movies for all users.
training_set, test_set = (convert(split) for split in (training_set, test_set))

## Converting the data into Torch tensors


In [23]:
# Build 32-bit float tensors from the nested rating lists
training_set = torch.tensor(training_set, dtype=torch.float32)
test_set = torch.tensor(test_set, dtype=torch.float32)

## Converting the ratings into binary ratings 1 (Liked) or 0 (Not Liked)


In [24]:
def _binarize_ratings_(ratings_matrix):
  '''Recode a ratings tensor in place to -1 (not rated), 0 (not liked), 1 (liked).

  The RBM predicts liked (1) / not liked (0), so movies without a rating need
  a value of their own (-1). Ratings of 1 or 2 are taken as "not liked",
  ratings of 3 or more as "liked".
  '''
  # All masks are taken from the original values before anything is overwritten
  not_rated = ratings_matrix == 0
  not_liked = (ratings_matrix == 1) | (ratings_matrix == 2)
  liked = ratings_matrix >= 3

  ratings_matrix[not_rated] = -1
  ratings_matrix[not_liked] = 0
  ratings_matrix[liked] = 1


# the training set and the test set get exactly the same recoding
for ratings_matrix in (training_set, test_set):
  _binarize_ratings_(ratings_matrix)

## Creating the architecture of the Neural Network


In [25]:
class RBM():
  '''Restricted Boltzmann Machine with sigmoid activations and Bernoulli-sampled units.

  Attributes:
    W: weight tensor of shape (nh, nv)
    a: bias of the hidden nodes, shape (1, nh)
    b: bias of the visible nodes, shape (1, nv)
  '''

  def __init__(self, nv, nh):
    '''Randomly initialise weights and biases for nv visible and nh hidden nodes.'''
    # Drawn in the order W, a, b from a standard normal distribution.
    # The biases carry a leading dimension of size 1 so they can be expanded
    # over the rows of a mini-batch.
    self.W = torch.randn(nh, nv)
    self.a = torch.randn(1, nh)
    self.b = torch.randn(1, nv)

  @staticmethod
  def _probabilities_and_sample(pre_activation):
    '''Return (sigmoid(pre_activation), one Bernoulli draw from those probabilities).'''
    probabilities = torch.sigmoid(pre_activation)
    return probabilities, torch.bernoulli(probabilities)

  def sample_h(self, x):
    '''Compute the hidden activation probabilities given visible values x and sample them.

    x: visible values, shape (n, nv)
    Returns (P_h_given_v, sampled hidden states), both of shape (n, nh).
    '''
    # W is stored as (nh, nv), so it is transposed to multiply with x of shape (n, nv)
    hidden_input = torch.mm(x, self.W.t())
    # the hidden bias is expanded so it is added to every row of the mini-batch
    hidden_input = hidden_input + self.a.expand_as(hidden_input)
    return self._probabilities_and_sample(hidden_input)

  def sample_v(self, y):
    '''Compute the visible activation probabilities given hidden values y and sample them.

    y: hidden values, shape (n, nh)
    Returns (P_v_given_h, sampled visible states), both of shape (n, nv).
    '''
    # no transpose needed here: y is (n, nh) and W is (nh, nv)
    visible_input = torch.mm(y, self.W)
    visible_input = visible_input + self.b.expand_as(visible_input)
    return self._probabilities_and_sample(visible_input)

  def train(self, v0, vk, ph0, phk):
    '''Update W, b and a in place from one Contrastive Divergence step.

    v0: input ratings of the mini-batch
    vk: visible nodes obtained after k sampling steps
    ph0: hidden probabilities given v0
    phk: hidden probabilities given vk
    '''
    data_term = torch.mm(v0.t(), ph0)
    sample_term = torch.mm(vk.t(), phk)
    # both products are (nv, nh); transpose the difference to match W's (nh, nv)
    self.W += (data_term - sample_term).t()
    # summing over dimension 0 collapses the batch dimension
    self.b += torch.sum(v0 - vk, 0)
    self.a += torch.sum(ph0 - phk, 0)

In [26]:
# Layer sizes and mini-batch size
nv = training_set.shape[1]  # number of visible nodes: the length of one row of the training set
nh = 100                    # number of hidden nodes
batch_size = 100

# the RBM instance that is trained and tested below
rbm = RBM(nv, nh)

## Training the RBM


In [27]:
nb_epoch = 10
cd_steps = 10 # number of Gibbs sampling steps per update (the k of Contrastive Divergence)

for epoch in range(1, nb_epoch + 1):

  train_loss = 0 # the loss is the mean absolute difference between original and sampled ratings
  s = 0.0 # counts the batches, used to normalize train_loss

  # Iterate over ALL users. The previous bound (nb_users - batch_size) stopped one
  # batch early, so the last users were never used for training. Slicing clips at
  # the end of the tensor, so the final batch may simply hold fewer users.
  for id_user in range(0, nb_users, batch_size):

    v0 = training_set[id_user : id_user + batch_size]
    # original ratings of this batch

    vk = v0
    # starting point of the Gibbs chain; vk is rebound to a freshly sampled tensor
    # inside the loop before anything is written into it, so v0 is never modified

    ph0,_ = rbm.sample_h(v0)
    # probability of the hidden nodes being 1 given the input v0
    # sample_h returns two values and only the probabilities are needed here, hence '_'

    for k in range(cd_steps):

      _,hk = rbm.sample_h(vk)
      # hidden nodes sampled at this step

      _,vk = rbm.sample_v(hk)
      # visible nodes sampled at this step

      vk[v0<0] = v0[v0<0]
      # movies that were not rated must keep their -1 value

    phk,_ = rbm.sample_h(vk)
    # probabilities of the hidden nodes given vk, after the last sampling step

    rbm.train(v0, vk, ph0, phk)
    train_loss += torch.mean(torch.abs(v0[v0>=0] - vk[v0>=0])) # only ratings that exist count towards the loss
    s += 1.0

  # a separating space was missing before 'loss: ' ("epoch: 1loss: ...")
  print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/s))

epoch: 1loss: tensor(0.3515)
epoch: 2loss: tensor(0.2294)
epoch: 3loss: tensor(0.2518)
epoch: 4loss: tensor(0.2488)
epoch: 5loss: tensor(0.2468)
epoch: 6loss: tensor(0.2494)
epoch: 7loss: tensor(0.2488)
epoch: 8loss: tensor(0.2453)
epoch: 9loss: tensor(0.2464)
epoch: 10loss: tensor(0.2488)


## Testing the RBM


In [28]:



test_loss = 0 # accumulated mean absolute difference between test ratings and predictions
s = 0.0 # number of users that contributed, used to normalize test_loss

for id_user in range(nb_users):

  # the user's training ratings are the input that activates the hidden nodes
  v = training_set[id_user : id_user + 1]

  # the user's test ratings are the target the prediction is compared with
  vt = test_set[id_user : id_user + 1]

  # users without any rating in the test set cannot be scored
  rated_in_test = vt >= 0
  if not rated_in_test.any():
    continue

  # a single forward/backward sampling step; no ph0/phk needed since nothing is trained here
  _, h = rbm.sample_h(v)
  _, v = rbm.sample_v(h)

  # compare only on the movies that have a rating in the test set
  test_loss += torch.mean(torch.abs(vt[rated_in_test] - v[rated_in_test]))
  s += 1.0

print('test loss: ' + str(test_loss/s))

test loss: tensor(0.2341)


In [None]:
'''With this metric, we obtained an Average Distance of 0.24, which is equivalent to about 75% of correct prediction.

Hence, it works very well and there is a predictive power.'''