<a href="https://colab.research.google.com/github/Priyo-prog/Movie-Recommendation-using-Boltzmann-Machine/blob/main/Boltzmann_Machine_movie_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie recommendation system using Boltzmann Machine

* Use the data from Movielens to analyze the data on movies https://grouplens.org/datasets/movielens/

* Use Torch Libraries to build a Boltzmann Machine

## Import important libraries

In [1]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

## Import the data

In [2]:
# Mount the drive amd import the data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import the dataset
movies = pd.read_csv('/content/drive/MyDrive/Deep Learning/Boltzmann Machine Datasets/ml-1m/movies.dat', 
                     sep='::', header=None, engine='python', encoding='latin-1')

users = pd.read_csv('/content/drive/MyDrive/Deep Learning/Boltzmann Machine Datasets/ml-1m/users.dat',
                    sep = '::', header=None, engine='python', encoding='latin-1')
ratings = pd.read_csv('/content/drive/MyDrive/Deep Learning/Boltzmann Machine Datasets/ml-1m/ratings.dat',
                      sep = '::', header=None, engine='python', encoding='latin-1')

In [4]:
movies.head(5)

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
users.head(5) , users.shape

(   0  1   2   3      4
 0  1  F   1  10  48067
 1  2  M  56  16  70072
 2  3  M  25  15  55117
 3  4  M  45   7  02460
 4  5  M  25  20  55455, (6040, 5))

In [6]:
ratings.head(5)

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Preparing the training and test set

In [7]:
# Import the training set and convert it to an array
training_set = pd.read_csv('/content/drive/MyDrive/Deep Learning/Boltzmann Machine Datasets/ml-100k/u1.base',
                           delimiter='\t')
training_set = np.array(training_set, dtype='int')

# Import the test set and convert it to array
test_set = pd.read_csv('/content/drive/MyDrive/Deep Learning/Boltzmann Machine Datasets/ml-100k/u1.test',
                       delimiter='\t')
test_set = np.array(test_set, dtype='int')

training_set.shape, test_set.shape, type(training_set), type(test_set)

((79999, 4), (19999, 4), numpy.ndarray, numpy.ndarray)

In [8]:
training_set[:5][:,2], test_set[:5]

(array([3, 4, 3, 3, 4]), array([[        1,        10,         3, 875693118],
        [        1,        12,         5, 878542960],
        [        1,        14,         5, 874965706],
        [        1,        17,         3, 875073198],
        [        1,        20,         4, 887431883]]))

In [9]:
# Get the number of users and number of movies
nb_users = int(max(max(training_set[:,0]), max(test_set[:,0])))
nb_movies = int(max(max(training_set[:,1]), max(test_set[:,1])))
nb_users, nb_movies

(943, 1682)

## Convert the data into arrays with users in lines and movies in columns

In [10]:
# Create a function for this
def convert(data):
  # There will list of 943 because we 943 users and the list will have 1682 elements
  # because we have that number of movies
  new_data = [] # Initializing the list
  for id_users in range(1,nb_users + 1):
    id_movies = data[:,1][data[:,0] == id_users] # Get the movie ids of a particular users
    id_ratings = data[:,2][data[:,0] == id_users] # Get the ratings of the particular users
    # Now we want to get the list of the movies and ratings where the users has not rated the 
    # movie as 0
    ratings = np.zeros(nb_movies)
    ratings[id_movies - 1] = id_ratings
    new_data.append(list(ratings))
  return new_data  

In [11]:
training_set_conv = convert(training_set)
test_set_conv = convert(test_set)

#training_set, test_set

In [12]:
type(training_set_conv), type(test_set_conv)

(list, list)

## Convert the data into Torch tensors

In [13]:
training_set_conv = torch.FloatTensor(training_set_conv)
test_set_conv = torch.FloatTensor(test_set_conv)

In [14]:
# Converting the ratings into binary ratings '1(liked)' or '0(not liked)'
# Replace all the '0' s in the set to '-1'. Because in the list the '0' s are the ratings where the users
# actually didn't rate the movies. 
# Now since the ratings are going to be 0s and 1s, the original '0' s must have a new value now that is '-1'.
training_set_conv[training_set_conv == 0] = -1
training_set_conv[training_set_conv == 1] = 0 # OR operation doesn't work with Pytorch
training_set_conv[training_set_conv == 2] = 0 # This will also have rating 0(not liked movies)
training_set_conv[training_set_conv >= 3] = 1 # Liked Movies

# Test set
test_set_conv[test_set_conv == 0] = -1
test_set_conv[test_set_conv == 1] = 0 # OR operation doesn't work with Pytorch
test_set_conv[test_set_conv == 2] = 0 # This will also have rating 0(not liked movies)
test_set_conv[test_set_conv >= 3] = 1 # Liked Movies

## Architecture of Restricted Boltzmann Machine

In [21]:
# Create a class for RBM architecture
class RBM():
  def __init__(self, nv, nh): # 'nv'-visible node, 'nh' - hidden node
    self.W = torch.randn(nh, nv) # Weights initialized
    self.a = torch.randn(1, nh) # Create a 2-D tensor '1' for batch 'nh' for bias
    self.b = torch.randn(1, nv) # for visible node

  def sample_h(self, x):
    # Compute probabilities of 'hidden' given 'visible'
    wx = torch.mm(x, self.W.t())
    activation = wx + self.a.expand_as(wx)
    p_h_given_v = torch.sigmoid(activation)
    return p_h_given_v, torch.bernoulli(p_h_given_v)  

  def sample_v(self, y):
    # Compute probabilities of 'hidden' given 'visible'
    wy = torch.mm(y, self.W)
    activation = wy + self.b.expand_as(wy)
    p_v_given_h = torch.sigmoid(activation)
    return p_v_given_h, torch.bernoulli(p_v_given_h)      

  # Implement k-step Contrastive Divergence
  def train(self, v0, vk, ph0, phk):
    self.W += (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
    self.b += torch.sum((v0 - vk),0) # Keep the 2 dimension
    self.a += torch.sum((ph0 - phk),0)

## Training the model

In [22]:
# Parameters to be passsed to RBM
nv = len(training_set_conv[0]) # number of features in the visible node
nh = 100 # Random assumed features in the hidden nodes
batch_size = 100

In [23]:
# Create an RBM object
rbm = RBM(nv, nh)

In [24]:
# Training the RBM
nb_epoch = 10

for epoch in range(1, nb_epoch + 1):
  # Loss function to measure the error
  train_loss = 0
  # Normalize the train loss
  s = 0.
  for id_user in range(0, nb_users - batch_size, batch_size):
    vk = training_set_conv[id_user:id_user + batch_size]
    # Target, batch of original ratings we want to compare our losses
    v0 = training_set_conv[id_user:id_user + batch_size]
    ph0, _ = rbm.sample_h(v0) # To get the first element ot the function use the '_'
    for k in range(10):
      _,hk = rbm.sample_h(vk) # At this point vk = v0 later it will be updated
      _,vk = rbm.sample_v(hk)
      # Freeze the visible nodes with -1 ratings
      vk[v0<0] = v0[v0<0]
    phk,_ = rbm.sample_h(vk)
    # Use train function to update the weights and bias
    rbm.train(v0,vk,ph0,phk)
    # Update the train-loss
    train_loss += torch.mean(torch.abs(v0[v0>=0] - vk[v0>=0]))
    s += 1
  print('epoch : '+str(epoch)+' loss: '+str(train_loss/s))  

epoch : 1 loss: tensor(0.3422)
epoch : 2 loss: tensor(0.2547)
epoch : 3 loss: tensor(0.2202)
epoch : 4 loss: tensor(0.2437)
epoch : 5 loss: tensor(0.2491)
epoch : 6 loss: tensor(0.2488)
epoch : 7 loss: tensor(0.2508)
epoch : 8 loss: tensor(0.2480)
epoch : 9 loss: tensor(0.2474)
epoch : 10 loss: tensor(0.2469)


## Testing the RBM

In [None]:
# Loss function to measure the error
test_loss = 0
s = 0.
for id_user in range(0, nb_users):
   v = training_set_conv[id_user:id_user + 1]
   # Target, batch of original ratings we want to compare our losses
   vt = test_set_conv[id_user:id_user + 1]
   ph0, _ = rbm.sample_h(v0) # To get the first element ot the function use the '_'
   for k in range(10):
     _,hk = rbm.sample_h(vk) # At this point vk = v0 later it will be updated
     _,vk = rbm.sample_v(hk)
     # Freeze the visible nodes with -1 ratings
     vk[v0<0] = v0[v0<0]
   phk,_ = rbm.sample_h(vk)
   # Use train function to update the weights and bias
   rbm.train(v0,vk,ph0,phk)
   # Update the train-loss
   train_loss += torch.mean(torch.abs(v0[v0>=0] - vk[v0>=0]))
   s += 1
 print('epoch : '+str(epoch)+' loss: '+str(train_loss/s)) 