In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
mnames = ['movie_id', 'title', 'genre']
movies_df = pd.read_table('ml-1m/movies.dat', names = mnames, sep = "::", engine = 'python', encoding='ISO-8859-1')
# Loading the cleaned datasets
rnames = ['user_id','movie_id','rating','timestamp']
ratings_df = pd.read_table("ml-1m/ratings.dat", header =None, sep='::',names=rnames, engine= 'python')
uname = ['user_id','gender','age','occupation','zip']
users_df = pd.read_table("ml-1m/users.dat", sep='::', header = None, names=uname, engine='python')
ratings_df.drop(columns=['timestamp'], inplace=True, axis=1)  # Remove useless features


In [4]:
from sklearn.model_selection import train_test_split
# Splitting the ratings dataset into the feature set (X) and target labels (y)
X = ratings_df.drop(columns='rating')
y = ratings_df["rating"].values  # The movie ratings are the target variables we want to predict

# Preparing train, validation and test datasets.
# I have chosen a split ratio of 80%, 10%, 10%, because I want a somewhat large training set at the cost of a
# smaller validation and test set. I do not think that a smaller validation (or test) dataset will negatively
# impact the generalization ability of the chosen models, because I am only using rather simple ML models
# with few hyperparamaters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Creating a complete training dataset with X_train and y_train
train_df = X_train.copy()
train_df["rating"] = y_train

test_df = X_test.copy()
test_df["rating"] = y_test

In [7]:
training_set = np.array(train_df)
training_set = training_set.astype("int")
test_set = np.array(train_df)
test_set = test_set.astype("int")

In [8]:
training_set[:,1][training_set[:,0] == 1]

array([1035,  594,  938,  914, 1836, 1029,  595,  260,  919, 1961, 1207,
        745, 2028, 1270, 1962, 3186,  720, 2692, 1197, 2804, 2321, 2762,
        661, 2797,  150, 3105, 2791, 1193, 1907, 2918, 1287, 1022,  783,
       1246, 1545, 2355,  608,  531, 2018, 1566, 1721,  588, 3114])

In [9]:
#take max users id in train and test data
nb_users = int(max(max(training_set[:, 0]), max(test_set[:, 0])))
nb_movies =  int(max(max(training_set[:, 1]), max(test_set[:, 1])))
print(nb_users, nb_movies)

6040 3952


In [11]:
def convert(data):
    new_data = []
    for id_users in range(1, nb_users + 1):
        ##id of movies that is rated by current users
        id_movies = data[:,1][data[:,0] == id_users]
        
        ##rate of movies that is given by current user
        id_ratings = data[:,2][data[:,0] == id_users]
        
        #inialize ratings for all movies
        #set 0 for movies that are not rated by current users
        ratings = np.zeros(nb_movies)
        #movie id starts from 1, 1st movie will be 1st element in rating with index as 0
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data

In [12]:
training_set = torch.FloatTensor(convert(training_set))
test_set = torch.FloatTensor(convert(test_set))

In [21]:
class RBM():

    def __init__(self, num_visible, num_hidden, k, learning_rate=1e-3, momentum_coefficient=0.5, weight_decay=1e-4, use_cuda=True):
        self.num_visible = num_visible
        self.num_hidden = num_hidden
        self.k = k
        self.learning_rate = learning_rate
        self.momentum_coefficient = momentum_coefficient
        self.weight_decay = weight_decay
        self.use_cuda = use_cuda

        self.weights = torch.randn(num_visible, num_hidden) * 0.1
        self.visible_bias = torch.ones(num_visible) * 0.5
        self.hidden_bias = torch.zeros(num_hidden)

        self.weights_momentum = torch.zeros(num_visible, num_hidden)
        self.visible_bias_momentum = torch.zeros(num_visible)
        self.hidden_bias_momentum = torch.zeros(num_hidden)

    def sample_hidden(self, visible_probabilities):
        hidden_activations = torch.matmul(visible_probabilities, self.weights) + self.hidden_bias
        hidden_probabilities = self._sigmoid(hidden_activations)
        return hidden_probabilities

    def sample_visible(self, hidden_probabilities):
        visible_activations = torch.matmul(hidden_probabilities, self.weights.t()) + self.visible_bias
        visible_probabilities = self._sigmoid(visible_activations)
        return visible_probabilities

    def contrastive_divergence(self, input_data):
        # Positive phase
        positive_hidden_probabilities = self.sample_hidden(input_data)
        positive_hidden_activations = (positive_hidden_probabilities >= self._random_probabilities(self.num_hidden)).float()
        positive_associations = torch.matmul(input_data.t(), positive_hidden_activations)

        # Negative phase
        hidden_activations = positive_hidden_activations

        for step in range(self.k):
            visible_probabilities = self.sample_visible(hidden_activations)
            visible_probabilities[input_data == 0] = 0
            hidden_probabilities = self.sample_hidden(visible_probabilities)
            hidden_activations = (hidden_probabilities >= self._random_probabilities(self.num_hidden)).float()

        negative_visible_probabilities = visible_probabilities
        negative_hidden_probabilities = hidden_probabilities

        negative_associations = torch.matmul(negative_visible_probabilities.t(), negative_hidden_probabilities)

        # Update parameters
        self.weights_momentum *= self.momentum_coefficient
        self.weights_momentum += (positive_associations - negative_associations)

        self.visible_bias_momentum *= self.momentum_coefficient
        self.visible_bias_momentum += torch.sum(input_data - negative_visible_probabilities, dim=0)

        self.hidden_bias_momentum *= self.momentum_coefficient
        self.hidden_bias_momentum += torch.sum(positive_hidden_probabilities - negative_hidden_probabilities, dim=0)

        batch_size = input_data.size(0)
        self.weights += self.weights_momentum * self.learning_rate / batch_size
        self.visible_bias += self.visible_bias_momentum * self.learning_rate / batch_size
        self.hidden_bias += self.hidden_bias_momentum * self.learning_rate / batch_size

        self.weights -= self.weights * self.weight_decay  # L2 weight decay

        # Compute reconstruction error
        target = torch.flatten(input_data)[torch.flatten(input_data) > 0]
        predict = torch.flatten(negative_visible_probabilities)[torch.flatten(input_data) > 0]
        error = torch.sum((target - predict) ** 2)
        return error
 
    def _sigmoid(self, x):
        return 1 / (1 + torch.exp(-x))

    def _random_probabilities(self, num):
        random_probabilities = torch.rand(num)
        return random_probabilities
    
    def save(self):
        torch.save(self.weights, "./rbm/weights.pt")
        torch.save(self.visible_bias, "./rbm/visible_bias.pt")
        torch.save(self.hidden_bias, "./rbm/hidden_bias.pt")
    
    def load(self):
        self.weights = torch.load("./rbm/weights.pt", map_location=lambda storage, loc: storage)
        self.visible_bias = torch.load("./rbm/visible_bias.pt", map_location=lambda storage, loc: storage)
        self.hidden_bias = torch.load("./rbm/hidden_bias.pt", map_location=lambda storage, loc: storage)

In [23]:
import math 

print('<Info> Creating the architecture of the Neural Network')
# Creating the architecture of the Neural Network
nv = len(training_set[0])
nh = 125
batch_size = 128
rbm = RBM(nv, nh, 10, 5e-3)
rbm.load()
# Training the RBM
# nb_epoch = 50
# prev_train_loss = 0
# s = float(torch.sum(training_set > 0))
# for epoch in range(1, nb_epoch + 1):
#     train_loss = 0
    
#     for id_user in range(0, nb_users - batch_size, batch_size):
#         batch = training_set[id_user:id_user+batch_size].to(device) / 5.0
#         batch_error = rbm.contrastive_divergence(batch)
#         train_loss += batch_error
#     train_loss = math.sqrt(train_loss / s)
#     print('<Info> epoch: '+str(epoch)+' loss: '+str(train_loss))
#     if abs(train_loss - prev_train_loss) < 1e-6:
#         break
#     prev_train_loss = train_loss



<Info> Creating the architecture of the Neural Network


In [25]:
print('<Info> Testing the RBM')
test_loss = 0
s = float(torch.sum(test_set > 0))
count = 0
for id_user in range(nb_users):
    # Use the training set to activate neurons  
    v = test_set[id_user:id_user+1] / 5.0
    h = rbm.sample_hidden(v)
    vt = rbm.sample_visible(h)
    v = torch.flatten(v)
    vt = torch.flatten(vt)[v>0]
    v = v[v>0]
    test_loss += torch.sum((vt - v) ** 2)
test_loss = math.sqrt(test_loss / s)        
print('<Info> Test loss: '+ str(test_loss * 5))
print('<Info> Completed.')

<Info> Testing the RBM
<Info> Test loss: 0.9482224046900545
<Info> Completed.


In [57]:
films_df = pd.read_csv("./films.csv")
nb_genres = len(films_df.columns) - 2
films_df = films_df.set_index('movie_id')
films_df.drop(columns=["title"], inplace=True)
new_index = np.array(range(1, 3953))
films_df = films_df.reindex(new_index, fill_value=0)
films_matrix = np.array(films_df)
films_matrix = films_matrix.astype('int')

In [107]:
from tqdm import tqdm
users_rating_matrix = np.zeros((nb_users, nb_genres))
users_prefer_matrix = np.zeros((nb_users, nb_genres))
training_array = training_set.numpy()
print(films_matrix.shape, training_array.shape, movies_df.size)
for i in tqdm(range(nb_users)):
    ratings = training_array[i][training_array[i] > 0].reshape(-1, 1)
    films = films_matrix[training_array[i] > 0]
    films = films * ratings
    films_r = np.zeros(films.shape[1]).astype('float64')
    films_p = np.sum(films > 0, axis=0) / float(films.shape[0])
    for j in range(18):
        if np.sum(films[:,j] > 0) > 0:
            films_r[j] = np.argmax(np.bincount(films[:,j][films[:,j] > 0].astype('int')))
    users_prefer_matrix[i] = films_p
    users_rating_matrix[i] = films_r
    

(3952, 18) (6040, 3952) 11649


100%|██████████| 6040/6040 [00:01<00:00, 3967.57it/s]


In [119]:
print('<Info> Testing the RBM')
test_loss = 0
s = float(torch.sum(test_set > 0))
count = 0
for id_user in range(nb_users):
    # Use the training set to activate neurons  
    v = test_set[id_user].numpy()
    pred = np.sum(films_matrix[v > 0] * users_prefer_matrix[id_user], axis=1)
    v = v[v > 0]
    test_loss += np.sum((pred - v) ** 2)
test_loss = math.sqrt(test_loss / s)        
print('<Info> Test loss: '+ str(test_loss * 5))
print('<Info> Completed.')

<Info> Testing the RBM
<Info> Test loss: 10.567309644929423
<Info> Completed.


In [125]:
def test_combine(alpha, beta):
    print(alpha, beta)
    test_loss = 0
    s = float(torch.sum(test_set > 0))
    for id_user in range(nb_users):
        # Use the training set to activate neurons  
        v = test_set[id_user:id_user+1] / 5.0
        h = rbm.sample_hidden(v)
        vt = rbm.sample_visible(h)

        vt = torch.flatten(vt)[torch.flatten(v)>0].numpy() * 5

        v = v[0].numpy() * 5
        vp = np.sum(films_matrix[v > 0] * users_prefer_matrix[id_user], axis=1)
        v = v[v > 0]

        pred = alpha * vt + beta * vp 

        test_loss += np.sum((pred - v) ** 2)
    test_loss = math.sqrt(test_loss / s)        
    print('Test loss: '+ str(test_loss))

In [127]:
test_combine(0.89443, 0.11716)

0.89443 0.11716
Test loss: 0.9453826824972438
