In [342]:
import tensorflow as tf 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd

In [343]:
import pandas as pd

file_path = 'movie_lens/movies.dat'
column_names = ['MovieID', 'Title', 'Genres']


movies_df = pd.read_csv(file_path, sep='::', names=column_names, encoding='latin1', engine='python')

print(movies_df.head())


   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [344]:
rating_cols = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings_df = pd.read_csv('movie_lens/ratings.dat', sep='::', names=rating_cols,
                         encoding ='latin1', engine='python')
print(ratings_df.head())

   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [345]:
movies_df['List_Index']=movies_df.index
print(movies_df.head())

   MovieID                               Title                        Genres  \
0        1                    Toy Story (1995)   Animation|Children's|Comedy   
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2        3             Grumpier Old Men (1995)                Comedy|Romance   
3        4            Waiting to Exhale (1995)                  Comedy|Drama   
4        5  Father of the Bride Part II (1995)                        Comedy   

   List_Index  
0           0  
1           1  
2           2  
3           3  
4           4  


In [346]:
movies_df.loc[movies_df['List_Index']==4]

Unnamed: 0,MovieID,Title,Genres,List_Index
4,5,Father of the Bride Part II (1995),Comedy,4


In [347]:
movie_ratings = movies_df.merge(ratings_df, on='MovieID')

In [348]:
movie_ratings.head()

Unnamed: 0,MovieID,Title,Genres,List_Index,UserID,Rating,Timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,0,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,0,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,0,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,0,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,0,10,5,978226474


In [349]:
movie_ratings = movie_ratings.drop('Timestamp', axis = 1).drop('Title', axis=1).drop('Genres', axis=1)

In [350]:
movie_ratings.head()

Unnamed: 0,MovieID,List_Index,UserID,Rating
0,1,0,1,5
1,1,0,6,4
2,1,0,8,4
3,1,0,9,5
4,1,0,10,5


In [351]:
user_group = movie_ratings.groupby('UserID')


In [352]:
user_group.head()

Unnamed: 0,MovieID,List_Index,UserID,Rating
0,1,0,1,5
1,1,0,6,4
2,1,0,8,4
3,1,0,9,5
4,1,0,10,5
...,...,...,...,...
874890,3285,3216,1270,1
883271,3357,3288,4896,5
903916,3448,3379,4025,5
928813,3548,3479,986,3


In [367]:
no_of_users = 3999
train_X = []

In [368]:
for userID, currentUser in user_group:
    temp= [0] * len(movies_df)
    for num, movie in currentUser.iterrows():
        temp[movie['List_Index']] = movie['Rating']/5.0
    train_X.append(temp)
    if no_of_users == 0:
        break
    else:
        no_of_users -=1


In [369]:
print(len(train_X))

4000


In [370]:
class RBM:
    def __init__(self, n_visible, n_hidden): 
        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.v0_state = tf.Variable(tf.zeros([n_visible]), tf.float32)
        self.h0_state = tf.Variable(tf.zeros([n_hidden]), tf.float32)
        self.hb = tf.Variable(tf.zeros([n_hidden]), tf.float32)
        self.vb = tf.Variable(tf.zeros([n_visible]), tf.float32)
        self.W = tf.Variable(tf.zeros([n_visible, n_hidden]), tf.float32)
        self.errors = []
        self.weights = []
        

    def hidden_layer(self, v_state):
        h_prob = tf.nn.sigmoid(tf.matmul([v_state], self.W) + self.hb) 
        h_state = tf.nn.relu(tf.sign(h_prob - tf.random.uniform(tf.shape(h_prob)))) 
        return h_state

    def reconstructed_output(self, h_state):
        v_prob = tf.nn.sigmoid(tf.matmul(h_state, tf.transpose(self.W)) + self.vb) 
        v_state = tf.nn.relu(tf.sign(v_prob - tf.random.uniform(tf.shape(v_prob))))
        return v_state[0]

    def predict_reconstruct(self, h_state):
        v_prob = tf.nn.sigmoid(tf.matmul(h_state, tf.transpose(self.W)) + self.vb)
        return v_prob

    def error(self, v0_state, v1_state):
        return tf.reduce_mean(tf.square(v0_state - v1_state))

    def train(self, train_ds, learning_rate=0.01, epochs=5, batchsize=32, K=1):
        batch_number = 0
         # Set K to 1 for now
        
        for epoch in range(epochs):
            for batch_x in train_ds: 
                batch_number += 1
                for i_sample in range(batchsize):
                    for k in range(K):
                            v0_state = batch_x[i_sample]
                            h0_state = self.hidden_layer(v0_state)
                            v1_state = self.reconstructed_output(h0_state)
                            h1_state = self.hidden_layer(v1_state)

                            delta_W = tf.matmul(tf.transpose([v0_state]), h0_state) - tf.matmul(tf.transpose([v1_state]), h1_state)
                            self.W = self.W + learning_rate * delta_W
                        
                            delta_vb = tf.reduce_mean(v0_state - v1_state, axis=0)
                            delta_hb = tf.reduce_mean(h0_state - h1_state, axis=0) 
                            self.vb = self.vb + learning_rate * delta_vb
                            self.hb = self.hb + learning_rate * delta_hb
                            
                            v0_state = v1_state

                    if i_sample == batchsize - 1:
                        err = self.error(batch_x[i_sample], v1_state)
                        self.errors.append(err)
                        self.weights.append(self.W)
                        print('Epoch: %d' % epoch, "batch #: %i " % batch_number, 
                              "of %i" % int(60e3 / batchsize), "sample #: %i" % i_sample,
                              'reconstruction error: %f' % err)
    def save_model(self, filepath):
        variables = { 'W':self.W.numpy(), 
                     'vb':self.vb.numpy(), 
                     'hb':self.hb.numpy()}
        np.savez(filepath, **variables)

    def plot_error(self):
        plt.plot(self.errors)
        plt.xlabel("Batch Number")
        plt.ylabel("Error")
        plt.title("Training Error Curve")
        plt.show()
    
    def plot_weight_matrix(self):
        plt.imshow(self.W.numpy(), cmap='viridis', aspect='auto')
        plt.colorbar()
        plt.title("Weight Matrix Visualization")
        plt.xlabel("Hidden Units")
        plt.ylabel("Visible Units")
        plt.show()
        
    def predict(self, input_data):
        reconstructed_data = []
        for sample in input_data:
            v0_state = sample
            h0_state = self.hidden_layer(v0_state)
            v1_state = self.predict_reconstruct(h0_state)
            v0_state = v1_state
            reconstructed_data.append(v1_state.numpy())
        return np.array(reconstructed_data)
    
    @classmethod
    def load_model(cls, filepath):
        data = np.load(filepath)
        model = cls(data['W'].shape[0], data['W'].shape[1])
        model.W.assign(data['W'])
        model.vb.assign(data['vb'])
        model.hb.assign(data['hb'])
        return model 
 


In [371]:
train_X = tf.Variable(train_X, dtype=tf.float32)

2024-02-29 19:34:48.850409: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 62128000 exceeds 10% of free system memory.


In [372]:
train_X.shape


TensorShape([4000, 3883])

In [373]:
movie_model = RBM(len(movies_df), 50)

In [374]:
print(train_X[:5])

tf.Tensor(
[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(5, 3883), dtype=float32)


In [375]:
batchsize = 100
train_ds = \
    tf.data.Dataset.from_tensor_slices(train_X).batch(batchsize).prefetch(tf.data.experimental.AUTOTUNE)

2024-02-29 19:35:03.063411: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 62128000 exceeds 10% of free system memory.


In [376]:
for batch in train_ds: 
    print("Batch_shape:", batch.shape)


Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)
Batch_shape: (100, 3883)


2024-02-29 19:35:03.543826: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 62128000 exceeds 10% of free system memory.


In [363]:
#for excluding last batch
train_ds = tf.data.Dataset.from_tensor_slices(train_X[:-1]).batch(batchsize).prefetch(tf.data.experimental.AUTOTUNE)


In [364]:
#for last batch size 
last_batch_size = train_X.shape[0] % batchsize
padding = batchsize - last_batch_size
padded_data = np.pad(train_X[-last_batch_size:], ((0, padding), (0, 0)), mode='constant')
train_ds = tf.data.Dataset.from_tensor_slices(np.concatenate([train_X[:-last_batch_size], padded_data])).batch(batchsize).prefetch(tf.data.experimental.AUTOTUNE)


In [390]:
movie_model.train(train_ds, epochs=15, learning_rate = 0.01,K = 3)

Epoch: 0 batch #: 1  of 1875 sample #: 31 reconstruction error: 0.011259
Epoch: 0 batch #: 2  of 1875 sample #: 31 reconstruction error: 0.019253
Epoch: 0 batch #: 3  of 1875 sample #: 31 reconstruction error: 0.019882
Epoch: 0 batch #: 4  of 1875 sample #: 31 reconstruction error: 0.024445
Epoch: 0 batch #: 5  of 1875 sample #: 31 reconstruction error: 0.017337
Epoch: 0 batch #: 6  of 1875 sample #: 31 reconstruction error: 0.009446
Epoch: 0 batch #: 7  of 1875 sample #: 31 reconstruction error: 0.009065
Epoch: 0 batch #: 8  of 1875 sample #: 31 reconstruction error: 0.046201
Epoch: 0 batch #: 9  of 1875 sample #: 31 reconstruction error: 0.036127
Epoch: 0 batch #: 10  of 1875 sample #: 31 reconstruction error: 0.008694
Epoch: 0 batch #: 11  of 1875 sample #: 31 reconstruction error: 0.030636
Epoch: 0 batch #: 12  of 1875 sample #: 31 reconstruction error: 0.042771
Epoch: 0 batch #: 13  of 1875 sample #: 31 reconstruction error: 0.038764
Epoch: 0 batch #: 14  of 1875 sample #: 31 reco

In [378]:
inputUser = train_X.numpy()[50]

2024-02-29 19:36:46.039457: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 62128000 exceeds 10% of free system memory.


In [379]:
inputUser.shape

(3883,)

In [380]:
inputUser = inputUser.reshape(1, -1)
inputUser.shape

(1, 3883)

In [412]:
recommended_output = this_model.predict(inputUser)

In [413]:
print(recommended_output[0])

[[0.14043872 0.00303023 0.00067504 ... 0.0041432  0.00309724 0.05604251]]


In [414]:
recommended_output[0].reshape(-1)

array([0.14043872, 0.00303023, 0.00067504, ..., 0.0041432 , 0.00309724,
       0.05604251], dtype=float32)

In [415]:
scored_movies_df_50 = movies_df
scored_movies_df_50["Recommendation Score"] = recommended_output[0].reshape(-1)

In [416]:
print(scored_movies_df_50)

      MovieID                               Title  \
0           1                    Toy Story (1995)   
1           2                      Jumanji (1995)   
2           3             Grumpier Old Men (1995)   
3           4            Waiting to Exhale (1995)   
4           5  Father of the Bride Part II (1995)   
...       ...                                 ...   
3878     3948             Meet the Parents (2000)   
3879     3949          Requiem for a Dream (2000)   
3880     3950                    Tigerland (2000)   
3881     3951             Two Family House (2000)   
3882     3952               Contender, The (2000)   

                            Genres  List_Index  Recommendation Score  
0      Animation|Children's|Comedy           0              0.140439  
1     Adventure|Children's|Fantasy           1              0.003030  
2                   Comedy|Romance           2              0.000675  
3                     Comedy|Drama           3              0.000461  
4       

In [417]:
print(scored_movies_df_50.sort_values(["Recommendation Score"], ascending=False).head(20))

      MovieID                                  Title  \
3682     3751                     Chicken Run (2000)   
3509     3578                       Gladiator (2000)   
3257     3326       What Planet Are You From? (2000)   
2789     2858                 American Beauty (1999)   
3724     3793                           X-Men (2000)   
2286     2355                   Bug's Life, A (1998)   
2530     2599                        Election (1999)   
3554     3623           Mission: Impossible 2 (2000)   
3045     3114                     Toy Story 2 (1999)   
3751     3821  Nutty Professor II: The Klumps (2000)   
3339     3408                 Erin Brockovich (2000)   
3686     3755              Perfect Storm, The (2000)   
3487     3556            Virgin Suicides, The (1999)   
1582     1625                       Game, The (1997)   
3757     3827                   Space Cowboys (2000)   
3106     3175                    Galaxy Quest (1999)   
3229     3298                     Boiler Room (2

In [402]:
print(np.amax(recommended_output[0]) * 5)

4.229137301445007


In [403]:
print(np.amin(recommended_output[0]))

1.7183354e-06


In [405]:
# movie_model.plot_error()

In [393]:
movie_model.save_model('movie_recommendation_temp.npz')

In [394]:
this_model = RBM.load_model('movie_recommendation_temp.npz')

In [395]:
this_model.predict(inputUser)

array([[[0.17312321, 0.00283724, 0.00073856, ..., 0.00137197,
         0.00222418, 0.01066607]]], dtype=float32)