### Recommender System using Collaborative Filtering Model using Neural Networks

In [215]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder #Label Encoding for User and Movie Ids

from keras.models import Model
from keras.layers import Input, Embedding, Flatten, concatenate, Dense
from keras.optimizers import Adam # Adam optimizer

In [216]:
df_movies = pd.read_csv('movies.csv').head(500000)
df_ratings = pd.read_csv('ratings.csv').head(500000)

In [217]:
#Merging movies and their ratings given
dfMerged = pd.merge(df_ratings, df_movies, on='movieId')

In [218]:
dfMerged

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
...,...,...,...,...,...,...
499995,3445,6084,3.0,1468322768,Honkytonk Man (1982),Comedy|Drama
499996,3445,6403,3.5,1569359316,"Swimmer, The (1968)",Drama
499997,3445,7272,3.0,1453068509,Super Fly (Superfly) (1972),Action|Crime|Drama
499998,3445,7992,3.5,1455930501,Cockfighter (1974),Drama


In [219]:
# Label encoding of IDs
userEncoder = LabelEncoder()
movieEncoder = LabelEncoder()

In [220]:
dfMerged.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller


In [221]:
dfMerged['userEncoded'] = userEncoder.fit_transform(dfMerged['userId'])
dfMerged['movieEncoded'] = movieEncoder.fit_transform(dfMerged['movieId'])

In [222]:
dfMerged

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,userEncoded,movieEncoded
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0,282
1,3,296,5.0,1439474476,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,2,282
2,4,296,4.0,1573938898,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,3,282
3,5,296,4.0,830786155,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4,282
4,7,296,4.0,835444730,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,6,282
...,...,...,...,...,...,...,...,...
499995,3445,6084,3.0,1468322768,Honkytonk Man (1982),Comedy|Drama,3444,5402
499996,3445,6403,3.5,1569359316,"Swimmer, The (1968)",Drama,3444,5654
499997,3445,7272,3.0,1453068509,Super Fly (Superfly) (1972),Action|Crime|Drama,3444,6371
499998,3445,7992,3.5,1455930501,Cockfighter (1974),Drama,3444,6719


In [223]:
# Splitting for training and testing, random state = 10, for portable results
train_data, test_data = train_test_split(dfMerged, test_size=0.3) 

In [224]:
# Defining the Neural Network
def recommendationModel(Vu, Vm, embeddSize=50):
    # User
    userInput = Input(shape=(1,), name="userInput") # Taking the user Input
    userEmbed = Embedding(input_dim=Vu, output_dim=embeddSize, input_length=1)(userInput) # User data embedding based on embed size - Hidden layer
    userEmbed = Flatten()(userEmbed) # Flatten the userEmbed
    
    # Movie (similar)
    movieInput = Input(shape=(1,), name="movieInput") # Taking the movie Input
    movieEmbed = Embedding(input_dim=Vm, output_dim=embeddSize, input_length=1)(movieInput) # movie data embedding based on embed size - Hidden layer
    movieEmbed = Flatten()(movieEmbed) # Flatten the movieEmbed
    
    # Concatenate the embeddings
    concat = concatenate([userEmbed, movieEmbed])
    
    # Fully connected Dense layers
    D1 = Dense(128, activation='relu')(concat) # This signifies previous layer
    D2 = Dense(64, activation='relu')(D1)
    D3 = Dense(32, activation='relu')(D2)
    D4 = Dense(16, activation='relu')(D3)
    
    # Output layer for 0-5 rating
    output = Dense(1, activation='linear')(D4)
    
    # Model definition
    model = Model([userInput, movieInput], output)
    model.compile(optimizer=Adam(), loss='mean_squared_error')
    
    return model


In [225]:
# Acquire the unique users and movies
Vu = dfMerged['userEncoded'].nunique()
Vm = dfMerged['movieEncoded'].nunique()

In [226]:
# Create the model
model = recommendationModel(Vu, Vm)
model.summary

<bound method Model.summary of <keras.src.engine.functional.Functional object at 0x000001ACBE61BDC0>>

In [227]:
from keras.callbacks import ModelCheckpoint

# Create a ModelCheckpoint callback
checkpoint_path = "best_model.h5"
checkpoint_callback = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)


model.fit([train_data['userEncoded'], train_data['movieEncoded']], train_data['rating'],
          validation_data=([test_data['userEncoded'], test_data['movieEncoded']], test_data['rating']), epochs=10, batch_size=64, verbose=1,callbacks=checkpoint_callback)



Epoch 1/10
Epoch 1: val_loss improved from inf to 0.76957, saving model to best_model.h5
Epoch 2/10
   5/5469 [..............................] - ETA: 1:15 - loss: 0.6786

  saving_api.save_model(


Epoch 2: val_loss improved from 0.76957 to 0.72984, saving model to best_model.h5
Epoch 3/10
Epoch 3: val_loss did not improve from 0.72984
Epoch 4/10
Epoch 4: val_loss improved from 0.72984 to 0.72380, saving model to best_model.h5
Epoch 5/10
Epoch 5: val_loss did not improve from 0.72380
Epoch 6/10
Epoch 6: val_loss did not improve from 0.72380
Epoch 7/10
Epoch 7: val_loss did not improve from 0.72380
Epoch 8/10
Epoch 8: val_loss did not improve from 0.72380
Epoch 9/10
Epoch 9: val_loss did not improve from 0.72380
Epoch 10/10
Epoch 10: val_loss did not improve from 0.72380


<keras.src.callbacks.History at 0x1acbe443c70>

## Getting a sample prediction

In [236]:
# Example: Get predictions for user with user_id = 1 and movie_id = 100
user_id = 1
movie_id = 307

# Encode user and movie IDs
user_encoded = userEncoder.transform([user_id])[0]
movie_encoded = movieEncoder.transform([movie_id])[0]

# Make predictions using the trained model
prediction = model.predict([np.array([user_encoded]), np.array([movie_encoded])])[0][0]
movieTitle = df_movies.loc[df_movies['movieId'] == movie_id,'title'].values[0]
print(f"Predicted rating for user {user_id} and movie \"{movieTitle}\": {prediction: 0.3f}", end=" ")
for i in range(int(prediction)):
    print("⭐", end=" ")

Predicted rating for user 1 and movie "Three Colors: Blue (Trois couleurs: Bleu) (1993)":  4.672 ⭐ ⭐ ⭐ ⭐ 