In [20]:
#importing and Preprocessing the data 
#We need to clean and prepare the data for both MF and NCF models


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score
from scipy.sparse.linalg import svds

# Load the datasets
users = pd.read_csv(r'D:\unitec\MachineLearningCourse\movie_Recommedation\ml-1m\users.dat', 
                    sep='::', engine='python', 
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], 
                    encoding='ISO-8859-1')

movies = pd.read_csv(r'D:\unitec\MachineLearningCourse\movie_Recommedation\ml-1m\movies.dat', 
                     sep='::', engine='python', 
                     names=['MovieID', 'Title', 'Genres'], 
                     encoding='ISO-8859-1')

ratings = pd.read_csv(r'D:\unitec\MachineLearningCourse\movie_Recommedation\ml-1m\ratings.dat', 
                      sep='::', engine='python', 
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'], 
                      encoding='ISO-8859-1')

# Check the loaded data
print("Users Data:\n", users.head())
print("Movies Data:\n", movies.head())
print("Ratings Data:\n", ratings.head())

# Create a pivot table for Matrix Factorization (MF) model
ratings_pivot = ratings.pivot_table(index='UserID', columns='MovieID', values='Rating')

# Fill missing values with 0 (could be done with other techniques too, but here we use 0 for simplicity)
ratings_pivot = ratings_pivot.fillna(0)

# Split into train and test sets
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)


Users Data:
    UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
Movies Data:
    MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
Ratings Data:
    UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [23]:
#Matrix Factorization (MF) - SVD
#Singular Value Decomposition (SVD). This is a traditional matrix factorization method for recommendation systems.

# Perform SVD on the ratings matrix 

# Create user and movie ID mappings to indices
user_mapping = {user_id: index for index, user_id in enumerate(ratings_pivot.index)}
movie_mapping = {movie_id: index for index, movie_id in enumerate(ratings_pivot.columns)}

# Updated prediction function using the mappings
def predict_rating(user_id, movie_id):
    user_index = user_mapping.get(user_id)  # Map user_id to index
    movie_index = movie_mapping.get(movie_id)  # Map movie_id to index
    if user_index is not None and movie_index is not None:
        return predicted_ratings[user_index, movie_index]  # Access the predicted rating
    else:
        return 0  # Return a default value if user or movie is not in the matrix

# Evaluate the model
y_true = test_data['Rating']
y_pred = [predict_rating(row['UserID'], row['MovieID']) for index, row in test_data.iterrows()]

# Calculate evaluation metrics
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print("MF Model MSE: ", mse)
print("MF Model R²: ", r2)

# Calculate confusion matrix and accuracy (using a simple threshold for binary classification)
threshold = 3.5
y_pred_binary = [1 if pred >= threshold else 0 for pred in y_pred]
y_true_binary = [1 if rating >= threshold else 0 for rating in y_true]

cm = confusion_matrix(y_true_binary, y_pred_binary)
accuracy = accuracy_score(y_true_binary, y_pred_binary)

print("Confusion Matrix:\n", cm)
print("Accuracy: ", accuracy)


MF Model MSE:  4.98856493164911
MF Model R²:  -2.9787799185725476
Confusion Matrix:
 [[82168  2712]
 [94680 20482]]
Accuracy:  0.5131422401295728


In [25]:
#Before using the UserID and MovieID in the model, we need to map them to zero-based indices.
# Map original UserID and MovieID to zero-indexed values
user_map = {user_id: idx for idx, user_id in enumerate(ratings['UserID'].unique())}
movie_map = {movie_id: idx for idx, movie_id in enumerate(ratings['MovieID'].unique())}

# Re-index the user and movie columns in the ratings data
ratings['user_idx'] = ratings['UserID'].map(user_map)
ratings['movie_idx'] = ratings['MovieID'].map(movie_map)

# Check the changes
print(ratings[['UserID', 'MovieID', 'user_idx', 'movie_idx']].head())


   UserID  MovieID  user_idx  movie_idx
0       1     1193         0          0
1       1      661         0          1
2       1      914         0          2
3       1     3408         0          3
4       1     2355         0          4


In [26]:
# Now, we need to update the training and testing datasets to use the new user_idx and movie_idx.
# Split data into train and test sets (80% train, 20% test)
from sklearn.model_selection import train_test_split

X = ratings[['user_idx', 'movie_idx']].values
y = ratings['Rating'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(800167, 2) (200042, 2) (800167,) (200042,)


In [27]:
# Define the model, including user and movie embeddings
# Number of unique users and movies after re-indexing
n_users = len(user_map)
n_movies = len(movie_map)


user_input = Input(shape=(1,), name='user_input')
movie_input = Input(shape=(1,), name='movie_input')

user_embedding = Embedding(input_dim=n_users, output_dim=n_factors, name='user_embedding')(user_input)
movie_embedding = Embedding(input_dim=n_movies, output_dim=n_factors, name='movie_embedding')(movie_input)

user_flat = Flatten()(user_embedding)
movie_flat = Flatten()(movie_embedding)

concat = Concatenate()([user_flat, movie_flat])

dense = Dense(128, activation='relu')(concat)
dense = Dense(64, activation='relu')(dense)
dense = Dense(32, activation='relu')(dense)
output = Dense(1)(dense)

model = Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])


In [29]:
#Train the model, using the user_idx and movie_idx and the updated ratings.
history = model.fit([X_train[:, 0], X_train[:, 1]], y_train, epochs=10, batch_size=64, validation_data=([X_test[:, 0], X_test[:, 1]], y_test))


Epoch 1/10
[1m12503/12503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 9ms/step - accuracy: 0.0563 - loss: 0.6399 - val_accuracy: 0.0570 - val_loss: 0.7682
Epoch 2/10
[1m12503/12503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 9ms/step - accuracy: 0.0560 - loss: 0.6006 - val_accuracy: 0.0570 - val_loss: 0.7767
Epoch 3/10
[1m12503/12503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 9ms/step - accuracy: 0.0560 - loss: 0.5676 - val_accuracy: 0.0570 - val_loss: 0.7866
Epoch 4/10
[1m12503/12503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 8ms/step - accuracy: 0.0561 - loss: 0.5373 - val_accuracy: 0.0570 - val_loss: 0.7956
Epoch 5/10
[1m12503/12503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 8ms/step - accuracy: 0.0559 - loss: 0.5093 - val_accuracy: 0.0570 - val_loss: 0.8103
Epoch 6/10
[1m12503/12503[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 9ms/step - accuracy: 0.0556 - loss: 0.4834 - val_accuracy: 0.0570 - val_loss:

In [30]:
# Predict ratings for the test set
y_pred = model.predict([X_test[:, 0], X_test[:, 1]])

# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("R²:", r2)

# Convert ratings to binary values (e.g., ratings >= 3.5 are positive)
threshold = 3.5
y_pred_binary = (y_pred >= threshold).astype(int)
y_test_binary = (y_test >= threshold).astype(int)

# Confusion matrix and accuracy
cm = confusion_matrix(y_test_binary, y_pred_binary)
accuracy = accuracy_score(y_test_binary, y_pred_binary)

print("Confusion Matrix:\n", cm)
print("Accuracy:", accuracy)


[1m6252/6252[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step
MSE: 0.8832063651378971
R²: 0.29557228088378906
Confusion Matrix:
 [[55809 29071]
 [26881 88281]]
Accuracy: 0.7202987372651743


In [31]:
print(type(movies))  # Should output <class 'pandas.core.frame.DataFrame'>
print(top_movie_ids)

<class 'pandas.core.frame.DataFrame'>
[661 495 878 994 836]


In [32]:
# Predict ratings for a specific user
user_id = 0
movie_ids = np.array(list(range(n_movies)))

predictions = model.predict([np.full(movie_ids.shape, user_id), movie_ids])

# Recommend top 5 movies
top_movie_ids = predictions.flatten().argsort()[::-1][:5]
recommended_movies = movies.iloc[top_movie_ids]

print("Recommended Movies:\n", recommended_movies[['Title', 'Genres']])


[1m 71/116[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 2ms/step



[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Recommended Movies:
                          Title          Genres
1414  Meet Wally Sparks (1997)          Comedy
1777       Nil By Mouth (1997)           Drama
1567                187 (1997)           Drama
2350        Extremities (1986)  Drama|Thriller
2442  Long Goodbye, The (1973)           Crime


In [33]:
# Let's predict a rating for user with id 0 for movie with id 100
user_id = 0
movie_id = 100

# Predict the rating by calculating the dot product of user and movie latent factors
predicted_rating = np.dot(user_factors[user_id, :], movie_factors[movie_id, :])

print(f"Predicted Rating for User {user_id} and Movie {movie_id}: {predicted_rating}")


Predicted Rating for User 0 and Movie 100: 0.029850271996437007


In [34]:
# Predict ratings for all movies for a specific user
user_id = 0
predicted_ratings = np.dot(user_factors[user_id, :], movie_factors.T)

# Get top 5 movie recommendations
top_movie_ids = predicted_ratings.argsort()[::-1][:5]

# Retrieve the titles of the top recommended movies
recommended_movies = movies.iloc[top_movie_ids]

print("Top 5 Recommended Movies for User 0:\n", recommended_movies[['Title', 'Genres']])


Top 5 Recommended Movies for User 0:
                               Title                       Genres
0                  Toy Story (1995)  Animation|Children's|Comedy
2898           Bad Seed, The (1956)               Drama|Thriller
581   Brady Bunch Movie, The (1995)                       Comedy
513               Rising Sun (1993)         Action|Drama|Mystery
2162                Rounders (1998)                  Crime|Drama


In [25]:
# Predict ratings for all movies for a specific user
user_id = 0
movie_ids = np.array(list(range(n_movies)))  # all movie IDs

# Predict ratings for each movie
predictions = model.predict([np.full(movie_ids.shape, user_id), movie_ids])

# Get top 5 movie recommendations
top_movie_ids = predictions.flatten().argsort()[::-1][:5]

# Retrieve the titles of the top recommended movies
recommended_movies = movies.iloc[top_movie_ids]

print("Top 5 Recommended Movies for User 0:\n", recommended_movies[['Title', 'Genres']])


[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Top 5 Recommended Movies for User 0:
                                               Title  \
346                              Client, The (1994)   
106                                  Catwalk (1995)   
309                  Stuart Saves His Family (1995)   
1108                          On Golden Pond (1981)   
2931  Princess Mononoke, The (Mononoke Hime) (1997)   

                          Genres  
346       Drama|Mystery|Thriller  
106                  Documentary  
309                       Comedy  
1108                       Drama  
2931  Action|Adventure|Animation  


In [26]:
def recommend_movies_svd(user_id, num_recommendations=5):
    # Predict ratings for all movies for the user
    predicted_ratings = np.dot(user_factors[user_id, :], movie_factors.T)
    
    # Get the top movie recommendations
    top_movie_ids = predicted_ratings.argsort()[::-1][:num_recommendations]
    
    # Retrieve the recommended movie titles
    recommended_movies = movies.iloc[top_movie_ids]
    return recommended_movies[['Title', 'Genres']]

# Example: Recommend top 5 movies for User 10
recommended = recommend_movies_svd(user_id=10)
print(recommended)


                                Title                              Genres
593               Pretty Woman (1990)                      Comedy|Romance
2557         Edge of Seventeen (1998)                Comedy|Drama|Romance
2651          Inspector Gadget (1999)  Action|Adventure|Children's|Comedy
1107      Perfect Candidate, A (1996)                         Documentary
579   Dear Diary (Caro Diario) (1994)                        Comedy|Drama


In [29]:
def recommend_movies_ncf(user_id, num_recommendations=5):
    # Predict ratings for all movies for the user
    movie_ids = np.array(list(range(n_movies)))
    predictions = model.predict([np.full(movie_ids.shape, user_id), movie_ids])
    
    # Get the top movie recommendations
    top_movie_ids = predictions.flatten().argsort()[::-1][:num_recommendations]
    
    # Retrieve the recommended movie titles
    recommended_movies = movies.iloc[top_movie_ids]
    return recommended_movies[['Title', 'Genres']]

recommended = recommend_movies_ncf(user_id=4)
print(recommended)


[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
                                                Title       Genres
2158                               Lodger, The (1926)     Thriller
713   Haunted World of Edward D. Wood Jr., The (1995)  Documentary
3021                                   Matewan (1987)        Drama
2809                                Hell Night (1981)       Horror
862      Shadow of Angels (Schatten der Engel) (1976)        Drama
