In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import MeanSquaredError
import warnings
warnings.filterwarnings('ignore')

In [68]:
movies_metadata = pd.read_csv('movies_metadata.csv')
keywords = pd.read_csv('keywords.csv')
credits = pd.read_csv('credits.csv')
links_small = pd.read_csv('links_small.csv')
ratings_small = pd.read_csv('ratings_small.csv')

In [69]:
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,10/30/1995,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,0.0,7.7,5415.0
1,0,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,12/15/1995,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,0.0,6.9,2413.0
2,0,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,12/22/1995,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,0.0,6.5,92.0
3,0,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,12/22/1995,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,0.0,6.1,34.0
4,0,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,2/10/1995,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,0.0,5.7,173.0


In [70]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [71]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [72]:
links_small.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [73]:
ratings_small.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [74]:
# Change the datatype of ID columns to numeric
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'], errors='coerce')
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
links_small['tmdbId'] = pd.to_numeric(links_small['tmdbId'], errors='coerce')
ratings_small['movieId'] = pd.to_numeric(ratings_small['movieId'], errors='coerce')

# Drop rows with missing id values
movies_metadata.dropna(subset=['id'], inplace=True)
keywords.dropna(subset=['id'], inplace=True)
credits.dropna(subset=['id'], inplace=True)
links_small.dropna(subset=['tmdbId'], inplace=True)
ratings_small.dropna(subset=['movieId'], inplace=True)
                             
# Merge datasets
metadata_keywords = pd.merge(movies_metadata, keywords, on='id')
metadata_credits = pd.merge(movies_metadata, credits, on='id')
metadata_links = pd.merge(movies_metadata, links_small, left_on='id', right_on='tmdbId')

# Merge all datasets to one
comprehensive_df = pd.concat([movies_metadata, ratings_small], ignore_index=True)
comprehensive_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,userId,movieId,rating,timestamp
0,0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,0.0,7.7,5415.0,,,,
1,0,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,0.0,6.9,2413.0,,,,
2,0,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602.0,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,0.0,6.5,92.0,,,,
3,0,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357.0,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,0.0,6.1,34.0,,,,
4,0,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862.0,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,0.0,5.7,173.0,,,,


In [75]:
# Split data into features and target
X = comprehensive_df[['userId', 'movieId']]
y = comprehensive_df['rating']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [76]:
# Determine the number of unique users and movies
num_users = X['userId'].nunique()
num_movies = X['movieId'].nunique()

# Define the dimensions for user and movie embeddings
embedding_dim = 32  

# Define the input layers for user and movie IDs
user_input = Input(shape=(1,))
movie_input = Input(shape=(1,))

# Embedding layers to represent users and movies in a lower-dimensional space
user_embedding = Embedding(input_dim=num_users, output_dim=embedding_dim)(user_input)
movie_embedding = Embedding(input_dim=num_movies, output_dim=embedding_dim)(movie_input)

# Flatten the embedding layers to prepare for concatenation
user_flatten = Flatten()(user_embedding)
movie_flatten = Flatten()(movie_embedding)

# Concatenate user and movie embeddings
concat = Concatenate()([user_flatten, movie_flatten])

# Fully connected layer with ReLU activation function
dense1 = Dense(128, activation='relu')(concat)

# Output layer to predict ratings
output = Dense(1)(dense1)


1. Loading Data:

* We first loaded the datasets movies_metadata.csv, keywords.csv, credits.csv, links_small.csv, and ratings_small.csv into pandas DataFrames.

2. Data Cleaning:

* We then changed the datatype of ID columns (id, tmdbId, movieId) to numeric to ensure consistency and compatibility. And dropped rows with missing ID values to avoid any issues during merging.
  
3. Merging Datasets: 

* We merged the datasets based on common ID columns to create comprehensive dataframes as below:
    * Merge movies_metadata with keywords on the id column to include keyword information.
    * Merge movies_metadata with credits on the id column to include credits information.
    * Merge movies_metadata with links_small on the id and tmdbId columns to include links information.
    * Concatenate movies_metadata with ratings_small along rows to include ratings information.

4. Data Preprocessing:

* Split the data into features (X) and target (y) where features are the user IDs and movie IDs, and the target is the ratings.
* Split the data into training and testing sets using train_test_split.
* Determine the number of unique users and movies in the dataset.
* Define the embedding dimensions for user and movie embeddings.
* Define the model architecture using TensorFlow's Keras API. This includes defining input layers, embedding layers, flattening layers, concatenation layer, dense layers, and output layer.

***By following these steps, we ensure that the data is cleaned, merged, and prepared in a format suitable for building a recommender system. Each dataset contributes relevant information that can enhance the recommendations provided to users based on their preferences and interactions with movies.***



In [77]:
# Build model
model = Model(inputs=[user_input, movie_input], outputs=output)

# Compile the model with mean squared error loss and mean squared error as the metric
model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=[MeanSquaredError()])


In [78]:
# Filter rows where userId and movieId are within the expected range
# by creating a boolean mask based on the conditions for valid indices
valid_indices = (X_train['userId'] >= 0) & (X_train['userId'] < num_users) & \
                (X_train['movieId'] >= 0) & (X_train['movieId'] < num_movies)

# Apply the boolean mask to filter the training data and labels
X_train_filtered = X_train[valid_indices]
y_train_filtered = y_train[valid_indices]

# Train the model using the filtered data
model.fit([X_train_filtered['userId'], X_train_filtered['movieId']], y_train_filtered, 
          epochs=10, batch_size=64, validation_split=0.1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1551dae8940>

### Interpret results

Looking at the training and validation performance metrics, I can see that the loss and mean squared error (MSE) values gradually decrease over the epochs for both the training and validation datasets. This indicates that the model is learning and improving its predictive capabilities over time.

However, it's important to note that towards the later epochs, especially around epochs 8 to 10, the validation loss and MSE start to slightly increase, while the training loss and MSE continue to decrease. This divergence between the training and validation metrics suggests that the model might be starting to overfit the training data, meaning it is becoming too specialized and less able to generalize to unseen dat data.

### Conclusion/recommendations


Based on the training and validation performance of the model, it's evident that the model has the capacity to learn from the data and make predictions. However, there are indications of potential overfitting towards the later epochs, as seen from the divergence between the training and validation metrics.

To address this issue and enhance the model's generalization ability, I recommend implementing regularization techniques such as dropout or L2 regularization. These techniques can help prevent the model from becoming too specialized to the training data and improve its ability to generalize to unseen data.

Additionally, fine-tuning the model architecture and exploring different hyperparameters such as learning rates or optimizer settings could further enhance the model's performance.

Furthermore, increasing the amount of training data available could also contribute to improving the model's generalization performance by providing more diverse examples for the model to learn from.

Overall, by incorporating these recommendations and iteratively evaluating the model's performance, we can strive to develop a robust recommender system that provides accurate and reliable recommendations for users.