<a href="https://colab.research.google.com/github/Sneya9205/Internship/blob/main/2022506062_SneyaGabreate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Content-Based Movie Similarity Using TensorFlow: Finding Similar Movies with Title Embeddings

Install and Import

In [29]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs


Loading the data

In [30]:
# Load movie metadata.
movies = tfds.load('movielens/100k-movies', split="train")

# Extract movie titles.
movies = movies.map(lambda x: x["movie_title"])

# Create movie title vocabulary.
movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
movie_titles_vocabulary.adapt(movies)


Model Definition

In [32]:
class MovieSimilarityModel(tf.keras.Model):
    def __init__(self, movie_model):
        super().__init__()
        self.movie_model = movie_model

    def call(self, inputs):
        # Get embeddings for movie titles.
        movie_embeddings = self.movie_model(inputs)
        # Normalize embeddings for cosine similarity.
        normalized_embeddings = tf.math.l2_normalize(movie_embeddings, axis=1)
        return normalized_embeddings

# Movie model using embeddings.
movie_model = tf.keras.Sequential([
    movie_titles_vocabulary,
    tf.keras.layers.Embedding(movie_titles_vocabulary.vocabulary_size(), 64)
])

# Create similarity model instance.
similarity_model = MovieSimilarityModel(movie_model)


Evaluation

In [35]:
# Sample query movie title.
query_movie = np.array(["From Dusk Till Dawn (1996)"])

# Embedding function for the query.
query_embeddings = similarity_model(query_movie)

# Calculate similarity scores using cosine similarity.
similarity_scores = tf.linalg.matmul(query_embeddings, tf.transpose(similarity_model.movie_model.weights[1]))

# Get indices of most similar movies.
top_k = tf.math.top_k(similarity_scores, k=5)

# Convert movies dataset to a list of movie titles.
movie_titles_list = list(movies.as_numpy_iterator())

# Extract movie titles from the list using indices.
top_movie_titles = [movie_titles_list[idx] for idx in top_movie_indices]

print(f"Top 5 similar movies to '{query_movie[0]}':")
for i, title in enumerate(top_movie_titles):
    print(f"{i+1}: {title}")


Top 5 similar movies to 'From Dusk Till Dawn (1996)':
1: b'Death and the Maiden (1994)'
2: b'Bedknobs and Broomsticks (1971)'
3: b"Some Mother's Son (1996)"
4: b'Notorious (1946)'
5: b'Zeus and Roxanne (1997)'
