In this project, I developed a movie recommendation system that uses The Movie Database (TMDb) API to retrieve and preprocess movie data, incorporating genres, overviews, and user reviews for personalized suggestions. Using text vectorization (scikit-learn's CountVectorizer in the code below; a TF-IDF vectorizer is a natural alternative) and cosine similarity, I created a content-based filtering algorithm that measures movie similarity and provides tailored movie recommendations. The project is implemented as a Google Colab notebook that mounts Google Drive, imports the necessary libraries (pandas, requests, scikit-learn, and Gradio), and integrates a Gradio interface that interactively recommends movies and displays user reviews for each recommendation.

In [1]:
# Mount Google Drive at /content/drive; the CSV produced at the end of this
# cell is written under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Import Libraries
import pandas as pd
import requests
import time

# TMDb API Key
# NOTE(review): a credential is committed in this notebook. Prefer supplying it
# through the TMDB_API_KEY environment variable and rotate the literal below;
# the literal is kept only as a fallback so existing runs keep working.
import os

api_key = os.environ.get('TMDB_API_KEY') or '5552246a1142026c763b3f08f23dea10'

# Function to fetch movie data by ID
def fetch_movie_data(movie_id, timeout=10):
    """Fetch the TMDb details record for a single movie.

    Args:
        movie_id: TMDb movie ID, interpolated into the request path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole bulk fetch.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise ``None``. Failures are
        printed rather than raised so a caller looping over many IDs can skip
        the bad one and continue.
    """
    url = f'https://api.themoviedb.org/3/movie/{movie_id}'
    try:
        response = requests.get(url, params={'api_key': api_key}, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failure.
        print(f"Failed to fetch data for movie ID {movie_id}: {exc}")
        return None
    if response.status_code == 200:
        return response.json()
    print(f"Failed to fetch data for movie ID {movie_id}: {response.status_code}")
    return None

# Function to fetch multiple movies
def fetch_multiple_movies(movie_ids, delay=0.25):
    """Fetch the details record for every ID in ``movie_ids``.

    Args:
        movie_ids: Iterable of TMDb movie IDs to request, in order.
        delay: Seconds to pause after each request as client-side rate
            limiting. Defaults to the previously hard-coded 0.25; pass 0 to
            disable the pause.

    Returns:
        A list of the payloads returned by ``fetch_movie_data``; IDs whose
        lookup produced a falsy result (e.g. ``None``) are skipped.
    """
    movies = []
    for movie_id in movie_ids:
        movie_data = fetch_movie_data(movie_id)
        if movie_data:
            movies.append(movie_data)
        if delay > 0:
            time.sleep(delay)  # Rate-limiting to avoid API overload
    return movies

# Function to fetch reviews for a movie
def fetch_movie_reviews(movie_id, max_reviews=3, timeout=10):
    """Fetch user reviews for a movie and merge them into one string.

    Args:
        movie_id: TMDb movie ID, interpolated into the request path.
        max_reviews: How many of the leading reviews in the response to keep
            (defaults to the previously hard-coded 3).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The kept review texts joined by single spaces, or the literal
        ``"No reviews available."`` when the request fails *or* the movie has
        no review text. The original returned an empty string in the latter
        case, which rendered as a blank "Reviews:" field.
    """
    fallback = "No reviews available."
    url = f'https://api.themoviedb.org/3/movie/{movie_id}/reviews'
    try:
        response = requests.get(url, params={'api_key': api_key}, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch reviews for movie ID {movie_id}: {exc}")
        return fallback
    if response.status_code != 200:
        print(f"Failed to fetch reviews for movie ID {movie_id}: {response.status_code}")
        return fallback
    reviews = response.json().get('results', [])
    # Skip entries without text so the joined string has no stray gaps.
    texts = [review['content'] for review in reviews[:max_reviews] if review.get('content')]
    return ' '.join(texts) or fallback

# Add reviews to the movie data
def preprocess_movie_data_with_reviews(movies):
    """Flatten raw movie payloads into a DataFrame, adding a reviews column.

    Args:
        movies: Iterable of dict-like movie payloads. Entries missing any of
            the required keys are skipped without fetching their reviews.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``title``, ``genres``
        (list of genre names), ``popularity``, ``vote_average``, ``overview``
        and ``reviews``. The columns are present even when no movie
        qualifies, so later lookups such as ``df['title']`` do not raise
        ``KeyError`` on an empty result.
    """
    required = ['id', 'title', 'genres', 'popularity', 'vote_average', 'overview']
    columns = required + ['reviews']
    movie_list = []
    for movie in movies:
        if not all(key in movie for key in required):
            continue
        movie_list.append({
            'id': movie['id'],
            'title': movie['title'],
            'genres': [genre['name'] for genre in movie['genres']],
            'popularity': movie['popularity'],
            'vote_average': movie['vote_average'],
            'overview': movie['overview'],
            'reviews': fetch_movie_reviews(movie['id']),  # Fetch reviews for the movie
        })
    return pd.DataFrame(movie_list, columns=columns)

# Fetch data for movie IDs 1 to 99 (range's upper bound is exclusive; adjust as needed)
movie_ids = range(1, 100)
movies = fetch_multiple_movies(movie_ids)  # Fetch the movies

# Check if movies were fetched correctly
if not movies:
    print("No movies fetched. Check the API key and movie IDs.")
else:
    print(f"Fetched {len(movies)} movies.")

# Build the DataFrame unconditionally so `df` is always defined for later cells
df = preprocess_movie_data_with_reviews(movies)
output_path = '/content/drive/My Drive/tmdb_movies_with_reviews.csv'
if df.empty:
    # Do not overwrite a previously saved dataset with an empty file.
    print(f"No rows to save; {output_path} was left untouched.")
else:
    df.to_csv(output_path, index=False)  # Save to file
    print(f"Data with reviews saved to {output_path}!")


Mounted at /content/drive
Failed to fetch data for movie ID 1: 404
Failed to fetch data for movie ID 4: 404
Failed to fetch data for movie ID 7: 404
Failed to fetch data for movie ID 10: 404
Failed to fetch data for movie ID 23: 404
Failed to fetch data for movie ID 29: 404
Failed to fetch data for movie ID 30: 404
Failed to fetch data for movie ID 31: 404
Failed to fetch data for movie ID 32: 404
Failed to fetch data for movie ID 34: 404
Failed to fetch data for movie ID 36: 404
Failed to fetch data for movie ID 37: 404
Failed to fetch data for movie ID 39: 404
Failed to fetch data for movie ID 40: 404
Failed to fetch data for movie ID 41: 404
Failed to fetch data for movie ID 42: 404
Failed to fetch data for movie ID 43: 404
Failed to fetch data for movie ID 44: 404
Failed to fetch data for movie ID 45: 404
Failed to fetch data for movie ID 46: 404
Failed to fetch data for movie ID 47: 404
Failed to fetch data for movie ID 48: 404
Failed to fetch data for movie ID 49: 404
Failed to f

In [2]:
def recommend_movies_with_reviews(movie_title, top_n=5):
    """
    Recommend movies based on the given movie title and include user reviews.

    Reads the module-level ``df`` (needs ``title`` and ``reviews`` columns) and
    ``cosine_sim`` (row ``i`` holds the similarity of movie ``i`` to every
    movie, in the same order as ``df``'s rows).

    Args:
        movie_title (str): Title of the movie to find recommendations for.
            Matching is case-insensitive and ignores surrounding whitespace.
        top_n (int): Number of recommendations to return.

    Returns:
        List of ``{'title': ..., 'reviews': ...}`` dicts ordered by descending
        similarity, or an error message string if the title is not found.
    """
    wanted = movie_title.strip().lower()
    title_mask = df['title'].str.lower().eq(wanted).fillna(False).to_numpy(dtype=bool)
    positions = title_mask.nonzero()[0]
    if positions.size == 0:
        return f"Movie '{movie_title}' not found in the dataset."
    # Work with row positions: cosine_sim and .iloc are both positional.
    idx = int(positions[0])

    # Exclude the query movie by position instead of dropping the first sorted
    # entry: when another movie ties at the top score, the query is not
    # guaranteed to sort first and would otherwise be recommended to itself.
    candidates = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [
        {'title': df['title'].iloc[pos], 'reviews': df['reviews'].iloc[pos]}
        for pos, _ in candidates[:max(top_n, 0)]
    ]


In [3]:
# Gradio Interface with Reviews
!pip install gradio # Install gradio library
import gradio as gr # Import gradio and alias as 'gr'


Collecting gradio
  Downloading gradio-5.13.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.6.0 (from gradio)
  Downloading gradio_client-1.6.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.9.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [4]:
# Mount Google Drive at /content/drive; the CSV saved further down in this
# cell is written under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# Import Libraries
import pandas as pd
import requests
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# TMDb API Key
# NOTE(review): a credential is committed in this notebook. Prefer supplying it
# through the TMDB_API_KEY environment variable and rotate the literal below;
# the literal is kept only as a fallback so existing runs keep working.
import os

api_key = os.environ.get('TMDB_API_KEY') or '5552246a1142026c763b3f08f23dea10'

# Function to fetch movie data by ID
def fetch_movie_data(movie_id, timeout=10):
    """Fetch the TMDb details record for a single movie.

    Args:
        movie_id: TMDb movie ID, interpolated into the request path.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the bulk fetch.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise ``None`` (including
        when the request itself fails, e.g. on a timeout or connection error).
    """
    url = f'https://api.themoviedb.org/3/movie/{movie_id}'
    try:
        response = requests.get(url, params={'api_key': api_key}, timeout=timeout)
    except requests.RequestException:
        # Same contract as a non-200 answer: the caller just skips this ID.
        return None
    return response.json() if response.status_code == 200 else None

# Function to fetch multiple movies
def fetch_multiple_movies(movie_ids, delay=0.25):
    """Look up each ID in ``movie_ids`` and collect the successful results.

    Args:
        movie_ids: Iterable of TMDb movie IDs to request, in order.
        delay: Pause in seconds after every request (client-side rate
            limiting). Defaults to the previously hard-coded 0.25; a value
            of 0 or less skips the pause.

    Returns:
        List of truthy payloads returned by ``fetch_movie_data``, in request
        order; failed lookups are omitted.
    """
    movies = []
    for movie_id in movie_ids:
        movie_data = fetch_movie_data(movie_id)
        if movie_data:
            movies.append(movie_data)
        if delay > 0:
            time.sleep(delay)  # Rate-limiting to avoid API overload
    return movies

# Function to fetch reviews for a movie
def fetch_movie_reviews(movie_id, max_reviews=3, timeout=10):
    """Fetch user reviews for a movie and merge them into one string.

    Args:
        movie_id: TMDb movie ID, interpolated into the request path.
        max_reviews: How many of the leading reviews in the response to keep
            (defaults to the previously hard-coded 3).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The kept review texts joined by single spaces, or the literal
        ``"No reviews available."`` when the request fails or the movie has
        no review text (the original returned ``''`` for the latter, which
        showed up as a blank "Reviews:" entry).
    """
    fallback = "No reviews available."
    url = f'https://api.themoviedb.org/3/movie/{movie_id}/reviews'
    try:
        response = requests.get(url, params={'api_key': api_key}, timeout=timeout)
    except requests.RequestException:
        return fallback
    if response.status_code != 200:
        return fallback
    reviews = response.json().get('results', [])
    texts = [review['content'] for review in reviews[:max_reviews] if review.get('content')]
    return ' '.join(texts) or fallback

# Add reviews to the movie data
def preprocess_movie_data_with_reviews(movies):
    """Turn raw movie payloads into a tabular dataset enriched with reviews.

    Args:
        movies: Iterable of dict-like movie payloads. A payload is used only
            if it contains every required key; others are skipped and no
            review request is made for them.

    Returns:
        A ``pandas.DataFrame`` with columns ``id``, ``title``, ``genres``
        (list of genre names), ``popularity``, ``vote_average``, ``overview``
        and ``reviews``. The column set is fixed, so an empty result still
        supports lookups such as ``df['title']``.
    """
    required = ['id', 'title', 'genres', 'popularity', 'vote_average', 'overview']
    columns = required + ['reviews']
    movie_list = []
    for movie in movies:
        if not all(key in movie for key in required):
            continue
        movie_list.append({
            'id': movie['id'],
            'title': movie['title'],
            'genres': [genre['name'] for genre in movie['genres']],
            'popularity': movie['popularity'],
            'vote_average': movie['vote_average'],
            'overview': movie['overview'],
            'reviews': fetch_movie_reviews(movie['id']),
        })
    return pd.DataFrame(movie_list, columns=columns)

# Fetch data for movie IDs 1 to 49 (range's upper bound is exclusive)
movie_ids = range(1, 50)
movies = fetch_multiple_movies(movie_ids)

# Check movies
if not movies:
    # Everything below needs `df`; stop here with a clear message instead of
    # failing later with a NameError when `df` is first used.
    raise RuntimeError("No movies fetched.")

df = preprocess_movie_data_with_reviews(movies)
output_path = '/content/drive/My Drive/tmdb_movies_with_reviews.csv'
df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}!")

# Create a similarity matrix
def create_similarity_matrix(df):
    """Build a pairwise cosine-similarity matrix from genres and overviews.

    Args:
        df: DataFrame with ``genres`` and ``overview`` columns. A ``combined``
            text column is added to it in place (kept from the original
            behavior).

    Returns:
        The result of ``cosine_similarity`` over the token-count matrix of the
        combined text; entry ``[i][j]`` compares row ``i`` with row ``j``.
    """
    # A missing overview would turn the whole concatenation into NaN, which
    # the vectorizer cannot tokenize; treat it as empty text instead.
    overview = df['overview'].fillna('').astype(str)
    df['combined'] = df['genres'].astype(str) + " " + overview
    vectorizer = CountVectorizer()
    count_matrix = vectorizer.fit_transform(df['combined'])
    return cosine_similarity(count_matrix)

# Computed once at cell execution; recommend_movies_with_reviews reads this
# module-level matrix on every call.
cosine_sim = create_similarity_matrix(df)

# Recommend movies
def recommend_movies_with_reviews(movie_title, top_n=5):
    """Return the movies most similar to ``movie_title`` with their reviews.

    Reads the module-level ``df`` (needs ``title`` and ``reviews`` columns) and
    ``cosine_sim`` (row ``i`` holds the similarity of movie ``i`` to every
    movie, in the same order as ``df``'s rows).

    Args:
        movie_title (str): Title to look up; matching is case-insensitive and
            ignores surrounding whitespace.
        top_n (int): Maximum number of recommendations to return.

    Returns:
        List of ``{'title': ..., 'reviews': ...}`` dicts ordered by descending
        similarity, or an error message string if the title is not found.
    """
    wanted = movie_title.strip().lower()
    title_mask = df['title'].str.lower().eq(wanted).fillna(False).to_numpy(dtype=bool)
    positions = title_mask.nonzero()[0]
    if positions.size == 0:
        return f"Movie '{movie_title}' not found in the dataset."
    # Use the row position, since cosine_sim and .iloc are positional.
    idx = int(positions[0])

    # Drop the query movie explicitly; slicing off the first sorted entry is
    # wrong when another movie ties with it at the top similarity score.
    candidates = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [
        {'title': df['title'].iloc[pos], 'reviews': df['reviews'].iloc[pos]}
        for pos, _ in candidates[:max(top_n, 0)]
    ]

# Gradio Interface
def gradio_recommend_with_reviews(movie_title):
    """Format recommendations for ``movie_title`` as plain text for the UI.

    Calls ``recommend_movies_with_reviews``. A non-list result (its error
    message) is passed through unchanged; a list is rendered as a header line
    followed by one "Title / Reviews" entry per recommendation.
    """
    result = recommend_movies_with_reviews(movie_title)
    if not isinstance(result, list):
        return result

    header = f"Movies similar to '{movie_title}':\n"
    entries = [
        f"\nTitle: {item['title']}\nReviews: {item['reviews']}\n"
        for item in result
    ]
    return header + "".join(entries)

# Create and Launch Gradio Interface
# One free-text box for the query title, one text box for the rendered result.
title_input = gr.Textbox(label="Enter a Movie Title")
results_output = gr.Textbox(label="Recommended Movies with Reviews")

interface = gr.Interface(
    fn=gradio_recommend_with_reviews,
    inputs=title_input,
    outputs=results_output,
    title="Movie Recommendation System",
    description="Enter a movie title to get recommendations and reviews.",
)

# Start serving the UI.
interface.launch()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data saved to /content/drive/My Drive/tmdb_movies_with_reviews.csv!
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4ae39630145554a560.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


