Movie Recommendation System

 Users often struggle to find movies that align with their tastes or preferences. With the increasing volume of content, providing personalized recommendations has become a critical feature for enhancing user experience and engagement.

In [None]:
'''Creating a movie recommendation project using The Movie Database (TMDb) API involves several steps:
1. data retrieval,
2. data processing,
3. building the recommendation algorithm.'''

# 1. Setup and Data Retrieval



import requests
import pandas as pd

# NOTE(review): the TMDb API key is hard-coded in the source, so anyone who can
# read this file can use it. Consider loading it from an environment variable
# or a secrets store instead, and rotating this key.
api_key = '5552246a1142026c763b3f08f23dea10'

def fetch_movie_data(movie_id):
    """Fetch the TMDb "movie details" JSON payload for a single movie ID.

    Args:
        movie_id: TMDb movie identifier, interpolated into the request URL.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an unknown ID yields TMDb's error payload (``success`` /
        ``status_code`` / ``status_message`` keys) instead of an exception.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    url = f'https://api.themoviedb.org/3/movie/{movie_id}'
    # Send the key through ``params`` so it is URL-encoded, and always set a
    # timeout: requests waits indefinitely by default, which would stall a
    # bulk fetch on a single hung connection.
    response = requests.get(url, params={'api_key': api_key}, timeout=30)
    return response.json()


In [None]:

# Example: fetch data for multiple movies (one HTTP request per ID).
movie_ids = list(range(1, 10000)) # IDs 1 through 9999 -- the end of range() is exclusive
movies = [fetch_movie_data(movie_id) for movie_id in movie_ids]

# Convert the raw payloads to a DataFrame. Payloads for IDs TMDb does not know
# are kept too; they appear as rows where only the success/status_* columns
# are filled in.
df = pd.DataFrame(movies)
print(df.head())

  success  status_code                                  status_message  adult  \
0   False         34.0  The resource you requested could not be found.    NaN   
1     NaN          NaN                                             NaN  False   
2     NaN          NaN                                             NaN  False   
3   False         34.0  The resource you requested could not be found.    NaN   
4     NaN          NaN                                             NaN  False   

                      backdrop_path belongs_to_collection     budget  \
0                               NaN                   NaN        NaN   
1  /hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg                  None        0.0   
2  /l94l89eMmFKh7na2a1u5q67VgNx.jpg                  None        0.0   
3                               NaN                   NaN        NaN   
4  /f2t4JbUvQIjUF5FstG1zZFAp02N.jpg                  None  4000000.0   

                                              genres  \
0                       

In [None]:
# Extract the relevant fields of each movie into a dictionary and convert the result to a DataFrame

def preprocess_movie_data(movies):
    """Build a tidy DataFrame from raw TMDb movie payloads.

    Payloads missing any of the required keys (for example the error payloads
    returned for unknown IDs) are skipped.

    Args:
        movies: iterable of dicts as returned by the TMDb movie-details
            endpoint.

    Returns:
        A DataFrame with the columns ``id``, ``title``, ``genres`` (list of
        genre names), ``popularity``, ``vote_average`` and ``overview``. The
        columns are present even when no payload qualifies, so downstream
        column lookups do not fail on an empty result.
    """
    columns = ['id', 'title', 'genres', 'popularity', 'vote_average', 'overview']
    movie_list = []
    for movie in movies:
        if not all(key in movie for key in columns):
            continue
        movie_info = {key: movie[key] for key in columns}
        # Keep only the genre names; the payload stores {'id', 'name'} dicts.
        movie_info['genres'] = [genre['name'] for genre in movie['genres']]
        movie_list.append(movie_info)
    return pd.DataFrame(movie_list, columns=columns)

# Preprocess the fetched movie data. This rebinds `df`, replacing the raw
# DataFrame that was built directly from the API payloads.
df = preprocess_movie_data(movies)
print(df.head())


   id                             title                           genres  \
0   2                             Ariel  [Drama, Comedy, Romance, Crime]   
1   3               Shadows in Paradise         [Comedy, Drama, Romance]   
2   5                        Four Rooms                         [Comedy]   
3   6                    Judgment Night        [Action, Crime, Thriller]   
4   8  Life in Loops (A Megacities RMX)                    [Documentary]   

   popularity  vote_average                                           overview  
0      16.275         7.100  After the coal mine he works at closes and his...  
1      16.204         7.300  Nikander, a rubbish collector and would-be ent...  
2      31.170         5.836  It's Ted the Bellhop's first night on the job....  
3      20.581         6.488  Four young friends, while taking a shortcut en...  
4       3.440         7.300  Timo Novotny labels his new project an experim...  


In [None]:
# Print the first fetched payload (the response for movie ID 1, the first
# entry of movie_ids) to inspect its structure
print(movies[0])


{'success': False, 'status_code': 34, 'status_message': 'The resource you requested could not be found.'}


In [None]:
### 3. Building the Recommendation System

'''There are various approaches to build a recommendation system, including content-based filtering and collaborative filtering. Here, we'll start with a content-based filtering approach using movie genres and overviews.'''

#### Content-Based Filtering
'''Create a function to calculate similarity between movies:'''

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def compute_similarity(df):
    """Compute pairwise content similarity between the movies in ``df``.

    Each movie's genre names and overview are joined into one text, the texts
    are vectorized with TF-IDF (English stop words removed), and the pairwise
    dot products of the resulting vectors are returned.

    Side effect: a ``content`` column holding the combined text is added to
    ``df`` in place.

    Args:
        df: DataFrame with a ``genres`` column (iterables of strings) and an
            ``overview`` column (strings, possibly missing).

    Returns:
        The square similarity matrix produced by ``linear_kernel``; entry
        ``[i, j]`` compares the i-th and j-th rows of ``df``.
    """
    genre_text = df['genres'].apply(' '.join)
    # A missing overview would turn the whole concatenation into NaN, which
    # TfidfVectorizer rejects; fall back to the genres alone in that case.
    df['content'] = genre_text + ' ' + df['overview'].fillna('')

    # Use TF-IDF Vectorizer to convert text data into numerical data
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['content'])

    # TfidfVectorizer L2-normalizes each row by default, so the plain dot
    # product computed by linear_kernel equals the cosine similarity.
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    return cosine_sim


In [None]:

# Calculate the similarity matrix. compute_similarity also adds a 'content'
# column to df as a side effect.
cosine_sim = compute_similarity(df)

# Function to get movie recommendations
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the (up to) 10 movies most similar to ``title``.

    Looks the movie up in the module-level ``df`` by exact title match and
    ranks every other row by its score in ``cosine_sim``.

    Args:
        title: exact (case-sensitive) movie title to look up in ``df``.
        cosine_sim: square similarity matrix whose rows/columns follow the row
            order of ``df``. Defaults to the matrix computed when this
            function was defined.

    Returns:
        A Series with the recommended titles, best match first.

    Raises:
        IndexError: if ``title`` is not present in ``df``.
    """
    matches = (df['title'] == title).tolist()
    if True not in matches:
        raise IndexError(f"Movie '{title}' not found in the dataset.")
    row = matches.index(True)  # positional row of the first matching title

    scores = cosine_sim[row]
    # Exclude the queried movie explicitly instead of dropping whatever sorts
    # first: a movie with identical content earlier in the table would tie
    # with it and push the queried movie itself into the results.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != row),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    return df['title'].iloc[ranked[:10]]

# Get recommendations for one example movie, 'Fight Club'
recommendations = get_recommendations('Fight Club')
print(recommendations)

916     Me and You and Everyone We Know
1896                         Angel Baby
4101                          Interview
4196                   The Fashionistas
450                      The Experiment
3059                    Finders Keepers
2630                      The Machinist
1610                              Kafka
3814                     Running Scared
365                            The Hole
Name: title, dtype: object


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import numpy as np

In [None]:
# Save raw data, preprocessed data, cosine similarity matrix, and recommendations to Google Drive.
# NOTE(review): these paths use 'My Drive' while the later load cells read from
# 'MyDrive'. The recorded outputs show those loads succeeding, so presumably
# both names resolve to the same folder in Colab -- confirm before reusing.
df_raw = pd.DataFrame(movies)
df_raw.to_csv('/content/drive/My Drive/tmdb_raw_data.csv', index=False)
# NOTE(review): the reloaded table shows 'genres' as quoted text such as
# "['Drama', 'Comedy']", i.e. the lists appear to come back from CSV as
# strings -- parse them again if list values are needed after loading.
df.to_csv('/content/drive/My Drive/tmdb_preprocessed_data.csv', index=False)
np.save('/content/drive/My Drive/tmdb_cosine_sim.npy', cosine_sim)
recommendations.to_csv('/content/drive/My Drive/tmdb_recommendations.csv', index=False)

print("Data saved to Google Drive.")

Data saved to Google Drive.


In [None]:
/content/drive/MyDrive/tmdb_cosine_sim.npy

In [None]:
import numpy as np

# Load the saved cosine similarity matrix
cosine_sim = np.load('/content/drive/MyDrive/tmdb_cosine_sim.npy')


In [None]:
import pandas as pd

# Load preprocessed movie data
df_preprocessed = pd.read_csv('/content/drive/MyDrive/tmdb_preprocessed_data.csv')

# Optionally, load raw data if needed
df_raw = pd.read_csv('/content/drive/MyDrive/tmdb_raw_data.csv')


In [None]:
def get_recommendations(title, df, cosine_sim):
    """Return the titles of the (up to) 10 movies most similar to ``title``.

    Args:
        title: movie title to look up; matching is case-insensitive and the
            first matching row is used.
        df: DataFrame with a ``title`` column. Its row order must match the
            row/column order of ``cosine_sim``; the index labels are
            irrelevant.
        cosine_sim: square similarity matrix, indexable as
            ``cosine_sim[row][col]``.

    Returns:
        A Series with the recommended titles (best match first), or a
        "not found" message string when ``title`` is not in ``df``.

    Raises:
        ValueError: if ``df`` has no ``title`` column.
    """
    if 'title' not in df.columns:
        raise ValueError("DataFrame must contain 'title' column")

    # Case-insensitive match, computed once for both the check and the lookup.
    title_lower = title.lower()
    matches = (df['title'].str.lower() == title_lower).tolist()
    if True not in matches:
        return f"Movie '{title}' not found in the dataset."

    # Use the row *position*, not the index label: cosine_sim is addressed by
    # position, so a label would pick the wrong row (or fail) whenever df does
    # not carry the default 0..n-1 index.
    row = matches.index(True)
    scores = cosine_sim[row]

    # Rank every other movie by similarity. The queried movie is excluded
    # explicitly rather than by dropping the first sorted entry, which is only
    # correct when no earlier row ties with its self-similarity.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != row),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    # Return the titles of the top 10 most similar movies
    return df['title'].iloc[ranked[:10]]

# Example usage
recommendations = get_recommendations('Fight Club', df_preprocessed, cosine_sim)
print("Recommendations for 'Fight Club':")
print(recommendations)


Recommendations for 'Fight Club':
916     Me and You and Everyone We Know
1896                         Angel Baby
4101                          Interview
4196                   The Fashionistas
450                      The Experiment
3059                    Finders Keepers
2630                      The Machinist
1610                              Kafka
3814                     Running Scared
365                            The Hole
Name: title, dtype: object


In [None]:
tmdb_raw_data = pd.read_csv('/content/drive/MyDrive/tmdb_raw_data.csv')

In [None]:
tmdb_raw_data

Unnamed: 0,success,status_code,status_message,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,34.0,The resource you requested could not be found.,,,,,,,,...,,,,,,,,,,
1,,,,False,/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,2.0,...,1988-10-21,0.0,73.0,"[{'english_name': 'Finnish', 'iso_639_1': 'fi'...",Released,,Ariel,False,7.100,326.0
2,,,,False,/l94l89eMmFKh7na2a1u5q67VgNx.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,3.0,...,1986-10-17,0.0,74.0,"[{'english_name': 'Finnish', 'iso_639_1': 'fi'...",Released,,Shadows in Paradise,False,7.300,369.0
3,False,34.0,The resource you requested could not be found.,,,,,,,,...,,,,,,,,,,
4,,,,False,/f2t4JbUvQIjUF5FstG1zZFAp02N.jpg,,4000000.0,"[{'id': 35, 'name': 'Comedy'}]",https://www.miramax.com/movie/four-rooms/,5.0,...,1995-12-09,4257354.0,98.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,5.836,2591.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,,,,False,/9h90ga8OYzcHAVHCuwRuFYQQNm.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,9995.0,...,2000-09-06,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,How you gonna win?,Turn It Up,False,4.900,20.0
9995,False,34.0,The resource you requested could not be found.,,,,,,,,...,,,,,,,,,,
9996,,,,False,/381VJtVELz6gH7NwsLNVeXHIxVp.jpg,,200000.0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 27, 'na...",,9997.0,...,2007-11-15,1395610.0,109.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Far From Grace,Gabriel,False,6.006,392.0
9997,,,,False,,,0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,9998.0,...,2004-01-01,0.0,91.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,In The Soul Of A Woman Beats The Heart Of A Hero.,Forbidden Warrior,False,4.094,16.0


In [None]:
tmdb_preprocessed_data = pd.read_csv('/content/drive/MyDrive/tmdb_preprocessed_data.csv')

In [None]:
tmdb_preprocessed_data

Unnamed: 0,id,title,genres,popularity,vote_average,overview,content
0,2,Ariel,"['Drama', 'Comedy', 'Romance', 'Crime']",16.275,7.100,After the coal mine he works at closes and his...,Drama Comedy Romance Crime After the coal mine...
1,3,Shadows in Paradise,"['Comedy', 'Drama', 'Romance']",16.204,7.300,"Nikander, a rubbish collector and would-be ent...","Comedy Drama Romance Nikander, a rubbish colle..."
2,5,Four Rooms,['Comedy'],31.170,5.836,It's Ted the Bellhop's first night on the job....,Comedy It's Ted the Bellhop's first night on t...
3,6,Judgment Night,"['Action', 'Crime', 'Thriller']",20.581,6.488,"Four young friends, while taking a shortcut en...","Action Crime Thriller Four young friends, whil..."
4,8,Life in Loops (A Megacities RMX),['Documentary'],3.440,7.300,Timo Novotny labels his new project an experim...,Documentary Timo Novotny labels his new projec...
...,...,...,...,...,...,...,...
5133,9994,The Great Mouse Detective,"['Animation', 'Family', 'Adventure', 'Mystery']",32.132,7.058,When the diabolical Professor Ratigan kidnaps ...,Animation Family Adventure Mystery When the di...
5134,9995,Turn It Up,"['Action', 'Crime', 'Drama']",6.180,4.900,Trying to bootstrap his way out of Brooklyn's ...,Action Crime Drama Trying to bootstrap his way...
5135,9997,Gabriel,"['Fantasy', 'Horror', 'Action', 'Science Ficti...",24.690,6.006,Gabriel tells the story of an archangel who fi...,Fantasy Horror Action Science Fiction Gabriel ...
5136,9998,Forbidden Warrior,"['Adventure', 'Fantasy', 'Science Fiction']",5.079,4.094,The Gaia Za is a sacred tome that holds many a...,Adventure Fantasy Science Fiction The Gaia Za ...


In [11]:
import numpy as np
import pandas as pd

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
cosine_sim = np.load("/content/drive/MyDrive/tmdb_cosine_sim.npy")

In [14]:
cosine_sim

array([[1.        , 0.01901056, 0.00666977, ..., 0.02875595, 0.        ,
        0.03025197],
       [0.01901056, 1.        , 0.05501074, ..., 0.        , 0.        ,
        0.01232843],
       [0.00666977, 0.05501074, 1.        , ..., 0.03755522, 0.        ,
        0.01906178],
       ...,
       [0.02875595, 0.        , 0.03755522, ..., 1.        , 0.03183917,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.03183917, 1.        ,
        0.        ],
       [0.03025197, 0.01232843, 0.01906178, ..., 0.        , 0.        ,
        1.        ]])

In [15]:
df_preprocessed = pd.read_csv("/content/drive/MyDrive/tmdb_preprocessed_data.csv")

In [16]:
pip install gradio



In [17]:
def get_recommendations(title, df, cosine_sim):
    """Return the titles of the (up to) 10 movies most similar to ``title``.

    Args:
        title: movie title to look up; matching is case-insensitive and the
            first matching row is used.
        df: DataFrame with a ``title`` column. Its row order must match the
            row/column order of ``cosine_sim``; the index labels are
            irrelevant.
        cosine_sim: square similarity matrix, indexable as
            ``cosine_sim[row][col]``.

    Returns:
        A Series with the recommended titles (best match first), or a
        "not found" message string when ``title`` is not in ``df``.

    Raises:
        ValueError: if ``df`` has no ``title`` column.
    """
    if 'title' not in df.columns:
        raise ValueError("DataFrame must contain 'title' column")

    # Case-insensitive match, computed once for both the check and the lookup.
    title_lower = title.lower()
    matches = (df['title'].str.lower() == title_lower).tolist()
    if True not in matches:
        return f"Movie '{title}' not found in the dataset."

    # Use the row *position*, not the index label: cosine_sim is addressed by
    # position, so a label would pick the wrong row (or fail) whenever df does
    # not carry the default 0..n-1 index.
    row = matches.index(True)
    scores = cosine_sim[row]

    # Rank every other movie by similarity. The queried movie is excluded
    # explicitly rather than by dropping the first sorted entry, which is only
    # correct when no earlier row ties with its self-similarity.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != row),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    # Return the titles of the top 10 most similar movies
    return df['title'].iloc[ranked[:10]]

# Example usage
recommendations = get_recommendations('Fight Club', df_preprocessed, cosine_sim)
print("Recommendations for 'Fight Club':")
print(recommendations)

Recommendations for 'Fight Club':
916     Me and You and Everyone We Know
1896                         Angel Baby
4101                          Interview
4196                   The Fashionistas
450                      The Experiment
3059                    Finders Keepers
2630                      The Machinist
1610                              Kafka
3814                     Running Scared
365                            The Hole
Name: title, dtype: object


In [18]:
pip install gradio



In [19]:
import gradio as gr

# Function to get movie recommendations
def get_recommendations(title, df, cosine_sim):
    """Return the titles of the (up to) 10 movies most similar to ``title``.

    Args:
        title: movie title to look up; matching is case-insensitive and the
            first matching row is used.
        df: DataFrame with a ``title`` column. Its row order must match the
            row/column order of ``cosine_sim``; the index labels are
            irrelevant.
        cosine_sim: square similarity matrix, indexable as
            ``cosine_sim[row][col]``.

    Returns:
        A list with the recommended titles (best match first), or a
        "not found" message string when ``title`` is not in ``df``.

    Raises:
        ValueError: if ``df`` has no ``title`` column.
    """
    if 'title' not in df.columns:
        raise ValueError("DataFrame must contain 'title' column")

    # Case-insensitive match, computed once for both the check and the lookup.
    title_lower = title.lower()
    matches = (df['title'].str.lower() == title_lower).tolist()
    if True not in matches:
        return f"Movie '{title}' not found in the dataset."

    # Use the row *position*, not the index label: cosine_sim is addressed by
    # position, so a label would pick the wrong row (or fail) whenever df does
    # not carry the default 0..n-1 index.
    row = matches.index(True)
    scores = cosine_sim[row]

    # Exclude the queried movie explicitly; dropping the first sorted entry is
    # only correct when no earlier row ties with its self-similarity.
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != row),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    return df['title'].iloc[ranked[:10]].tolist()

# df_preprocessed (the preprocessed DataFrame) and cosine_sim (the similarity matrix) must already be loaded before this point.
# Example, using a hypothetical helper of your own: df_preprocessed, cosine_sim = load_data()

# Gradio interface
def recommend_movie_ui(title):
    """Gradio callback: look up recommendations for the entered title.

    Delegates to get_recommendations using the module-level df_preprocessed
    and cosine_sim, and returns its result unchanged.
    """
    return get_recommendations(title, df_preprocessed, cosine_sim)

# Create the interface with Gradio: a single text input (the movie title) is
# passed to recommend_movie_ui and its return value is shown as text.
interface = gr.Interface(
    fn=recommend_movie_ui,
    inputs="text",
    outputs="text",
    title="Movie Recommendation System",
    description="Enter a movie title to get recommendations for similar movies."
)

# Launch the interface. In the recorded Colab run, Gradio reported enabling
# sharing automatically and serving the app on a temporary public URL.
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bb10ca6f9c51fd763d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


