<a href="https://colab.research.google.com/github/Syam-2023/ADDA-Lab/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎬 Movie Recommendation System with Posters

This notebook builds a content-based movie recommendation system from the TMDB 5000 Movie Dataset and uses the TMDB API to fetch a poster for each recommended movie.

In [1]:
# Install necessary libraries if not already installed
!pip install -q requests

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import ast
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Image, display

In [7]:
# Upload your CSV files manually in Colab
from google.colab import files
uploaded = files.upload()

Saving tmdb_5000_credits.csv to tmdb_5000_credits.csv
Saving tmdb_5000_movies.csv to tmdb_5000_movies.csv


In [33]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [32]:
movies.shape

(4806, 8)

In [8]:
# Load the two TMDB CSV files uploaded in the previous step
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge on title
# NOTE(review): titles are not guaranteed to be unique, so a title-only join can
# pair a movie with the credits of a different movie that shares its title.
# Confirm whether joining on the movie id as well is possible for this dataset.
movies = movies.merge(credits, on='title')

# Keep only the columns used to build the tags and drop incomplete rows.
# The index is reset afterwards so that row labels equal row positions: the
# similarity matrix built from this frame is positional, and dropna() would
# otherwise leave gaps that make label-based lookups point at the wrong row.
movies = (
    movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

In [9]:
# Helper functions for preprocessing
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is parsed with ast.literal_eval and each parsed entry is indexed with
    ['name'], so malformed input raises instead of being skipped.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

def get_director(obj):
    """Return the name of the first crew entry whose job is 'Director'.

    `obj` is a stringified list of dicts parsed with ast.literal_eval.
    Returns '' when no entry has that job.
    """
    crew = ast.literal_eval(obj)
    return next((member['name'] for member in crew if member['job'] == 'Director'), '')

def get_top_3_cast(obj):
    """Return the names of the first three entries of a stringified cast list.

    `obj` is parsed with ast.literal_eval; fewer than three entries yields a
    shorter list.
    """
    names = []
    for position, member in enumerate(ast.literal_eval(obj)):
        if position == 3:
            # Only the first three entries are ever read.
            break
        names.append(member['name'])
    return names

In [14]:
# Apply preprocessing
# Modify the convert function to handle lists directly and strings robustly
def convert(obj):
    """Extract the 'name' of every dict entry from a list or a stringified list.

    Strings are parsed with ast.literal_eval first. Anything that is not a list
    (or does not parse into one) gives []; entries that are not dicts, or that
    lack a 'name' key, are skipped.
    """
    if isinstance(obj, str):
        try:
            obj = ast.literal_eval(obj)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal: treat as "no names".
            return []
    if not isinstance(obj, list):
        return []
    return [entry['name'] for entry in obj if isinstance(entry, dict) and 'name' in entry]

def get_director(obj):
    """Return the name of the first entry whose 'job' is 'Director', else ''.

    Accepts a list of dicts or its string representation (parsed with
    ast.literal_eval). Unparseable strings, non-list values and lists without
    a director all give ''. A director entry without a 'name' key gives ''.
    """
    if isinstance(obj, str):
        try:
            obj = ast.literal_eval(obj)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal: no director can be extracted.
            return ''
    if not isinstance(obj, list):
        return ''
    director_names = (
        member.get('name', '')
        for member in obj
        if isinstance(member, dict) and member.get('job') == 'Director'
    )
    return next(director_names, '')


def get_top_3_cast(obj):
    """Return the names of the first three valid cast entries.

    Accepts a list of dicts or its string representation (parsed with
    ast.literal_eval). Entries that are not dicts or have no 'name' key are
    skipped and do not count towards the three. Unparseable strings and
    non-list values give [].
    """
    if isinstance(obj, str):
        try:
            obj = ast.literal_eval(obj)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal: no cast can be extracted.
            return []

    names = []
    if isinstance(obj, list):
        for member in obj:
            if isinstance(member, dict) and 'name' in member:
                names.append(member['name'])
                if len(names) == 3:
                    break
    return names


# Build the features in a separate frame instead of overwriting `movies`.
# The helpers only keep dict entries, so running them a second time on columns
# that already hold plain lists of names would return empty results; leaving
# `movies` untouched makes this cell safe to re-run.
movie_features = movies.copy()
movie_features['genres'] = movies['genres'].apply(convert)
movie_features['keywords'] = movies['keywords'].apply(convert)
movie_features['cast'] = movies['cast'].apply(get_top_3_cast)
# Director name wrapped in a list; an empty or non-string result becomes [].
movie_features['crew'] = movies['crew'].apply(get_director).apply(
    lambda name: [name] if isinstance(name, str) and name else []
)

# Concatenate the overview (as string) and the string representation of the lists
movie_features['tags'] = (
    movie_features['overview'].astype(str)
    + " " + movie_features['genres'].astype(str)
    + " " + movie_features['keywords'].astype(str)
    + " " + movie_features['cast'].astype(str)
    + " " + movie_features['crew'].astype(str)
)

# Convert the entire 'tags' string to lowercase
movie_features['tags'] = movie_features['tags'].apply(lambda x: x.lower())

new_df = movie_features[['movie_id', 'title', 'tags']]

In [16]:
# Vectorization: bag-of-words counts over the tags, capped at 5000 features,
# with scikit-learn's built-in English stop-word list removed.
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(new_df['tags']).toarray()

# Similarity matrix: cosine similarity between every pair of rows in `vectors`,
# so similarity[i][j] compares the i-th and j-th rows of new_df (by position).
similarity = cosine_similarity(vectors)

In [26]:
# 🔑 Enter your TMDB API key below:
API_KEY = 'your_tmdb_api_key_here'  # Replace with your TMDB API key

# Function to fetch movie poster
def fetch_poster(movie_id, api_key=None, session=None):
    """Return the w500 poster URL for a TMDB movie, or "" when unavailable.

    Args:
        movie_id: TMDB id of the movie whose poster is wanted.
        api_key: TMDB API key; falls back to the module-level API_KEY when None.
        session: optional object with a requests-compatible
            ``get(url, params=..., timeout=...)`` method (for example a
            ``requests.Session``). Defaults to the ``requests`` module.

    Returns:
        The full image URL, or "" if the request fails, the response status is
        not 200, the body is not valid JSON, or the movie has no poster_path.
    """
    key = API_KEY if api_key is None else api_key
    client = requests if session is None else session
    try:
        response = client.get(
            f"https://api.themoviedb.org/3/movie/{movie_id}",
            # Sent as query parameters so the key is never baked into source text.
            params={'api_key': key, 'language': 'en-US'},
            timeout=10,
        )
        if response.status_code != 200:
            return ""
        poster_path = response.json().get('poster_path')
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not JSON: treat as "no poster".
        return ""

    # poster_path can be absent or null; return "" so callers' truthiness
    # checks do not mistake the bare image base URL for a real poster.
    if not poster_path:
        return ""
    return "https://image.tmdb.org/t/p/w500" + poster_path

In [15]:
# Recommendation function
def recommend(movie, top_n=5):
    """Return titles and poster URLs of the movies most similar to `movie`.

    Args:
        movie: movie title; matched case-insensitively against new_df['title'].
        top_n: number of recommendations to return (default 5).

    Returns:
        (titles, poster_urls) as two parallel lists, or ([], []) when the
        title is not found. Poster URLs come from fetch_poster.
    """
    wanted = movie.lower()
    titles = new_df['title'].str.lower().tolist()
    if wanted not in titles:
        return [], []

    # Use the row *position*, not the index label: `similarity` is indexed by
    # position, and the two differ whenever new_df's index has gaps.
    pos = titles.index(wanted)
    ranked = sorted(enumerate(similarity[pos]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie explicitly instead of assuming it sorts first
    # (another row with an identical tag vector could tie with it).
    neighbours = [i for i, _ in ranked if i != pos][:top_n]

    recommended_movies = []
    recommended_posters = []
    for i in neighbours:
        row = new_df.iloc[i]
        recommended_movies.append(row['title'])
        recommended_posters.append(fetch_poster(row['movie_id']))

    return recommended_movies, recommended_posters

In [28]:
# Display recommendations with posters
def show_recommendations(movie_name):
    """Print a numbered list of recommendations for `movie_name` with posters.

    Prints "Movie not found." when recommend() returns no titles. For each
    recommendation the poster is displayed inline when its URL is non-empty,
    otherwise "Poster not available." is printed.
    """
    titles, poster_urls = recommend(movie_name)
    if not titles:
        print("Movie not found.")
        return

    for rank, title in enumerate(titles, start=1):
        print(f"{rank}. {title}")
        poster_url = poster_urls[rank - 1]
        if not poster_url:
            # An empty string means no poster URL was returned for this movie.
            print("Poster not available.")
            continue
        display(Image(poster_url))

In [31]:
# 🎯 Test it
show_recommendations('Dune')

1. Stealth
Poster not available.
2. Enemy at the Gates
Poster not available.
3. Project Almanac
Poster not available.
4. 40 Days and 40 Nights
Poster not available.
5. Superman II
Poster not available.


In [34]:
! pip install streamlit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m104.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [44]:
!wget -q -O - ipv4.icanhazip.com

34.81.30.8


In [45]:
! streamlit run app.py & npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.81.30.8:8501[0m
[0m
[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0Kyour url is: https://free-apes-lie.loca.lt
2025-07-07 15:02:44.848 The `use_column_width` parameter has been deprecated and will be removed in a future release. Please utilize the `use_container_width` parameter instead.
2025-07-07 15:02:45.170 The `use_column_width` parameter has been deprecated and will be removed in a future release. Please utilize the `use_container_width` parameter instead.
2025-07-07 15:02:45.382 The `use_column_width` parameter has been deprecated and will be removed in a future rele

In [41]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import ast
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import io

# Helper functions for preprocessing (corrected)
def convert(obj):
    """Collect the 'name' values from a list of dicts or its string form.

    A string is first parsed with ast.literal_eval. The result is [] when the
    string cannot be parsed or when the value is not a list. Entries that are
    not dicts or that have no 'name' key are ignored.
    """
    parsed = obj
    if isinstance(parsed, str):
        try:
            parsed = ast.literal_eval(parsed)
        except (ValueError, SyntaxError, TypeError):
            return []  # not a valid Python literal
    if isinstance(parsed, list):
        return [item['name'] for item in parsed if isinstance(item, dict) and 'name' in item]
    return []

def get_director(obj):
    """Return the first 'Director' entry's name from a crew list, else ''.

    Accepts a list of dicts or its string form (parsed with ast.literal_eval).
    Returns '' for unparseable strings, non-list values, lists without a
    director, and director entries that lack a 'name' key.
    """
    crew = obj
    if isinstance(crew, str):
        try:
            crew = ast.literal_eval(crew)
        except (ValueError, SyntaxError, TypeError):
            return ''  # not a valid Python literal
    if isinstance(crew, list):
        for person in crew:
            if not isinstance(person, dict):
                continue
            if person.get('job') == 'Director':
                return person.get('name', '')
    return ''

def get_top_3_cast(obj):
    """Return up to three cast names from a list of dicts or its string form.

    Strings are parsed with ast.literal_eval. Only dict entries with a 'name'
    key count towards the three; everything else is skipped. Unparseable
    strings and non-list values give [].
    """
    cast = obj
    if isinstance(cast, str):
        try:
            cast = ast.literal_eval(cast)
        except (ValueError, SyntaxError, TypeError):
            return []  # not a valid Python literal
    if not isinstance(cast, list):
        return []

    top_names = []
    for person in cast:
        if len(top_names) == 3:
            break
        if isinstance(person, dict) and 'name' in person:
            top_names.append(person['name'])
    return top_names

# Function to fetch movie poster
def fetch_poster(movie_id, api_key, session=None):
    """Return the w500 poster URL for a TMDB movie, or "" when unavailable.

    Args:
        movie_id: TMDB id of the movie whose poster is wanted.
        api_key: TMDB API key used for the request.
        session: optional object with a requests-compatible
            ``get(url, params=..., timeout=...)`` method (for example a
            ``requests.Session``). Defaults to the ``requests`` module.

    Returns:
        The full image URL, or "" if the request fails, the response status is
        not 200, the body is not valid JSON, or the movie has no poster_path.
    """
    client = requests if session is None else session
    try:
        response = client.get(
            f"https://api.themoviedb.org/3/movie/{movie_id}",
            params={'api_key': api_key, 'language': 'en-US'},
            timeout=10,
        )
        if response.status_code != 200:
            return ""
        poster_path = response.json().get('poster_path')
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not JSON: treat as "no poster".
        return ""

    # poster_path can be absent or null; return "" so the caller's truthiness
    # check does not try to download the bare image base URL.
    if not poster_path:
        return ""
    return "https://image.tmdb.org/t/p/w500" + poster_path

# Load data (using st.cache_data for efficiency)
@st.cache_data
def load_data():
    """Read the movies and credits CSV files and return them merged on 'title'.

    Both files are read from the current working directory.
    """
    movie_frame = pd.read_csv('tmdb_5000_movies.csv')
    credit_frame = pd.read_csv('tmdb_5000_credits.csv')
    return movie_frame.merge(credit_frame, on='title')

# Prepare model (using st.cache_data for efficiency)
@st.cache_data
def prepare_model(movies):
    """Build the tag corpus and the movie-to-movie cosine similarity matrix.

    Args:
        movies: merged movies/credits frame containing at least the columns
            movie_id, title, overview, genres, keywords, cast and crew.

    Returns:
        (new_df, similarity): new_df has columns movie_id, title and tags with
        a 0..n-1 index; similarity is the cosine similarity between the
        bag-of-words vectors of the tags, where row/column i corresponds to
        row i of new_df.
    """
    # Reset the index after dropna() so labels equal positions. Without this,
    # dropped rows leave gaps and a label taken from new_df.index no longer
    # matches the corresponding row of the positional similarity matrix.
    movies = (
        movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
        .dropna()
        .reset_index(drop=True)
    )

    movies['genres'] = movies['genres'].apply(convert)
    movies['keywords'] = movies['keywords'].apply(convert)
    movies['cast'] = movies['cast'].apply(get_top_3_cast)
    movies['crew'] = movies['crew'].apply(get_director)
    movies['crew'] = movies['crew'].apply(lambda x: [x] if isinstance(x, str) and x else []) # Ensure crew is a list, handle empty string

    # Concatenate the overview (as string) and the string representation of the lists
    movies['tags'] = movies['overview'].astype(str) + " " + movies['genres'].astype(str) + " " + movies['keywords'].astype(str) + " " + movies['cast'].astype(str) + " " + movies['crew'].astype(str)

    # Convert the entire 'tags' string to lowercase
    movies['tags'] = movies['tags'].apply(lambda x: x.lower())

    new_df = movies[['movie_id', 'title', 'tags']]

    # Bag-of-words counts capped at 5000 features, English stop words removed.
    cv = CountVectorizer(max_features=5000, stop_words='english')
    vectors = cv.fit_transform(new_df['tags']).toarray()
    similarity = cosine_similarity(vectors)

    return new_df, similarity

# Recommendation function
def recommend(movie, new_df, similarity, top_n=5):
    """Return titles and poster URLs of the movies most similar to `movie`.

    Args:
        movie: movie title; matched case-insensitively against new_df['title'].
        new_df: frame with movie_id and title columns, row-aligned with
            `similarity`.
        similarity: square matrix where similarity[i][j] compares rows i and j
            of new_df by position.
        top_n: number of recommendations to return (default 5).

    Returns:
        (titles, poster_urls) as two parallel lists, or ([], []) when the
        title is not found. Posters are fetched with the API key stored in
        st.session_state.api_key.
    """
    wanted = movie.lower()
    titles = new_df['title'].str.lower().tolist()
    if wanted not in titles:
        return [], []

    # Use the row *position*, not the index label: `similarity` is indexed by
    # position, and the two differ whenever new_df's index has gaps.
    pos = titles.index(wanted)
    ranked = sorted(enumerate(similarity[pos]), key=lambda pair: pair[1], reverse=True)
    # Drop the query movie explicitly instead of assuming it sorts first
    # (another row with an identical tag vector could tie with it).
    neighbours = [i for i, _ in ranked if i != pos][:top_n]

    recommended_movies = []
    recommended_posters = []
    for i in neighbours:
        row = new_df.iloc[i]
        recommended_movies.append(row['title'])
        # Pass the API key to the fetch_poster function
        recommended_posters.append(fetch_poster(row['movie_id'], st.session_state.api_key))

    return recommended_movies, recommended_posters

# Streamlit App
st.title('🎬 Movie Recommendation System')

# API Key Input (kept in session state so it survives Streamlit reruns)
api_key = st.text_input("🔑 Enter your TMDB API Key", type="password")
if api_key:
    st.session_state.api_key = api_key

# Load data and prepare model only if API key is provided
if 'api_key' in st.session_state and st.session_state.api_key:
    movies = load_data()
    new_df, similarity = prepare_model(movies)
    movie_list = new_df['title'].values
    selected_movie = st.selectbox("Select a movie", movie_list)

    if st.button('Show Recommendations'):
        names, posters = recommend(selected_movie, new_df, similarity)
        if names:
            st.subheader("Recommended Movies:")
            for rank, (name, poster_url) in enumerate(zip(names, posters), start=1):
                st.write(f"{rank}. {name}")
                if poster_url:
                    try:
                        # Download the image with a timeout and fail on HTTP errors,
                        # so a bad response is reported as such instead of as an
                        # image-decoding error from PIL.
                        poster_response = requests.get(poster_url, timeout=10)
                        poster_response.raise_for_status()
                        img = Image.open(io.BytesIO(poster_response.content))
                        # use_column_width is deprecated; Streamlit's warning names
                        # use_container_width as its replacement.
                        st.image(img, caption=name, use_container_width=True)
                    except Exception as e:
                        st.write(f"Could not display poster for {name}: {e}")
                else:
                    st.write("Poster not available.")
        else:
            st.write("Could not find recommendations for this movie.")
else:
    st.warning("Please enter your TMDB API Key to get recommendations.")

Overwriting app.py
