Importing the dependencies

In [None]:
# 1. Import dependencies

import numpy as np
import pandas as pd
import difflib
import os
import time
import requests
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import Image, display
from PIL import Image as PILImage
import matplotlib.pyplot as plt


**Explanation:**

- numpy / pandas: Data manipulation

- difflib: For fuzzy string matching (find closest movie names)

- os, Path: File system handling

- requests, time: Fetch posters from TMDB API

- sklearn: Build similarity-based recommendation system

- IPython.display, PIL, matplotlib: Display posters inline in Colab

Data Collection and Pre-Processing

In [None]:
# Load the movie metadata CSV into a pandas DataFrame.
# NOTE(review): '/content/movies.csv' is an absolute path (presumably the Colab
# upload location) — confirm it exists before running this notebook elsewhere.

movies_data = pd.read_csv('/content/movies.csv')

In [None]:
# Preview the first 5 rows to sanity-check the loaded columns.
movies_data.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.

movies_data.shape

In [None]:
# 3. Clean cast names

def split_cast_names(text):
    """
    Convert a space-separated cast string into comma-separated full names.

    Example: "Sam Worthington Zoe Saldana" -> "Sam Worthington, Zoe Saldana"

    Heuristic: a capitalised word starts a new name once the current name
    already holds at least two words, unless the previous word is a surname
    particle ("de", "van", ...). Names that need more than two words without
    such a particle are therefore split after their second word.

    Commas already present in ``text`` are treated as plain separators, so
    passing the function its own output returns that output unchanged. This
    keeps the in-place overwrite of the ``cast`` column safe to re-run.

    Args:
        text: Raw cast string. Anything that is not a non-blank ``str``
            (for example NaN) yields "".

    Returns:
        The detected names joined with ", ", or "" for missing/blank input.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # Strip commas before tokenising: output of an earlier pass must split into
    # exactly the same words, otherwise a re-run would emit ",," separators.
    words = text.replace(",", " ").split()

    # Common particles that should stay attached to the last name
    particles = {"de", "da", "del", "la", "van", "von", "di", "le", "du", "st.", "st"}

    names = []
    current = []

    for i, word in enumerate(words):
        # len(current) >= 2 is False for the very first word, so words[i - 1]
        # is never evaluated with i == 0.
        start_new_name = (
            word[0].isupper()
            and len(current) >= 2
            and words[i - 1].lower() not in particles
        )
        if start_new_name:
            names.append(" ".join(current))
            current = [word]
        else:
            current.append(word)

    if current:
        names.append(" ".join(current))

    return ", ".join(names)

# Replace the raw cast column in place with the comma-separated version
# produced by split_cast_names (non-string cells such as NaN become "").
movies_data["cast"] = movies_data["cast"].apply(split_cast_names)


In [None]:
# Inspect the first few cast strings after the name splitting above.
movies_data['cast'].head()

In [None]:
# 4. Normalize text for searching

def normalize_text(text):
    """Return ``text`` lower-cased and stripped; any non-string value becomes ""."""
    return text.lower().strip() if isinstance(text, str) else ""

# Normalise the searchable text columns in place; non-string cells (e.g. NaN)
# come back from normalize_text as "".
for col in ("cast", "genres", "director", "keywords"):
    movies_data[col] = movies_data[col].map(normalize_text)


In [None]:
# 5. Fetch & cache posters locally

# The TMDB key is read from the environment so that no credential is stored in
# the notebook. Export TMDB_API_KEY in the environment that launches the kernel.
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be rotated in the TMDB account settings.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "")
if not TMDB_API_KEY:
    print("TMDB_API_KEY is not set; posters cannot be downloaded from TMDB.")

# Base URL for 500px-wide poster images; the API's poster_path is appended to it.
TMDB_POSTER_BASE = "https://image.tmdb.org/t/p/w500"

# Local cache directory (relative to the working directory), created on demand.
POSTER_DIR = "posters"
os.makedirs(POSTER_DIR, exist_ok=True)

def cache_poster(movie_id):
    """
    Download a movie's poster from TMDB once and cache it on disk.

    The movie's ``poster_path`` is looked up through the TMDB movie endpoint,
    the image is downloaded from ``TMDB_POSTER_BASE`` and stored as
    ``<POSTER_DIR>/<movie_id>.jpg``. A file that already exists is reused
    without any network request.

    Args:
        movie_id: TMDB movie id; must be convertible with ``int()``.

    Returns:
        The local poster path as a string, or None when no poster could be
        obtained (missing API key, non-200 response, movie without a poster,
        empty image body, or any error raised along the way).
    """
    poster_file = Path(POSTER_DIR) / f"{movie_id}.jpg"

    if poster_file.exists():
        return str(poster_file)

    # Without a key every request would be rejected; skip the network round trip.
    if not TMDB_API_KEY:
        return None

    try:
        url = f"https://api.themoviedb.org/3/movie/{int(movie_id)}"
        params = {"api_key": TMDB_API_KEY}
        r = requests.get(url, params=params, timeout=5)
        if r.status_code != 200:
            return None

        poster_path = r.json().get("poster_path")
        if not poster_path:
            return None

        poster_url = TMDB_POSTER_BASE + poster_path
        img_response = requests.get(poster_url, timeout=5)
        # Only cache real image bytes: an error page or empty body saved as
        # .jpg would be treated as a valid cached poster on every later run.
        if img_response.status_code != 200 or not img_response.content:
            return None

        # Write to a temporary name and rename, so an interrupted write never
        # leaves a truncated file under the final cache name.
        tmp_file = poster_file.with_name(poster_file.name + ".part")
        tmp_file.write_bytes(img_response.content)
        tmp_file.replace(poster_file)

        return str(poster_file)
    except Exception:
        # Deliberate best-effort: one bad id, timeout or disk error must not
        # abort caching for the remaining movies.
        return None

# Download/cache a poster for every movie id and store the local path per row
# (None where cache_poster could not obtain one). Posters already on disk are
# returned by cache_poster without a new request, so re-running this cell only
# hits the network for movies that still have no cached file.
movies_data["poster_local"] = movies_data["id"].apply(cache_poster)


In [None]:
# 6. Build the TF-IDF similarity matrix

# Text columns whose concatenation describes each movie for vectorisation.
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']

# Replace missing values with empty strings before the columns are concatenated.
movies_data[selected_features] = movies_data[selected_features].fillna('')

# Join the columns left to right, separated by single spaces: one string per movie.
combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
    combined_features = combined_features + ' ' + movies_data[feature]

# TF-IDF vector per movie, then the movie-by-movie cosine similarity matrix.
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)
similarity = cosine_similarity(feature_vectors)


In [None]:
# 7. Recommendation function

def recommend_movies(movie_name, df, similarity_matrix, top_n=12):
    """
    Return the ``top_n`` movies most similar to the title closest to ``movie_name``.

    The query is fuzzy-matched (case-insensitively) against ``df['title']`` with
    ``difflib.get_close_matches``. Rows are addressed by position throughout, so
    row ``i`` of ``df`` must correspond to row/column ``i`` of
    ``similarity_matrix``; the index labels of ``df`` are irrelevant.

    Args:
        movie_name: Title typed by the user.
        df: Movie DataFrame with a ``title`` column.
        similarity_matrix: Square matrix of pairwise similarity scores.
        top_n: Maximum number of recommendations; values <= 0 give no rows.

    Returns:
        A DataFrame with the recommended rows of ``df`` ordered by decreasing
        similarity, or an empty DataFrame when no title matches (in which case
        "Movie not found!" is printed) or nothing can be recommended.
    """
    query = movie_name.lower()

    # Map each lower-cased title to the position of its first row. Non-string
    # titles (e.g. NaN) are skipped because difflib cannot compare them.
    position_by_title = {}
    for position, title in enumerate(df['title'].str.lower().tolist()):
        if isinstance(title, str):
            position_by_title.setdefault(title, position)

    close_match = difflib.get_close_matches(query, list(position_by_title), n=1)
    if not close_match:
        print("Movie not found!")
        return pd.DataFrame()

    index_of_movie = position_by_title[close_match[0]]

    # Stable descending sort: equal scores keep their original row order.
    scores = similarity_matrix[index_of_movie]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried movie by position rather than assuming it ranks first;
    # ties or an all-zero feature vector can put another movie at rank 0.
    picks = [idx for idx, _ in ranked if idx != index_of_movie][:max(top_n, 0)]

    if not picks:
        return pd.DataFrame()
    return df.iloc[picks]


In [None]:
# 8. Show movies with posters (Netflix style)

from IPython.display import display, Image

def show_movies_with_posters(df):
    """
    Print a short text card for every movie in ``df`` and show its cached poster.

    For each row this prints one summary line (title, genres, vote average,
    runtime), displays the poster inline when ``poster_local`` is a path to an
    existing file, then prints the overview and an 80-character separator.

    Args:
        df: DataFrame with ``title``, ``genres``, ``vote_average``, ``runtime``
            and ``overview`` columns; ``poster_local`` is optional.
    """
    for _, row in df.iterrows():
        # Emoji are written as escapes (clapper board, star, stopwatch) so the
        # source stays intact when the file is re-encoded.
        print(f"\U0001F3AC {row['title']} | {row['genres']} | \u2B50 {row['vote_average']} | \u23F1 {row['runtime']} min")

        # Skip the image when there is no cached path or the file has since
        # been removed; the text card is still printed.
        poster = row.get("poster_local")
        if isinstance(poster, str) and os.path.isfile(poster):
            display(Image(filename=poster, width=200))

        print(row["overview"])
        print("-"*80)
