## Required Libraries and Dependancies

In [40]:
# We try to import first; if something is missing we install it with %pip

missing = []

def _try_import(name, pip_name = None):
    try:
        __import__(name)
    except ModuleNotFoundError:
        missing.append(pip_name or name)

_try_import("pandas")
_try_import("numpy")
_try_import("sklearn", "scikit-learn")
_try_import("requests")
_try_import("gradio")

if missing:
    print("Installing:", missing)
    from IPython import get_ipython
    get_ipython().run_line_magic("pip", "install package_name")


import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import difflib
import gradio as gr

pd.set_option("display.max_colwidth", 120)
print("✅ Imports ready")

✅ Imports ready


### Put your TMDb credentials here (or use the tiny sample)

In [41]:
# Preferred: TMDb v4 Read Access Token (Bearer)
TMDB_BEARER = ""  # e.g., "eyJhbGciOiJIUzI1NiJ9..."

# Alternative: TMDb v3 API key
TMDB_API_KEY = "755649622c39b56880c6714546a702e5"  # e.g., "1234567890abcdef..."

# If you don't have a key yet, set this to True to use a tiny built-in sample so you can still run the notebook
USE_SAMPLE_IF_NO_KEY = True

TMDB_API_URL = "https://api.themoviedb.org/3"

def tmdb_headers_params():
    """
    Build (headers, params) for TMDb requests:
    - If TMDB_BEARER is set, send 'Authorization: Bearer …'
    - Else if TMDB_API_KEY is set, send ?api_key=…
    - Else, if USE_SAMPLE_IF_NO_KEY=True, we will fall back to a tiny sample dataset.
    """
    headers = {}
    params = {}
    if TMDB_BEARER.strip():
        headers["Authorization"] = f"Bearer {TMDB_BEARER.strip()}"
    elif TMDB_API_KEY.strip():
        params["api_key"] = TMDB_API_KEY.strip()
    else:
        if USE_SAMPLE_IF_NO_KEY:
            print("⚠️ No TMDb credentials found. Will use the built-in sample dataset instead.")
        else:
            raise RuntimeError("Please provide TMDB_BEARER or TMDB_API_KEY above.")
    return headers, params


### Fetch genres + movies from TMDb (discover)

In [42]:
def fetch_tmdb_genres():
    """Fetch a dict {genre_id: genre_name} so we can turn IDs into names."""
    headers, params = tmdb_headers_params()
    url = f"{TMDB_API_URL}/genre/movie/list"
    r = requests.get(url, headers = headers, params = params, timeout = 20)
    r.raise_for_status()
    data = r.json()
    return{g["id"]: g["name"] for g in data.get("genres", [])}


def fetch_tmdb_movies(pages: int = 20):
    """
    Pull ~pages*20 popular movies via /discover/movie and return a DataFrame with:
    title | genres (pipe-separated) | overview | year
    """
    headers, params = tmdb_headers_params()
    genre_map = fetch_tmdb_genres()

    rows = []
    for page in range(1, pages+1):
        url = f"{TMDB_API_URL}/discover/movie"
        q = dict(params)
        q.update({
            "sort_by": "popularity.desc",
            "page": page,
            "include_adult": "false",
            "language": "en-US",
        })
        r = requests.get(url, headers = headers, params = q, timeout = 20)
        r.raise_for_status()
        payload = r.json()
        for i in payload.get("results", []):
            title = i.get("title") or i.get("original_title") or ""
            overview = i.get("overview") or ""
            genre_ids = i.get("genre_ids") or []
            gnames = [genre_map.get(gid, str(gid)) for gid in genre_ids]
            genres = "|".join([g for g in gnames if g])
            rd = (i.get("release_date") or "")
            year = int(rd[:4]) if len(rd) >= 4 and rd[:4].isdigit() else None
            rows.append({"title": title, "genres": genres, "overview": overview, "year": year})

    df = pd.DataFrame(rows).dropna(subset = ["title"]).drop_duplicates(subset = ["title"]).reset_index(drop = True)
    return df

### Tiny built-in sample (only if no key)

In [43]:
SAMPLE_CSV = """title,genres,overview,year
The Matrix,Action|Sci-Fi,A hacker discovers reality is a simulation and joins a rebellion.,1999
Inception,Action|Sci-Fi|Thriller,A thief enters people\'s dreams to steal corporate secrets.,2010
Interstellar,Adventure|Drama|Sci-Fi,Explorers travel through a wormhole to save humanity.,2014
The Dark Knight,Action|Crime|Drama,Batman faces the Joker\'s chaos in Gotham.,2008
Titanic,Drama|Romance,A love story unfolds aboard the ill-fated RMS Titanic.,1997
The Godfather,Crime|Drama,The aging patriarch of an organized crime dynasty transfers control to his reluctant son.,1972
Pulp Fiction,Crime|Drama,Interwoven stories of crime and redemption in Los Angeles.,1994
Forrest Gump,Drama|Romance,The life journey of a gentle man who witnesses key moments in history.,1994
The Shawshank Redemption,Drama,Two imprisoned men bond over years, finding solace and eventual freedom.,1994
Arrival,Drama|Mystery|Sci-Fi,A linguist communicates with aliens to understand their purpose.,2016
"""


def load_dataset(pages = 20):
    """
    Decide whether to load from TMDb or the sample.
    """
    has_creds = bool(TMDB_BEARER.strip() or TMDB_API_KEY.strip())
    if has_creds:
        df = fetch_tmdb_movies(pages=pages)
        if df.empty:
            raise RuntimeError("TMDb returned no results — try increasing 'pages' or check your credentials.")
        print(f"✅ Loaded {len(df)} movies from TMDb")
        return df
    else:
        if USE_SAMPLE_IF_NO_KEY:
            from io import StringIO
            df = pd.read_csv(StringIO(SAMPLE_CSV))
            print(f"✅ Using sample dataset with {len(df)} movies")
            return df
        else:
            raise RuntimeError("No credentials and sample fallback disabled. Please set TMDB_BEARER or TMDB_API_KEY.")

### Build features + recommender (TF-IDF + cosine NN)

In [46]:
def combine_text_fields(df: pd.DataFrame) -> pd.Series:
    """
    Combine title + genres + year + overview into one text field.
    WHY:
      - A simple "bag of words" works well for content similarity.
      - We replace '|' with spaces so genre tokens are seen individually.
      - Lowercasing makes tokenization consistent.
    """
    genres = df.get("genres", pd.Series([""] * len(df))).fillna("").str.replace("|", " ", regex=False)
    title = df.get("title", pd.Series([""] * len(df))).fillna("")
    year = df.get("year", pd.Series([""] * len(df))).astype(str).replace("nan", "")
    overview = df.get("overview", pd.Series([""] * len(df))).fillna("")
    return (title + " " + genres + " " + year + " " + overview).str.lower()


class MovieRecommender:
    """
    Minimal, readable content-based recommender using TF-IDF + NearestNeighbors (cosine).
    - TF-IDF: turns text into numerical vectors; bigrams help capture short phrases.
    - NearestNeighbors (cosine): finds items with similar direction in vector space.
    """
    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=1)
        self.nn = NearestNeighbors(metric="cosine", algorithm="brute")
        self.df = None
        self.X = None
        self.title_to_idx = {}

    def fit(self, df: pd.DataFrame):
        self.df = df.copy()
        self.df["__text__"] = combine_text_fields(self.df)
        self.X = self.vectorizer.fit_transform(self.df["__text__"])
        self.nn.fit(self.X)
        # map lowercase title -> row index (for robust lookup)
        self.title_to_idx = {t.lower(): i for i, t in enumerate(self.df["title"].fillna("").tolist())}
        return self

    def _resolve_title(self, query: str, cutoff: float = 0.6):
        """
        fuzzy-match the user's title to our catalog; return (best_match, suggestions)
        WHY: helps with typos: 'The Matrxi' -> 'The Matrix'
        """
        titles = self.df["title"].fillna("").tolist()
        matches = difflib.get_close_matches(query.lower(), [t.lower() for t in titles], n=5, cutoff=cutoff)
        if not matches:
            return None, []
        original = {t.lower(): t for t in titles}
        suggestions = [original[i] for i in matches]
        return suggestions[0], suggestions

    def recommend(self, title: str, k: int = 10, genre_filter=None) -> pd.DataFrame:
        """
        Get top-k similar movies (optionally restrict to specific genres).
        """
        if self.df is None:
            raise RuntimeError("Model not fitted. Call .fit(df) first.")

        # Note: Original code had a typo here "_resolve_tile". Corrected to "_resolve_title".
        best, suggestions = self._resolve_title(title)
        if best is None:
            # return a friendly, debuggable message instead of raising
            return pd.DataFrame({"message": ["No close title matches. Suggestions:"] + suggestions})

        idx = self.title_to_idx[best.lower()]
        x = self.X[idx]

        # cosine "distance" returned by NN; similarity = 1 - distance
        distances, indices = self.nn.kneighbors(x, n_neighbors=min(k + 1, self.X.shape[0]))
        distances = distances.ravel()
        indices = indices.ravel()

        rows = []
        for d, i in zip(distances, indices):
            if i == idx:  # skip the item itself
                continue
            row = self.df.iloc[i].copy()
            row["similarity"] = 1 - float(d)
            rows.append(row)

        out = pd.DataFrame(rows)

        if genre_filter:
            def has_any(g):
                tokens = set(str(g).split("|"))
                return any(gx in tokens for gx in genre_filter)
            out = out[out["genres"].apply(has_any)]

        return out.sort_values("similarity", ascending=False).head(k).reset_index(drop=True)

### Load data + fit model

In [47]:
PAGES = 20    # ~20 movies per page
df = load_dataset(pages = PAGES)
reco = MovieRecommender().fit(df)

print("Catalog Sample:")
display(df.head(5))

✅ Loaded 395 movies from TMDb
Catalog Sample:


Unnamed: 0,title,genres,overview,year
0,War of the Worlds,Science Fiction|Thriller,Will Radford is a top analyst for Homeland Security who tracks potential threats through a mass surveillance program...,2025.0
1,F1,Action|Drama,Racing legend Sonny Hayes is coaxed out of retirement to lead a struggling Formula 1 team—and mentor a young hotshot...,2025.0
2,Superman,Science Fiction|Adventure|Action,"Superman, a journalist in Metropolis, embarks on a journey to reconcile his Kryptonian heritage with his human upbri...",2025.0
3,Mission: Impossible - The Final Reckoning,Action|Adventure|Thriller,Ethan Hunt and team continue their search for the terrifying AI known as the Entity — which has infiltrated intellig...,2025.0
4,Together,Horror|Romance,"With a move to the countryside already testing the limits of a couple's relationship, a supernatural encounter begin...",2025.0


### Quick test (intentional typo to show fuzzy match)

In [48]:
test_title = "The Matrxi"  # typo on purpose
results = reco.recommend(test_title, k=5)
results.head(10)

Unnamed: 0,title,genres,overview,year,__text__,similarity
0,Avatar,Action|Adventure|Fantasy|Science Fiction,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn bet...",2009.0,"avatar action adventure fantasy science fiction 2009.0 in the 22nd century, a paraplegic marine is dispatched to the...",0.087026
1,Titanic,Drama|Romance,"101-year-old Rose DeWitt Bukater tells the story of her life aboard the Titanic, 84 years later. A young Rose boards...",1997.0,"titanic drama romance 1997.0 101-year-old rose dewitt bukater tells the story of her life aboard the titanic, 84 yea...",0.068919
2,Mad Max: Fury Road,Action|Adventure|Science Fiction,"An apocalyptic story set in the furthest reaches of our planet, in a stark desert landscape where humanity is broken...",2015.0,mad max: fury road action adventure science fiction 2015.0 an apocalyptic story set in the furthest reaches of our p...,0.048204
3,Interstellar,Adventure|Drama|Science Fiction,The adventures of a group of explorers who make use of a newly discovered wormhole to surpass the limitations on hum...,2014.0,interstellar adventure drama science fiction 2014.0 the adventures of a group of explorers who make use of a newly d...,0.04407
4,Dracula: A Love Tale,Horror|Fantasy|Romance,The story of 15th century Prince Vladimir who curses God following the death of his beloved wife and is turned into ...,2025.0,dracula: a love tale horror fantasy romance 2025.0 the story of 15th century prince vladimir who curses god followin...,0.040412


### In-notebook web app (Gradio)

In [50]:
# Prepare genre options for the UI
all_genres = sorted({g for row in df["genres"].fillna("") for g in str(row).split("|") if g})

def ui_recommend(title, selected_genres, k):
    if not title:
        return pd.DataFrame({"message": ["Type a movie title to get recommendations."]})
    genre_filter = selected_genres if selected_genres else None
    out = reco.recommend(title, k = int(k), genre_filter = genre_filter)
    # Return a neat table
    return out[["title", "genres", "year", "similarity", "overview"]] if "message" not in out.columns else out


with gr.Blocks(title = "Movie Recommendation System") as demo:
    gr.Markdown("## 🎬 Movie Recommendation System (TMDb + TF-IDF)")
    with gr.Row():
        title_in = gr.Textbox(label="Movie title", placeholder="e.g., The Matrix")
        k_in = gr.Slider(3, 15, value=7, step=1, label="How many recommendations?")
    genres_in = gr.CheckboxGroup(all_genres, label="Restrict to genres (optional)")
    btn = gr.Button("Recommend")
    out = gr.Dataframe(headers=["title","genres","year","similarity","overview"], wrap=True)

    btn.click(fn=ui_recommend, inputs=[title_in, genres_in, k_in], outputs=out)


# Launch right inside the notebook
demo.queue().launch(inline = True, share = False, debug = False)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


