# Real-time Movie Recommender System (Python-only, free)

This notebook builds an end-to-end **Python-only** recommender using the free MovieLens 100k dataset.

Features:
- Downloads MovieLens 100k (no paid services)
- Popular baseline, SVD (Surprise) collaborative model
- Content-based TF-IDF on titles + genres
- **Real-time** user profile updating (instant recommendations without retraining): user profile = weighted average of item embeddings (from SVD or TF-IDF)
- Optional minimal FastAPI example to serve recommendations locally

Run each cell in order. The notebook is intended to run locally (internet required for the dataset download).

In [2]:
# Install required packages. Run this cell once.
import sys, subprocess

packages = [
    "pandas",
    "numpy",
    "scipy",
    "scikit-learn",
    "scikit-surprise",     # âœ… correct name instead of surprise==1.1.3
    "implicit",
    "joblib",
    "tqdm",
    "fastapi",
    "uvicorn",
    "nbformat",
    "requests"
]

for p in packages:
    name = p.split("==")[0]
    try:
        __import__(name.replace("-", "_"))  # e.g., scikit-surprise -> scikit_surprise
    except Exception:
        subprocess.check_call([sys.executable, "-m", "pip", "install", p])

print("âœ… Dependencies installed (or were already present).")


âœ… Dependencies installed (or were already present).


In [4]:
# Robust MovieLens 100k downloader and loader (works even if files are in ml-100k/ml-100k/)
import os, zipfile, pandas as pd, requests

DATA_DIR = "data/ml-100k"
os.makedirs(DATA_DIR, exist_ok=True)
zip_url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
zip_path = os.path.join(DATA_DIR, "ml-100k.zip")

def find_ml_files(base_dir):
    """Find u.data and u.item files anywhere under base_dir."""
    udata_path = None
    uitem_path = None
    for root, _, files in os.walk(base_dir):
        if "u.data" in files:
            udata_path = os.path.join(root, "u.data")
        if "u.item" in files:
            uitem_path = os.path.join(root, "u.item")
        if udata_path and uitem_path:
            break
    return udata_path, uitem_path

udata, uitem = find_ml_files(DATA_DIR)

if not (udata and uitem):
    print("Downloading MovieLens 100k (~5MB)...")
    r = requests.get(zip_url, timeout=30)
    with open(zip_path, "wb") as f:
        f.write(r.content)
    with zipfile.ZipFile(zip_path, "r") as z:
        z.extractall(DATA_DIR)
    print("Extracted to", DATA_DIR)
    udata, uitem = find_ml_files(DATA_DIR)

if not (udata and uitem):
    raise FileNotFoundError(f"Could not find u.data or u.item in {DATA_DIR}. Please check the folder structure.")

print("Found:")
print(" -", udata)
print(" -", uitem)

# Load ratings and movies
ratings_cols = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(udata, sep="\\t", names=ratings_cols, encoding='latin-1')

movies_cols = ["movie_id", "title", "genres"]
movies_raw = pd.read_csv(uitem, sep="|", names=list(range(24)), encoding='latin-1', engine="python")
movies = movies_raw[[0, 1, 2]].copy()
movies.columns = movies_cols
movies['genres'] = movies['genres'].fillna('')

print("âœ… Ratings:", ratings.shape, "| Movies:", movies.shape)
ratings.head(), movies.head()


Found:
 - data/ml-100k/ml-100k/u.data
 - data/ml-100k/ml-100k/u.item


  ratings = pd.read_csv(udata, sep="\\t", names=ratings_cols, encoding='latin-1')


âœ… Ratings: (100000, 4) | Movies: (1682, 3)


(   user_id  movie_id  rating  timestamp
 0      196       242       3  881250949
 1      186       302       3  891717742
 2       22       377       1  878887116
 3      244        51       2  880606923
 4      166       346       1  886397596,
    movie_id              title       genres
 0         1   Toy Story (1995)  01-Jan-1995
 1         2   GoldenEye (1995)  01-Jan-1995
 2         3  Four Rooms (1995)  01-Jan-1995
 3         4  Get Shorty (1995)  01-Jan-1995
 4         5     Copycat (1995)  01-Jan-1995)

In [5]:
# Basic preprocessing: map ids to ints, build train/test (leave-last-out)
import numpy as np
from sklearn.model_selection import train_test_split

# Map ids to contiguous ints (optional but convenient)
unique_users = ratings['user_id'].unique()
unique_movies = ratings['movie_id'].unique()
user_map = {u:i for i,u in enumerate(unique_users)}
movie_map = {m:i for i,m in enumerate(unique_movies)}
ratings['user_idx'] = ratings['user_id'].map(user_map)
ratings['movie_idx'] = ratings['movie_id'].map(movie_map)

# Leave-last-out per user for evaluation (ranking): keep latest rating as test
ratings = ratings.sort_values(['user_idx', 'timestamp'])
test_rows = ratings.groupby('user_idx').tail(1).index
train = ratings.drop(index=test_rows).reset_index(drop=True)
test = ratings.loc[test_rows].reset_index(drop=True)

print("Train size:", train.shape, "Test size:", test.shape)

Train size: (99057, 6) Test size: (943, 6)


In [7]:
# Train Popular baseline and Matrix Factorization using TruncatedSVD (scikit-learn)
from collections import Counter
import joblib, os
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# Popular baseline (by count)
pop_counts = train['movie_idx'].value_counts().sort_values(ascending=False)
popular_items = pop_counts.index.tolist()

# Build userâ€“item sparse matrix
user_item = coo_matrix(
    (train['rating'], (train['user_idx'], train['movie_idx'])),
    shape=(len(user_map), len(movie_map))
)

# Truncated SVD (Matrix Factorization)
svd = TruncatedSVD(n_components=50, random_state=42)
item_latent_aligned = svd.fit_transform(user_item.T)        # shape: n_items Ã— n_factors
item_latent_aligned = normalize(item_latent_aligned, axis=1) # normalize for cosine similarity

# Save model artifacts
os.makedirs("models", exist_ok=True)
joblib.dump({
    'popular': popular_items,
    'user_map': user_map,
    'movie_map': movie_map,
    'item_latent_aligned': item_latent_aligned
}, "models/mappings.pkl")

print("âœ… Trained TruncatedSVD and saved item embeddings. Shape:", item_latent_aligned.shape)


âœ… Trained TruncatedSVD and saved item embeddings. Shape: (1682, 50)


In [8]:
# Content-based: TF-IDF on title + genres
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

movies['text'] = movies['title'].fillna('') + ' ' + movies['genres'].fillna('')
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
item_tfidf = tfidf.fit_transform(movies['text'])  # shape: n_items x features

# Keep dense item vectors for quick similarity with user profile.
item_tfidf_dense = item_tfidf.toarray()
print("TF-IDF item vectors shape:", item_tfidf_dense.shape)

TF-IDF item vectors shape: (1682, 2330)


In [10]:
# Extract item latent factors from TruncatedSVD (scikit-learn)
# Works without Surprise, using the precomputed matrix from the previous step
import numpy as np
import joblib, os

# item_latent_aligned already computed by TruncatedSVD in the previous step
# Itâ€™s shaped (n_items, n_factors) â€” each row corresponds to a movie_idx

print("Item latent factors (TruncatedSVD) shape:", item_latent_aligned.shape)

# Optional: save for later steps or FastAPI serving
os.makedirs("models", exist_ok=True)
joblib.dump(item_latent_aligned, "models/item_latent_aligned.pkl")

# Ensure we have the same movie index alignment
assert item_latent_aligned.shape[0] == len(movie_map), "Item embedding count mismatch with movie map"
print("âœ… Aligned and saved item latent factors successfully.")


Item latent factors (TruncatedSVD) shape: (1682, 50)
âœ… Aligned and saved item latent factors successfully.


In [11]:
# Real-time recommender: user profile vector (no retraining) + nearest items by cosine similarity
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Normalize item embeddings
item_latent_norm = normalize(item_latent_aligned, axis=1)
item_tfidf_norm = normalize(item_tfidf_dense, axis=1)

def build_user_profile_from_interactions(interacted_item_indices, weights=None, embedding='svd'):
    # interacted_item_indices: list of movie_idx values that the user interacted with (positive feedback)
    # weights: optional list of same length with weights (e.g., recency or rating)
    # embedding: 'svd' or 'tfidf' or 'hybrid' (concatenate)
    if len(interacted_item_indices) == 0:
        return None
    if embedding == 'svd':
        vecs = item_latent_norm[interacted_item_indices]
    elif embedding == 'tfidf':
        vecs = item_tfidf_norm[interacted_item_indices]
    elif embedding == 'hybrid':
        vecs = np.hstack([item_latent_norm[interacted_item_indices], item_tfidf_norm[interacted_item_indices]])
        vecs = normalize(vecs, axis=1)
    else:
        raise ValueError('unknown embedding')
    if weights is None:
        profile = vecs.mean(axis=0)
    else:
        w = np.array(weights).reshape(-1,1)
        profile = (vecs * w).sum(axis=0) / w.sum()
    profile = profile.reshape(1, -1)
    profile = normalize(profile, axis=1)
    return profile  # shape (1, dim)

def recommend_for_profile(profile_vec, top_k=10, embedding='svd', exclude_indices=None):
    if embedding == 'svd':
        sims = cosine_similarity(profile_vec, item_latent_norm)[0]
    elif embedding == 'tfidf':
        sims = cosine_similarity(profile_vec, item_tfidf_norm)[0]
    elif embedding == 'hybrid':
        sims = cosine_similarity(profile_vec, np.hstack([item_latent_norm, item_tfidf_norm]))[0]
    else:
        raise ValueError('unknown embedding')
    # Exclude already seen items
    if exclude_indices is not None and len(exclude_indices)>0:
        sims[exclude_indices] = -1
    top_idx = sims.argsort()[::-1][:top_k]
    return top_idx, sims[top_idx]

# Example: simulate a new user who liked movie_idx [10, 50, 200]
sample_interactions = [10, 50, 200]
profile = build_user_profile_from_interactions(sample_interactions, embedding='hybrid')
rec_idx, scores = recommend_for_profile(profile, top_k=10, embedding='hybrid', exclude_indices=sample_interactions)
print('Recommended movie indices (internal):', rec_idx)
inv_movie_map = {v:k for k,v in movie_map.items()}
print('Recommendations (movie_id, title):')
for idx, sc in zip(rec_idx, scores):
    mid = inv_movie_map[idx]
    title = movies.loc[movies['movie_id']==mid, 'title'].values[0]
    print(mid, title, f'score={sc:.4f}')

Recommended movie indices (internal): [ 57  56  26  52  53  31   6 273 154  32]
Recommendations (movie_id, title):
423 E.T. the Extra-Terrestrial (1982) score=0.5883
143 Sound of Music, The (1965) score=0.5618
95 Aladdin (1992) score=0.5484
181 Return of the Jedi (1983) score=0.5444
196 Dead Poets Society (1989) score=0.5419
98 Silence of the Lambs, The (1991) score=0.5376
265 Hunt for Red October, The (1990) score=0.5193
197 Graduate, The (1967) score=0.5117
443 Birds, The (1963) score=0.5113
193 Right Stuff, The (1983) score=0.5069


In [12]:
# Save item embeddings to disk for serving (optional)
import numpy as np, joblib, os
os.makedirs('models', exist_ok=True)
np.save('models/item_latent_aligned.npy', item_latent_aligned)
np.save('models/item_tfidf.npy', item_tfidf_dense)
joblib.dump({'movie_id_to_idx': movie_map, 'idx_to_movie_id': {v:k for k,v in movie_map.items()}}, 'models/movie_maps.pkl')
print('Saved embeddings and maps to models/')

Saved embeddings and maps to models/


In [13]:
# Simple evaluation: Precision@K on leave-last-out test using profile-based ranking
import numpy as np
def precision_at_k_user(recommended_idxs, relevant_idxs, k=10):
    return len(set(recommended_idxs[:k]) & set(relevant_idxs)) / k

test_gt = test.groupby('user_idx')['movie_idx'].apply(list).to_dict()

precisions = []
for user_idx, gt_list in test_gt.items():
    user_train_items = train[train['user_idx']==user_idx]['movie_idx'].tolist()
    if len(user_train_items)==0:
        continue
    profile = build_user_profile_from_interactions(user_train_items, embedding='hybrid')
    rec_idx, _ = recommend_for_profile(profile, top_k=10, embedding='hybrid', exclude_indices=user_train_items)
    prec = precision_at_k_user(rec_idx, gt_list, k=10)
    precisions.append(prec)
print('Mean Precision@10 (hybrid profile):', np.mean(precisions))

Mean Precision@10 (hybrid profile): 0.010286320254506895


In [15]:
# ============================================================
# ðŸŽ¬ Colab-Compatible Streamlit Movie Recommender (with ngrok auth)
# ============================================================

# STEP 1 â€” Install dependencies
!pip install streamlit pyngrok -q

# STEP 2 â€” Add your ngrok auth token
!ngrok authtoken 32rWZvFnb6BC3GpnyvEdk0JJq7b_34DfL9ijFqPW9gnp8GGFV

# STEP 3 â€” Paste your Streamlit code into a .py file
app_code = r"""
import streamlit as st
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

# ------------------------------
# LOAD MODELS AND DATA
# ------------------------------
@st.cache_resource
def load_models():
    movie_maps = joblib.load("models/movie_maps.pkl")
    item_latent = np.load("models/item_latent_aligned.npy")
    item_tfidf = np.load("models/item_tfidf.npy")
    movies = pd.read_csv("data/ml-100k/ml-100k/u.item", sep="|", names=list(range(24)), encoding="latin-1")[[0,1,2]]
    movies.columns = ["movie_id", "title", "genres"]
    return movie_maps, item_latent, item_tfidf, movies

movie_maps, item_latent, item_tfidf, movies = load_models()

movie_id_to_idx = movie_maps["movie_id_to_idx"]
idx_to_movie_id = movie_maps["idx_to_movie_id"]

# Normalize embeddings
item_latent = normalize(item_latent, axis=1)
item_tfidf = normalize(item_tfidf, axis=1)

# ------------------------------
# FUNCTIONS
# ------------------------------
def build_user_profile(selected_titles, embedding="hybrid"):
    indices = [movie_id_to_idx[int(movies[movies["title"]==title]["movie_id"].values[0])]
               for title in selected_titles if title in movies["title"].values]
    if len(indices) == 0:
        return None
    if embedding == "svd":
        vecs = item_latent[indices]
    elif embedding == "tfidf":
        vecs = item_tfidf[indices]
    else:
        vecs = np.hstack([item_latent[indices], item_tfidf[indices]])
        vecs = normalize(vecs, axis=1)
    profile = vecs.mean(axis=0).reshape(1, -1)
    profile = normalize(profile, axis=1)
    return profile

def recommend(profile, embedding="hybrid", top_k=10, exclude=None):
    if embedding == "svd":
        sims = cosine_similarity(profile, item_latent)[0]
    elif embedding == "tfidf":
        sims = cosine_similarity(profile, item_tfidf)[0]
    else:
        sims = cosine_similarity(profile, np.hstack([item_latent, item_tfidf]))[0]
    if exclude:
        for idx in exclude:
            sims[idx] = -1
    top_idx = sims.argsort()[::-1][:top_k]
    recs = []
    for i in top_idx:
        mid = idx_to_movie_id[i]
        title = movies.loc[movies["movie_id"]==mid, "title"].values[0]
        recs.append((title, sims[i]))
    return recs

# ------------------------------
# STREAMLIT UI
# ------------------------------
st.set_page_config(page_title="Movie Recommender", page_icon="ðŸŽ¬")
st.title("ðŸŽ¬ Real-Time Movie Recommender (Free, Local ML)")
st.markdown("Select movies you like â€” the system will recommend similar ones instantly!")

user_movies = st.multiselect(
    "ðŸŽ¥ Pick a few movies you liked:",
    movies["title"].tolist(),
    default=["Toy Story (1995)", "Pulp Fiction (1994)"]
)

embedding_choice = st.radio("Model type:", ["hybrid", "svd", "tfidf"], horizontal=True)

if st.button("ðŸš€ Get Recommendations"):
    profile = build_user_profile(user_movies, embedding=embedding_choice)
    if profile is None:
        st.warning("Please select at least one valid movie.")
    else:
        exclude = [movie_id_to_idx[int(movies[movies['title']==t]['movie_id'].values[0])] for t in user_movies]
        recs = recommend(profile, embedding=embedding_choice, exclude=exclude)
        st.subheader("ðŸŽ¯ Top Recommendations")
        for title, score in recs:
            st.write(f"**{title}** â€” similarity: `{score:.4f}`")
"""

with open("app.py", "w") as f:
    f.write(app_code)

# STEP 4 â€” Start Streamlit via ngrok tunnel
from pyngrok import ngrok
import threading, time

def run_app():
    !streamlit run app.py --server.port 8501 > /dev/null 2>&1

thread = threading.Thread(target=run_app)
thread.start()

# Wait and open tunnel
time.sleep(5)
public_url = ngrok.connect(8501)
print("âœ… Streamlit app is live at:", public_url)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
âœ… Streamlit app is live at: NgrokTunnel: "https://ebbe6f8c5da3.ngrok-free.app" -> "http://localhost:8501"
