In [37]:
!pip install -qU \
  pinecone-client==3.1.0

!pip install datasets surprise

from datasets import load_dataset
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate
from pinecone import Pinecone
import numpy as np
import os
from IPython.core.display import HTML
from pinecone import ServerlessSpec



In [38]:
# Load the ratings dataset into a DataFrame. The columns used below are
# user_id, movie_id, rating and imdb_id.
movies = load_dataset("pinecone/movielens-recent-ratings", split="train").to_pandas()

# Keep a single row per imdb_id so each movie appears once.
unique_movies = movies.drop_duplicates(subset="imdb_id")

# Surprise expects (user, item, rating) triples plus the rating scale.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(movies[['user_id', 'movie_id', 'rating']], reader)

# Fixed seed: both the train/test split and the SVD factor initialisation are
# random, so without it the RMSE changes on every Restart & Run All.
SEED = 42
trainset, testset = train_test_split(data, test_size=0.2, random_state=SEED)

# Use SVD (Singular Value Decomposition) for Collaborative Filtering
model = SVD(random_state=SEED)
model.fit(trainset)

predictions = model.test(testset)

# accuracy.rmse prints the score itself by default; silence it so the value
# is reported exactly once.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"RMSE: {rmse}")

RMSE: 0.8368
RMSE: 0.836785213431494


In [39]:
import time
from getpass import getpass

# SECURITY: never commit the API key to the notebook. Read it from the
# environment, falling back to an interactive prompt. The key that used to be
# hardcoded in this cell must be considered leaked and should be revoked.
api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
if not api_key:
    raise RuntimeError("A Pinecone API key is required (set PINECONE_API_KEY).")
pc = Pinecone(api_key=api_key)

cloud = 'aws'
region = 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

index_name = 'movie-emb'

# Start from a clean slate: drop any existing index with the same name.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Each movie is embedded as its SVD item latent-factor vector. A 1-dimensional
# vector (e.g. a single predicted rating) is useless with the cosine metric:
# every pair of positive scalars has similarity 1.
dimensions = model.n_factors
pc.create_index(
    name=index_name,
    dimension=dimensions,
    metric="cosine",
    spec=spec
)

# Index creation is asynchronous; poll until it reports ready.
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

# Store movie embeddings (SVD item factors) in batches, with the movie's row
# attached as metadata and its imdb_id as the vector ID.
batch_size = 64
for i in range(0, len(unique_movies), batch_size):
    batch = unique_movies.iloc[i:i+batch_size]
    to_upsert = []
    for record in batch.to_dict(orient='records'):
        try:
            inner_id = trainset.to_inner_iid(record['movie_id'])
        except ValueError:
            # The movie has no ratings in the training split, so the model
            # learned no factors for it; skip it.
            continue
        to_upsert.append((record['imdb_id'], model.qi[inner_id].tolist(), record))
    if to_upsert:
        index.upsert(vectors=to_upsert)

In [42]:
# movie_id -> title lookup.
movie_titles = dict(zip(unique_movies['movie_id'], unique_movies['title']))

# movie_id -> {"title": ..., "poster": ...} lookup. get_recommendations reads
# this global; it has to be defined here so the notebook runs on a fresh kernel.
# drop_duplicates guards to_dict(orient='index'), which requires unique keys
# (unique_movies is de-duplicated on imdb_id, not movie_id).
movie_metadata = (
    unique_movies
    .drop_duplicates(subset='movie_id')
    .set_index('movie_id')[['title', 'poster']]
    .to_dict(orient='index')
)

def get_movie_id(movie_name):
    """Find the movie ID and poster for a given (partial) movie title.

    The lookup is a case-insensitive *literal* substring match against the
    ``title`` column of ``unique_movies``; the first matching row wins.

    Args:
        movie_name: Text typed by the user. Leading/trailing whitespace is
            ignored.

    Returns:
        A ``(movie_id, poster)`` tuple for the first match, or
        ``(None, None)`` when the input is blank or nothing matches.
    """
    query = movie_name.strip() if isinstance(movie_name, str) else ""

    # A blank query would match every title, so treat it as "not found".
    if not query:
        print("Movie not found! Please check the name and try again.")
        return None, None

    # regex=False: titles commonly contain characters such as "(", ")" or "*"
    # that would otherwise be interpreted as a regular expression.
    is_match = unique_movies['title'].str.contains(query, case=False, na=False, regex=False)
    movie_row = unique_movies[is_match]

    if movie_row.empty:
        print("Movie not found! Please check the name and try again.")
        return None, None

    first_match = movie_row.iloc[0]
    return first_match['movie_id'], first_match['poster']

def get_recommendations(movie_name, top_n=10, max_users=500):
    """Recommend movies for the users who rated the searched movie.

    For up to ``max_users`` users who rated the searched movie, the SVD model
    predicts a rating for every other movie. The predictions are summed per
    movie (equivalent to ranking by mean, since every movie is scored by the
    same users) and the ``top_n`` best movies are returned, each at most once.

    Args:
        movie_name: Title text passed to ``get_movie_id``.
        top_n: Maximum number of recommendations to return.
        max_users: Cap on the number of raters used, to bound the run time.

    Returns:
        ``(input_movie_poster, recommended_titles, recommended_posters)``.
        When the movie is not found the result is ``(None, [], [])``.
    """
    movie_id, input_movie_poster = get_movie_id(movie_name)

    if movie_id is None:
        return None, [], []

    # Users who rated this movie (capped for efficiency).
    users_who_rated = movies.loc[movies['movie_id'] == movie_id, 'user_id'].unique()[:max_users]
    if len(users_who_rated) == 0:
        return input_movie_poster, [], []

    # Every other movie is a candidate; computed once, not once per user.
    candidate_ids = [mid for mid in movies['movie_id'].unique() if mid != movie_id]

    # Aggregate per movie so a movie cannot occupy several top-N slots just
    # because several users are predicted to like it.
    score_totals = dict.fromkeys(candidate_ids, 0.0)
    for user in users_who_rated:
        for other_movie_id in candidate_ids:
            score_totals[other_movie_id] += model.predict(user, other_movie_id).est

    ranked_ids = sorted(candidate_ids, key=score_totals.__getitem__, reverse=True)

    # Title/poster lookup keyed by movie_id, built from the de-duplicated frame
    # so this function does not depend on any other notebook global.
    catalog = (
        unique_movies
        .drop_duplicates(subset='movie_id')
        .set_index('movie_id')[['title', 'poster']]
    )

    recommended_movies = []
    posters = []
    for candidate_id in ranked_ids:
        if len(recommended_movies) >= top_n:
            break
        # Skip candidates without metadata and keep going, so the result is
        # still filled up to top_n when possible.
        if candidate_id in catalog.index:
            recommended_movies.append(catalog.at[candidate_id, 'title'])
            posters.append(catalog.at[candidate_id, 'poster'])

    return input_movie_poster, recommended_movies, posters

def display_posters(input_movie_poster, posters, titles, input_movie_name):
    """Render the searched movie's poster above a grid of recommended movies.

    Args:
        input_movie_poster: Poster URL of the movie the user searched for.
        posters: Poster URLs of the recommended movies.
        titles: Titles of the recommended movies, parallel to ``posters``.
        input_movie_name: The search text, shown in the heading.

    Every interpolated value is HTML-escaped: the search text is typed by the
    user and titles/URLs come from the dataset, so characters such as ``<``,
    ``&`` or ``"`` must not be able to break or inject markup.
    """
    # Imported locally so the block is self-contained.
    from html import escape

    markup = f'''
        <div style="text-align: center; margin-bottom: 20px;">
            <h3>Movie You Searched: {escape(str(input_movie_name))}</h3>
            <img src="{escape(str(input_movie_poster))}" style="width: 150px; height: 200px; border-radius: 10px;">
        </div>
    '''

    figures = []
    for title, poster in zip(titles, posters):
        figures.append(f'''
            <figure style="margin: 5px !important; text-align: center;">
              <img src="{escape(str(poster))}" style="width: 120px; height: 150px; border-radius: 10px;">
              <figcaption style="font-size: 12px;">{escape(str(title))}</figcaption>
            </figure>
        ''')

    grid = ''.join(figures)
    markup += f'''
        <h3>Recommended Movies</h3>
        <div style="display: flex; flex-flow: row wrap; text-align: center;">
        {grid}
        </div>
    '''

    display(HTML(markup))

# Ask for a title, look up recommendations, then either render the poster grid
# or report that nothing was found.
movie_name = input("Enter the movie name: ")
input_movie_poster, recommended_movies, posters = get_recommendations(movie_name)

if not recommended_movies:
    print("No recommendations found.")
else:
    display_posters(
        input_movie_poster=input_movie_poster,
        posters=posters,
        titles=recommended_movies,
        input_movie_name=movie_name,
    )

Enter the movie name: incredibles
