In [1]:
import torch
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import pandas as pd
import numpy as np
import requests
from io import BytesIO
import os


In [18]:
movieapp = pd.read_csv("/home/pokji/vscode-projects/uni/information_search/data/movieapp_movie.csv")
movieapp.head()

Unnamed: 0,id,movie_id,languages,releaseDate,directors,runtime,title,mpaa,actors,originalTitle,genres,plotSummary,avgRating,releaseYear,poster_path
0,45844,1,English,1995.0,John Lasseter,81,Toy Story,G,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",Toy Story,"Animation, Comedy, Family","Led by Woody, Andy's toys live happily in his ...",3.9,1995.0,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,45845,2,"English, Français",1995.0,Joe Johnston,104,Jumanji,PG,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Jumanji,"Adventure, Fantasy, Family",When siblings Judy and Peter discover an encha...,3.2,1995.0,/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg
2,45846,3,English,1995.0,Howard Deutch,101,Grumpier Old Men,PG-13,"Walter Matthau, Jack Lemmon, Ann-Margret, Soph...",Grumpier Old Men,"Romance, Comedy",A family wedding reignites the ancient feud be...,3.2,1995.0,/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg
3,45847,4,English,1995.0,Forest Whitaker,127,Waiting to Exhale,R,"Whitney Houston, Angela Bassett, Loretta Devin...",Waiting to Exhale,"Comedy, Drama, Romance","Cheated on, mistreated and stepped on, the wom...",2.9,1995.0,/qJU6rfil5xLVb5HpJsmmfeSK254.jpg
4,45848,5,English,1995.0,Charles Shyer,106,Father of the Bride Part II,PG,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...,3.1,1995.0,/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg


In [2]:
movies = pd.read_csv('/home/pokji/vscode-projects/uni/information_search/data/movieapp_movie.csv')
movies.head(2)

Unnamed: 0,id,movie_id,languages,releaseDate,directors,runtime,title,mpaa,actors,originalTitle,genres,plotSummary,avgRating,releaseYear,poster_path
0,45844,1,English,1995.0,John Lasseter,81,Toy Story,G,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",Toy Story,"Animation, Comedy, Family","Led by Woody, Andy's toys live happily in his ...",3.9,1995.0,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,45845,2,"English, Français",1995.0,Joe Johnston,104,Jumanji,PG,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Jumanji,"Adventure, Fantasy, Family",When siblings Judy and Peter discover an encha...,3.2,1995.0,/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg


In [5]:
posters_df = pd.read_csv('/home/pokji/vscode-projects/uni/information_search/data_old/local_path_posters.csv')

In [6]:
movies[['movie_id']]

Unnamed: 0,movie_id
0,1
1,2
2,3
3,4
4,5
...,...
45838,176269
45839,176271
45840,176273
45841,176275


In [7]:
posters_df[['movie_id']]

Unnamed: 0,movie_id
0,1
1,2
2,3
3,4
4,5
...,...
45838,176269
45839,176271
45840,176273
45841,176275


In [8]:
# Drop NaN and ensure genres is a list
movies['genres'] = movies['genres'].fillna('')
movies['genres'] = movies['genres'].apply(lambda x: x.split(','))

In [9]:
# Get boolean mask for existing poster files
exists_mask = posters_df['local_poster_path'].apply(lambda path: isinstance(path, str) and os.path.isfile(path))

# Get indices of rows with proper posters
valid_indices = posters_df[exists_mask].index.tolist()

print(f"Number of valid poster files: {len(valid_indices)}")

# Valid Dataset
valid_posters_df = posters_df[exists_mask].reset_index(drop=True)

Number of valid poster files: 43527


In [31]:
valid_posters_df[['movie_id']].to_csv('valid_posters_index_movieid.csv', index=True)


### Load the CLIP embeddings

In [5]:
# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CLIP model
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

In [8]:
from tqdm import tqdm

def get_clip_embeddings_batch(image_paths, batch_size=16):
    embeddings = []
    for i in tqdm(range(0, len(image_paths), batch_size), desc="Embedding posters"):
        batch_paths = image_paths[i:i+batch_size]
        images = []
        for path in batch_paths:
            try:
                # print(path)
                images.append(Image.open(path).convert("RGB"))
            except Exception:
                print(path)
                images.append(Image.new("RGB", (224, 224)))  # fallback blank image
        inputs = clip_processor(images=images, return_tensors="pt", padding=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            batch_emb = clip_model.get_image_features(**inputs)
        embeddings.extend(batch_emb.cpu().numpy())
    return np.array(embeddings)


embeddings = get_clip_embeddings_batch(valid_posters_df['local_poster_path'].tolist(), batch_size=128)

Embedding posters: 100%|██████████| 341/341 [1:02:14<00:00, 10.95s/it]


In [12]:
embeddings = np.load("clip_poster_embeddings.npy")

len(embeddings)

43527

In [13]:
# Add embeddings column for every movie with valid poster
valid_posters_df['embedding'] = list(embeddings)

In [11]:
valid_posters_df.head(2)

Unnamed: 0,id,movie_id,title,local_poster_path
0,45844,1,Toy Story,posters/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,45845,2,Jumanji,posters/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg


In [14]:
movies_exploded = movies.explode('genres')
posters_df


Unnamed: 0,id,movie_id,title,local_poster_path
0,45844,1,Toy Story,posters/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,45845,2,Jumanji,posters/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg
2,45846,3,Grumpier Old Men,posters/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg
3,45847,4,Waiting to Exhale,posters/qJU6rfil5xLVb5HpJsmmfeSK254.jpg
4,45848,5,Father of the Bride Part II,posters/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg
...,...,...,...,...
45838,91682,176269,Subdued,posters/uXDfjJbdP4ijW5hWSBrPrlKpxac.jpg
45839,91683,176271,Century of Birthing,posters/5RuNHleRzOfbshJJT4QVFmxC5su.jpg
45840,91684,176273,Betrayal,posters/63pHI5M1og4pc7CdGKpAW0Fntxf.jpg
45841,91685,176275,Satan Triumphant,posters/kmPJ4iJn3gtZyWCxCiuggl2kAuu.jpg


In [34]:
movies[movies.title == "Forrest Gump"]

Unnamed: 0,id,movie_id,languages,releaseDate,directors,runtime,title,mpaa,actors,originalTitle,genres,plotSummary,avgRating,releaseYear,poster_path
352,46196,356,English,1994.0,Robert Zemeckis,142,Forrest Gump,PG-13,"Tom Hanks, Robin Wright, Gary Sinise, Mykelti ...",Forrest Gump,"[Comedy, Drama, Romance]",A man with a low IQ has accomplished great thi...,4.1,1994.0,/arw2vcBveWOVZr6pxd9XTd1TdQa.jpg


In [15]:
movies_exploded = movies.explode('genres')

movies_with_posters = pd.merge(
    movies_exploded,
    valid_posters_df[['movie_id', 'local_poster_path', 'embedding']],
    left_on='movie_id',  # adjust if your movie ID column is named differently
    right_on='movie_id'
)

In [42]:
movies_with_posters.head(2)

Unnamed: 0,id,movie_id,languages,releaseDate,directors,runtime,title,mpaa,actors,originalTitle,genres,plotSummary,avgRating,releaseYear,poster_path,local_poster_path,embedding
0,45844,1,English,1995.0,John Lasseter,81,Toy Story,G,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",Toy Story,Animation,"Led by Woody, Andy's toys live happily in his ...",3.9,1995.0,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,posters/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,"[-0.2138414, -0.34096324, 0.33010274, 0.062374..."
1,45844,1,English,1995.0,John Lasseter,81,Toy Story,G,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",Toy Story,Comedy,"Led by Woody, Andy's toys live happily in his ...",3.9,1995.0,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,posters/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,"[-0.2138414, -0.34096324, 0.33010274, 0.062374..."


In [17]:
def recommend_by_poster(movie_id, top_n=5):
    
    target_rows = movies_with_posters[movies_with_posters['movie_id'] == movie_id]
    if target_rows.empty:
        print(f"No movie found with ID {movie_id}.")
        return None
    
    # Get all unique genres for the target movie
    target_genres = set(target_rows['genres'])

    # Use the first embedding
    target_emb = target_rows.iloc[0]['embedding'].reshape(1, -1)

    candidates = movies_with_posters[
        (movies_with_posters['movie_id'] != movie_id) &
        (movies_with_posters['genres'].isin(target_genres))
    ].copy()

    # Drop the duplicated movies from exploded df
    candidates = candidates.drop_duplicates('movie_id')


    if candidates.empty:
        print("No other movies with matching genres.")
        return None
    
    # Compute cosine similarity
    embds = np.stack(candidates['embedding'].values)
    sims = cosine_similarity(target_emb, embds)[0]
    
    candidates['similarity'] = sims

    recommendations = candidates.sort_values('similarity', ascending=False).head(top_n)
    return recommendations[['movie_id', 'title', 'genres', 'similarity', 'local_poster_path']]


recommendations = recommend_by_poster(movie_id=1)  
recommendations


Unnamed: 0,movie_id,title,genres,similarity,local_poster_path
7065,3114,Toy Story 2,Animation,0.902331,posters/yFWQkz2ynjwsazT6xQiIXEUsyuh.jpg
35227,78499,Toy Story 3,Animation,0.876064,posters/AbbXspMOwdvwWZgVN0nabZq03Ec.jpg
47772,106022,Toy Story of Terror!,Comedy,0.871648,posters/oPBEnNP4Fg4gv9c0KBhchmtoG4H.jpg
33350,72701,Planet 51,Family,0.860752,posters/x7Itcg3ZdExKwdKguy73WPEqosW.jpg
54899,120474,Toy Story That Time Forgot,Animation,0.860189,posters/kVOaFo8RaYQceHfTqbZGmV9R80q.jpg
