In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array

2024-11-23 01:00:42.179273: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 01:00:42.195962: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8463] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 01:00:42.201144: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import requests

# poster_directory = '/home/wendy/temp/movie_rs/downloaded_images'  # Replace with the actual directory path

data = pd.read_csv('Datasets/TMDB_movie_dataset_v11.csv')

model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

def load_and_preprocess_image(image_path=None, url_path=None, target_size=(224, 224)):
    try:
        if image_path:
            # Load and preprocess the image from a file path
            img = Image.open(image_path).convert('RGB')
        elif url_path:
            # Load and preprocess the image from a URL
            response = requests.get(url_path, stream=True)
            response.raise_for_status()  # Raise an error for bad responses
            img = Image.open(response.raw).convert('RGB')
        else:
            print("No image_path or url_path provided.")
            return None
        
        # Resize and preprocess the image
        img = img.resize(target_size)
        img_array = img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        return preprocess_input(img_array)
    
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

        


I0000 00:00:1732352450.831298 3642262 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732352450.831652 3642262 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732352450.831967 3642262 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1732352450.832284 3642262 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [7]:
def extract_features(starting, data, poster_directory, num_movies, TMDB_prefix="https://image.tmdb.org/t/p/w500"):
    features = []
    success_count = 0
    poster_col = data['poster_path']
    id_col = data['id']
    
    for row_idx in range(starting*10000, starting*10000 + 10000):
        movie_idx = id_col[row_idx]
        if not poster_directory:
            poster_path = os.path.join(poster_directory, f"image_{movie_idx}.jpeg")
        
        if poster_directory and os.path.exists(poster_path):
            preprocessed_image = load_and_preprocess_image(image_path=poster_path)
        else:
            if poster_col[row_idx] and poster_col[row_idx] != float('nan'):
                try:
                    url = TMDB_prefix + poster_col[row_idx]
                    preprocessed_image = load_and_preprocess_image(url_path=url)
                except Exception as e:
                    print(f"Error {poster_col[row_idx]=}: {e}")
                    
            else:
                preprocessed_image = None

        if preprocessed_image is not None:
            try:
                feature = model.predict(preprocessed_image)
                features.append(feature.flatten())
                success_count += 1  # Increment success counter
            except Exception as e:
                print(f"Error predicting features for index {row_idx}: {e}")
                features.append(None)
        else:
            features.append(None)
    
    print(f"Successfully processed {success_count}/{num_movies} images.")
    return np.array(features), success_count

def save_features(features, file_path):
    np.save(file_path, features)
    print(f"Features saved to {file_path}")



for i in range(2):
    features, success_count = extract_features(i, data, None, len(data))
    save_features(features, f"MovieFeatures/movie_features_{i}.npy")

def load_features(file_path):
    if os.path.exists(file_path):
        print(f"Loading features from {file_path}")
        return np.load(file_path, allow_pickle=True)
    else:
        print(f"No saved features found at {file_path}. Please extract features first.")
        return None
# Load extracted feature
# file_path = "movie_features.npy"
# features = load_features(file_path)
# print(len(features))

# if features is None:
#     features, success_count = extract_features(data['poster_path'], poster_directory, len(data))
#     save_features(features, file_path)



I0000 00:00:1732352699.677610 3717887 service.cc:146] XLA service 0x7048040012c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732352699.677645 3717887 service.cc:154]   StreamExecutor device (0): NVIDIA RTX A5000, Compute Capability 8.6
I0000 00:00:1732352699.677649 3717887 service.cc:154]   StreamExecutor device (1): NVIDIA RTX A5000, Compute Capability 8.6
I0000 00:00:1732352699.677653 3717887 service.cc:154]   StreamExecutor device (2): NVIDIA RTX A5000, Compute Capability 8.6
I0000 00:00:1732352699.677656 3717887 service.cc:154]   StreamExecutor device (3): NVIDIA RTX A5000, Compute Capability 8.6
2024-11-23 01:04:59.761608: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-23 01:05:00.075421: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


I0000 00:00:1732352701.317811 3717887 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19

In [5]:
# Handle missing features (e.g., due to failed image loading)
features = np.array([feat for feat in features if feat is not None])

# Compute cosine similarity
similarity_matrix = cosine_similarity(features)

# Function to recommend movies based on poster similarity
def recommend_movies(movie_index, top_k=20):
    similar_indices = np.argsort(similarity_matrix[movie_index])[::-1][1:top_k + 1]
    recommendations = data.iloc[similar_indices]
    sim_list = []
    for i in recommendations:
        sim_list.append(similarity_matrix[i])
    return recommendations, sim_list

# Example: Recommend movies for the first movie in the dataset
movie_index = 0
recommended_movies, sim_list = recommend_movies(movie_index)

print("Recommended movies:")
print(recommended_movies[['title']])

Recommended movies:
                                        title
945                                  Geostorm
2565                               Extinction
2674             Teenage Mutant Ninja Turtles
494           The Secret Life of Walter Mitty
9348  The Little Girl Who Lives Down the Lane
814                              Total Recall
658                                   Inferno
2167                           New Year's Eve
5924                         King of New York
415                                    Oldboy
4789                                Byzantium
3811                            The Forgotten
4137                                Black Box
2691                                Eden Lake
1453                                 The Raid
8902                             Arsène Lupin
6178                               Dark Water
2808                                  Skyline
1479                        Death on the Nile
1592                               Underwater
