In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
imdb_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/TMDB_movie_dataset_v11.csv")
spotify_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/spotify_dataset.csv")

# Data cleaning

In [5]:
print(imdb_data.shape)

duplicate_rows = imdb_data[imdb_data.duplicated()]
print(duplicate_rows.shape)

#remove duplicates
df_cleaned = imdb_data.drop_duplicates()
print(df_cleaned.shape)

#clean null in critial rows used in mapping later: titles, overview, poster_path, popularity, genres
critical_columns = ['title', 'overview', 'poster_path', 'popularity', 'genres']
null_title_rows = df_cleaned[df_cleaned[critical_columns].isnull().any(axis=1)]
print(null_title_rows.shape)
df_cleaned_titles = df_cleaned.dropna(subset=critical_columns)
print(df_cleaned_titles.shape)

#only released and without adult content
df_filtered = df_cleaned_titles[(df_cleaned_titles['status'] == 'Released') & (df_cleaned_titles['adult'] == False)]
movie_final_df = df_filtered.drop(columns=['imdb_id', 'homepage', 'status', 'adult', 'backdrop_path', 'release_date', 'tagline', 'production_companies', 'production_countries', 'spoken_languages', 'keywords'])
print(movie_final_df.shape)

(1111181, 24)
(366, 24)
(1110815, 24)
(657733, 24)
(453082, 24)
(436591, 13)


In [6]:
movie_final_df.head()

Unnamed: 0,id,title,vote_average,vote_count,revenue,runtime,budget,original_language,original_title,overview,popularity,poster_path,genres
0,27205,Inception,8.364,34495,825532764,148,160000000,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,"Action, Science Fiction, Adventure"
1,157336,Interstellar,8.417,32571,701729206,169,165000000,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,"Adventure, Drama, Science Fiction"
2,155,The Dark Knight,8.512,30619,1004558444,152,185000000,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,"Drama, Action, Crime, Thriller"
3,19995,Avatar,7.573,29815,2923706026,162,237000000,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,"Action, Adventure, Fantasy, Science Fiction"
4,24428,The Avengers,7.71,29166,1518815515,143,220000000,en,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,"Science Fiction, Action, Adventure"


In [7]:
movie_final_df.isna().sum()

Unnamed: 0,0
id,0
title,0
vote_average,0
vote_count,0
revenue,0
runtime,0
budget,0
original_language,0
original_title,0
overview,0


In [8]:
spotify_data.shape

(114000, 21)

In [9]:
spotify_data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


# Preprocessing & Merging

In [10]:
spotify_sample = spotify_data.head(50)
movie_sample = movie_final_df.head(100)

In [11]:
from IPython.display import Image, display

# Base URL for TMDb images
BASE_IMAGE_URL = "https://image.tmdb.org/t/p/w500"

# Add full poster URLs
movie_sample['poster_url'] = BASE_IMAGE_URL + movie_sample['poster_path'].astype(str)

# Display poster images
for index, row in movie_sample.head(5).iterrows():
    print(f"Title: {row['title']}")
    display(Image(url=row['poster_url']))

Title: Inception


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_sample['poster_url'] = BASE_IMAGE_URL + movie_sample['poster_path'].astype(str)


Title: Interstellar


Title: The Dark Knight


Title: Avatar


Title: The Avengers


In [12]:
import os

# Directory to save downloaded images
POSTER_DIR = "/content/drive/MyDrive/Colab Notebooks/Posters"
os.makedirs(POSTER_DIR, exist_ok=True)  # Create the directory if it doesn't exist

def download_poster(row, base_url="https://image.tmdb.org/t/p/w500", column="poster_path"):
    poster_url = base_url + row[column]
    local_path = os.path.join(POSTER_DIR, f"{row['id']}.jpg")
    try:
        if not os.path.exists(local_path):
            response = requests.get(poster_url, stream=True, timeout=10)
            if response.status_code == 200:
                with open(local_path, "wb") as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)
                return local_path  # Success
            else:
                return f"Failed: HTTP {response.status_code}"  # HTTP Error
        return local_path  # Already exists
    except Exception as e:
        return f"Error: {e}"  # General Exception

# Parallel download function with progress
def download_posters_parallel(df, base_url="https://image.tmdb.org/t/p/w500", column="poster_path"):
    local_paths = []
    failed_urls = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(download_poster, row, base_url, column): row['id'] for _, row in df.iterrows()}

        # Use tqdm to track progress
        with tqdm(total=len(futures), desc="Downloading Posters", unit="poster") as pbar:
            for future in as_completed(futures):
                result = future.result()
                if isinstance(result, str) and (result.startswith("Failed") or result.startswith("Error")):
                    failed_urls.append(result)  # Log failures
                local_paths.append(result)
                pbar.update(1)  # Update progress bar

    # Log failures
    if failed_urls:
        with open("failed_posters.log", "w") as log_file:
            for error in failed_urls:
                log_file.write(error + "\n")
        print(f"{len(failed_urls)} posters failed to download. See 'failed_posters.log' for details.")

    return local_paths

In [13]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from IPython.display import Image, display
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import requests

# Add local poster paths to the DataFrame
movie_sample['local_poster_path'] = download_posters_parallel(movie_sample)

Downloading Posters: 100%|██████████| 100/100 [00:02<00:00, 47.19poster/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_sample['local_poster_path'] = download_posters_parallel(movie_sample)


In [14]:
def extract_features_in_batches(image_paths, batch_size=32, target_size=(32, 32)):
    base_model = MobileNetV2(weights='imagenet', include_top=False, pooling='avg')  # Load pre-trained model
    features_list = []

    # Process images in batches with tqdm for progress tracking
    for i in tqdm(range(0, len(image_paths), batch_size), desc="Feature Extraction", unit="batch"):
        batch_paths = image_paths[i:i + batch_size]
        batch_images = []

        # Load and preprocess images
        for path in batch_paths:
            try:
                img = load_img(path, target_size=target_size)  # Resize image
                img = img_to_array(img)  # Convert to array
                img = tf.keras.applications.mobilenet_v2.preprocess_input(img)  # Preprocess for MobileNetV2
                batch_images.append(img)
            except Exception as e:
                print(f"Error loading image {path}: {e}")
                batch_images.append(np.zeros((target_size[0], target_size[1], 3)))  # Placeholder for missing images

        batch_images = np.array(batch_images)  # Convert batch to NumPy array
        batch_features = base_model.predict(batch_images, batch_size=batch_size, verbose=0)  # Extract features
        features_list.append(batch_features)

    return np.vstack(features_list)

In [15]:
image_paths = movie_sample['local_poster_path'].dropna().tolist()  # List of image paths
output_file = "poster_features_32x32.npy"  # Output file for saving features

In [16]:
all_features = extract_features_in_batches(image_paths=image_paths, batch_size=32, target_size=(32, 32))
np.save(output_file, all_features)  # Save features to disk

  base_model = MobileNetV2(weights='imagenet', include_top=False, pooling='avg')  # Load pre-trained model


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Feature Extraction: 100%|██████████| 4/4 [00:05<00:00,  1.45s/batch]


In [17]:
loaded_features = np.load(output_file)
print(f"Loaded {len(loaded_features)} features from {output_file}")

Loaded 100 features from poster_features_32x32.npy


In [18]:
loaded_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [19]:
movie_sample['poster_features'] = list(loaded_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_sample['poster_features'] = list(loaded_features)


In [20]:
movie_sample.head()

Unnamed: 0,id,title,vote_average,vote_count,revenue,runtime,budget,original_language,original_title,overview,popularity,poster_path,genres,poster_url,local_poster_path,poster_features
0,27205,Inception,8.364,34495,825532764,148,160000000,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,"Action, Science Fiction, Adventure",https://image.tmdb.org/t/p/w500/oYuLEt3zVCKq57...,/content/drive/MyDrive/Colab Notebooks/Posters...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,157336,Interstellar,8.417,32571,701729206,169,165000000,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,"Adventure, Drama, Science Fiction",https://image.tmdb.org/t/p/w500/gEU2QniE6E77NI...,/content/drive/MyDrive/Colab Notebooks/Posters...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,155,The Dark Knight,8.512,30619,1004558444,152,185000000,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,"Drama, Action, Crime, Thriller",https://image.tmdb.org/t/p/w500/qJ2tW6WMUDux91...,/content/drive/MyDrive/Colab Notebooks/Posters...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,19995,Avatar,7.573,29815,2923706026,162,237000000,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,"Action, Adventure, Fantasy, Science Fiction",https://image.tmdb.org/t/p/w500/kyeqWdyUXW608q...,/content/drive/MyDrive/Colab Notebooks/Posters...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,24428,The Avengers,7.71,29166,1518815515,143,220000000,en,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,"Science Fiction, Action, Adventure",https://image.tmdb.org/t/p/w500/RYMX2wcKCBAr24...,/content/drive/MyDrive/Colab Notebooks/Posters...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [21]:
from textblob import TextBlob

# Compute sentiment polarity for movie overviews
movie_sample['overview_sentiment'] = movie_sample['overview'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_sample['overview_sentiment'] = movie_sample['overview'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)


In [22]:
from sklearn.preprocessing import MinMaxScaler

# Normalize movie popularity
scaler = MinMaxScaler()
movie_sample['popularity_normalized'] = scaler.fit_transform(movie_sample[['popularity']])

# Normalize Spotify popularity (if not already normalized)
spotify_sample['popularity_normalized'] = scaler.fit_transform(spotify_sample[['popularity']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_sample['popularity_normalized'] = scaler.fit_transform(movie_sample[['popularity']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_sample['popularity_normalized'] = scaler.fit_transform(spotify_sample[['popularity']])


In [23]:
# Normalize movie runtime
movie_sample['runtime_normalized'] = scaler.fit_transform(movie_sample[['runtime']])

# Normalize Spotify tempo
spotify_sample['tempo_normalized'] = scaler.fit_transform(spotify_sample[['tempo']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_sample['runtime_normalized'] = scaler.fit_transform(movie_sample[['runtime']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_sample['tempo_normalized'] = scaler.fit_transform(spotify_sample[['tempo']])


In [24]:
movie_features = ['vote_average', 'vote_count', 'revenue', 'runtime', 'popularity_normalized', 'runtime_normalized', 'overview_sentiment']
spotify_features = ['popularity_normalized', 'tempo_normalized', 'danceability', 'energy',
                    'loudness', 'valence', 'speechiness', 'acousticness', 'instrumentalness']

In [25]:
class EmbeddingSimilarityPipeline:
    def __init__(self, movie_features_to_normalize, spotify_features_to_normalize):
        self.movie_features_to_normalize = movie_features_to_normalize
        self.spotify_features_to_normalize = spotify_features_to_normalize
        self.scaler_movie = MinMaxScaler()
        self.scaler_spotify = MinMaxScaler()

    def normalize_features(self, data, features, scaler):
        """
        Normalize selected features.
        """
        return scaler.fit_transform(data[features])

    def create_movie_embeddings(self, movie_sample):
        """
        Combine normalized numeric features, poster embeddings, sentiment, and placeholder features for movies.
        """
        normalized_features = self.normalize_features(movie_sample, self.movie_features_to_normalize, self.scaler_movie)
        poster_features = np.array(movie_sample['poster_features'].tolist())
        sentiment_features = movie_sample[['overview_sentiment']].values

        # Add placeholder features to match Spotify's normalized features count
        placeholder_normalized = np.zeros((normalized_features.shape[0], 2))  # Adding 2 placeholders

        return np.hstack([normalized_features, placeholder_normalized, poster_features, sentiment_features])

    def create_spotify_embeddings(self, spotify_sample):
        """
        Combine normalized numeric features for Spotify songs and add placeholder features to match movies.
        """
        spotify_numeric = self.normalize_features(spotify_sample, self.spotify_features_to_normalize, self.scaler_spotify)
        placeholder_poster_features = np.zeros((spotify_numeric.shape[0], 1280))  # Match movie poster embedding size
        placeholder_sentiment = np.zeros((spotify_numeric.shape[0], 1))  # Match sentiment feature size

        return np.hstack([spotify_numeric, placeholder_poster_features, placeholder_sentiment])

    def calculate_similarity(self, song_embedding, movie_embeddings):
        """
        Compute similarity between a song embedding and all movie embeddings.
        """
        return cosine_similarity(song_embedding.reshape(1, -1), movie_embeddings)[0]

    def get_top_n_movies(self, song_embedding, movie_embeddings, movie_sample, spotify_sample, song_index, n=5):
        """
        Retrieve the top N most similar movies for a given song embedding and include song features.
        """
        similarities = self.calculate_similarity(song_embedding, movie_embeddings)
        top_indices = np.argsort(similarities)[-n:][::-1]
        top_scores = similarities[top_indices]

        # Extract movie details
        top_movies = movie_sample.iloc[top_indices].copy()
        top_movies['similarity_score'] = top_scores

        # Add song details
        song_details = spotify_sample.iloc[song_index].to_dict()  # Get details of the selected song
        for key, value in song_details.items():
            top_movies[f"song_{key}"] = value  # Prefix song features with "song_"

        return top_movies

    def run(self, movie_sample, spotify_sample, song_index=0, top_n=5):
        """
        Execute the pipeline:
        - Create embeddings for movies and Spotify songs.
        - Compute similarities and retrieve top N results.
        """
        print("Creating movie embeddings...")
        movie_embeddings = self.create_movie_embeddings(movie_sample)

        print("Creating Spotify embeddings...")
        spotify_embeddings = self.create_spotify_embeddings(spotify_sample)

        print(f"Calculating similarities for song at index {song_index}...")
        song_embedding = spotify_embeddings[song_index]
        top_movies = self.get_top_n_movies(song_embedding, movie_embeddings, movie_sample, spotify_sample, song_index, top_n)

        return top_movies

In [26]:
# Import necessary modules
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the pipeline
pipeline = EmbeddingSimilarityPipeline(movie_features, spotify_features)

In [27]:
spotify_sample['track_name'].unique()

array(['Comedy', 'Ghost - Acoustic', 'To Begin Again',
       "Can't Help Falling In Love", 'Hold On', 'Days I Will Remember',
       'Say Something', "I'm Yours", 'Lucky', 'Hunger',
       'Give Me Your Forever', "I Won't Give Up", 'Solo', 'Bad Liar',
       'Hold On - Remix', 'Falling in Love at a Coffee Shop',
       'ily (i love you baby)', 'At My Worst', 'Photograph', 'Demons',
       '93 Million Miles', 'Unlonely', 'Bella Luna', 'Winter Wonderland',
       'If It Kills Me', 'All I Want For Christmas Is A Real Good Tan',
       'Party of One', 'Lonely This Christmas', 'Throwing Good After Bad',
       'This Time Tomorrow', 'The Haves', "When You're Wrong",
       'You and Me on the Rock',
       'Speak Your Mind (From the Netflix Series "We The People")'],
      dtype=object)

In [28]:
def get_recommendations_for_song(pipeline, movie_sample, spotify_sample, song_name, top_n=5):
    """
    Retrieve the top N movie recommendations for a given song name from spotify_sample.
    """
    # Find the index of the song in spotify_sample
    try:
        song_index = spotify_sample[spotify_sample['track_name'] == song_name].index[0]
    except IndexError:
        print(f"Song '{song_name}' not found in the Spotify sample.")
        return None

    # Get top N movies for the given song
    top_movies = pipeline.run(movie_sample, spotify_sample, song_index=song_index, top_n=top_n)

    return top_movies

In [29]:
# Example usage
song_name = "Days I Will Remember"  # Replace with your desired song
top_5_movies_for_song = get_recommendations_for_song(pipeline, movie_sample, spotify_sample, song_name)

# Display the top 5 movies
if top_5_movies_for_song is not None:
    top_5_movies_for_song = top_5_movies_for_song.rename(columns={'title': 'Recommended Movie'})
    top_5_movies_for_song = top_5_movies_for_song.rename(columns={'song_album_name': 'Song Album Name'})
    top_5_movies_for_song = top_5_movies_for_song.rename(columns={'song_artists': 'Song Artist Name'})
    print(f"Top 5 movies for the song '{song_name}':")
    print(top_5_movies_for_song[['Recommended Movie', 'similarity_score','Song Album Name','Song Artist Name']])

Creating movie embeddings...
Creating Spotify embeddings...
Calculating similarities for song at index 5...
Top 5 movies for the song 'Days I Will Remember':
                               Recommended Movie  similarity_score  \
71             The Hobbit: An Unexpected Journey          0.934949   
57                       Spider-Man: No Way Home          0.925735   
26                                    Iron Man 3          0.923180   
83                        The Amazing Spider-Man          0.922232   
48  Harry Potter and the Deathly Hallows: Part 2          0.917858   

         Song Album Name Song Artist Name  
71  Days I Will Remember     Tyrone Wells  
57  Days I Will Remember     Tyrone Wells  
26  Days I Will Remember     Tyrone Wells  
83  Days I Will Remember     Tyrone Wells  
48  Days I Will Remember     Tyrone Wells  


In [30]:
def get_recommendations_for_all_songs(pipeline, movie_sample, spotify_sample, top_n=5):
    """
    Generate a DataFrame with recommendations for all songs in the Spotify dataset.
    """
    all_recommendations = []  # List to store recommendations for all songs

    print("Creating movie embeddings...")
    movie_embeddings = pipeline.create_movie_embeddings(movie_sample)

    print("Creating Spotify embeddings...")
    spotify_embeddings = pipeline.create_spotify_embeddings(spotify_sample)

    #iterate over each song in the Spotify dataset
    for song_index in range(len(spotify_sample)):
        print(f"Calculating similarities for song at index {song_index}...")

        #get the embedding for the current song
        song_embedding = spotify_embeddings[song_index]

        #calculate similarities
        similarities = pipeline.calculate_similarity(song_embedding, movie_embeddings)

        #get top N movie recommendations
        top_indices = np.argsort(similarities)[-top_n:][::-1]  # Top N indices with highest similarity scores
        top_scores = similarities[top_indices]

        #create a DataFrame for the current song's recommendations
        song_recommendations = movie_sample.iloc[top_indices].copy()
        song_recommendations['similarity_score'] = top_scores

        #add song details to the DataFrame
        song_details = spotify_sample.iloc[song_index].to_dict()  # Get song details
        for key, value in song_details.items():
            song_recommendations[f"song_{key}"] = value  # Prefix song details with "song_"

        #append to the list
        all_recommendations.append(song_recommendations)

    #concatenate all recommendations into a single DataFrame
    final_recommendations_df = pd.concat(all_recommendations, ignore_index=True)
    return final_recommendations_df

#example usage
recommendations_df = get_recommendations_for_all_songs(pipeline, movie_sample, spotify_sample, top_n=5)

Creating movie embeddings...
Creating Spotify embeddings...
Calculating similarities for song at index 0...
Calculating similarities for song at index 1...
Calculating similarities for song at index 2...
Calculating similarities for song at index 3...
Calculating similarities for song at index 4...
Calculating similarities for song at index 5...
Calculating similarities for song at index 6...
Calculating similarities for song at index 7...
Calculating similarities for song at index 8...
Calculating similarities for song at index 9...
Calculating similarities for song at index 10...
Calculating similarities for song at index 11...
Calculating similarities for song at index 12...
Calculating similarities for song at index 13...
Calculating similarities for song at index 14...
Calculating similarities for song at index 15...
Calculating similarities for song at index 16...
Calculating similarities for song at index 17...
Calculating similarities for song at index 18...
Calculating similar

In [31]:
recommendations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 43 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          250 non-null    int64  
 1   title                       250 non-null    object 
 2   vote_average                250 non-null    float64
 3   vote_count                  250 non-null    int64  
 4   revenue                     250 non-null    int64  
 5   runtime                     250 non-null    int64  
 6   budget                      250 non-null    int64  
 7   original_language           250 non-null    object 
 8   original_title              250 non-null    object 
 9   overview                    250 non-null    object 
 10  popularity                  250 non-null    float64
 11  poster_path                 250 non-null    object 
 12  genres                      250 non-null    object 
 13  poster_url                  250 non

In [33]:
# Save to a specific folder in your Google Drive
file_path = '/content/drive/My Drive/all_recommendations.csv'
recommendations_df.to_csv(file_path, index=False)

print(f"File saved to {file_path}")

File saved to /content/drive/My Drive/all_recommendations.csv
