In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
from tensorflow.keras.models import Model

In [None]:
#  Import and read the movie data
movies_df = pd.read_csv("https://www.dropbox.com/s/q8yp2pkdsjazbph/movies_data.csv?dl=1", index_col='id')
movies_df.head()

Unnamed: 0_level_0,title,genres,original_language,overview,popularity,production_companies,release_date,runtime,vote_average,vote_count,credits,keywords,poster_path,recommendations
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
76600,Avatar: The Way of Water,Science Fiction-Adventure-Action,en,Set more than a decade after the events of the...,9366.788,20th Century Studios-Lightstorm Entertainment,2022-12-14,192.0,7.751,6748.0,Sam Worthington-Zoe Saldaña-Sigourney Weaver-S...,loss of loved one-dying and death-alien life-f...,/t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,183392-111332-702432-505642-1064215-436270-874...
758323,The Pope's Exorcist,Horror-Mystery-Thriller,en,Father Gabriele Amorth Chief Exorcist of the V...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,103.0,7.433,545.0,Russell Crowe-Daniel Zovatto-Alex Essoe-Franco...,spain-rome italy-vatican-pope-pig-possession-c...,/9JBEPLTPSm0d1mbEcLxULjJq9Eh.jpg,713704-296271-502356-1076605-1084225-1008005-9...
594767,Shazam! Fury of the Gods,Action-Comedy-Fantasy,en,Billy Batson and his foster siblings who trans...,5759.074,New Line Cinema-The Safran Company-DC Films-Wa...,2023-03-15,130.0,6.84,1355.0,Zachary Levi-Asher Angel-Jack Dylan Grazer-Ada...,superhero-end of the world-super power-aftercr...,/2VK4d3mqqTc7LVZLnLPeRiPaJ71.jpg,700391-994751-948713-640146-502356-938992-7660...
640146,Ant-Man and the Wasp: Quantumania,Action-Adventure-Science Fiction,en,Super-Hero partners Scott Lang and Hope van Dy...,4704.903,Marvel Studios-Kevin Feige Productions,2023-02-15,125.0,6.448,1547.0,Paul Rudd-Evangeline Lilly-Jonathan Majors-Kat...,hero-ant-sequel-superhero-based on comic-famil...,/ngl2FKBlU4fhbdsrtdom9LVLBXw.jpg,965839-734048-267805-1035806-823999-842942-772...
677179,Creed III,Drama-Action,en,After dominating the boxing world Adonis Creed...,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,116.0,7.262,1129.0,Michael B. Jordan-Tessa Thompson-Jonathan Majo...,philadelphia pennsylvania-husband wife relatio...,/cvsXj3I9Q2iyyIo95AecSd1tad7.jpg,965839-267805-943822-842942-1035806-823999-107...


Data Pre-Processing

In [35]:
from pandas.core.reshape.merge import string
# convert types
movies_df['title'] =  movies_df['title'].apply(str)
movies_df['overview'] =  movies_df['overview'].apply(str)
movies_df['release_date'] =  movies_df['release_date'].astype('datetime64')

In [36]:
for col in ['keywords', 'genres', 'credits']:
    for val in ['-']:
        movies_df[col]=movies_df[col].str.replace(val,',')
    movies_df[col]=movies_df[col].astype(str)

In [50]:
# Function to get a list of unique values out of the column with multiple values, separated by comma
def get_uniques(data,col):
    '''
    data: Dataframe object
    col: column name with comma seperated values
    ---
    returns: a list of unique category values in that column
    '''
    out=set([val.strip().lower() for val in ','.join(data[col].unique()).split('-')])
    try:
        out.remove('')
    except:
        return list(out)
    return list(out)

In [67]:
# List of unique genres
genres = get_uniques(movies_df,'genres')
keywords = get_uniques(movies_df,'keywords') # list of unique keywords
credits = get_uniques(movies_df,'credits') # list of unique credits (actors)
print(genres)

['science fiction,adventure,action,horror,mystery,thriller,action,comedy,fantasy,action,adventure,science fiction,drama,action,action,adventure,thriller,animation,adventure,comedy,family,horror,thriller,action,thriller,crime,comedy,drama,drama,horror,mystery,adventure,fantasy,comedy,war,drama,history,action,romance,action,comedy,crime,action,science fiction,adventure,thriller,action,action,romance,comedy,science fiction,horror,comedy,thriller,comedy,comedy,drama,romance,crime,thriller,drama,action,action,fantasy,science fiction,science fiction,action,adventure,drama,comedy,war,history,drama,fantasy,family,comedy,adventure,family,comedy,fantasy,action,animation,comedy,fantasy,horror,war,fantasy,action,adventure,thriller,horror,thriller,science fiction,thriller,horror,mystery,crime,horror,horror,animation,action,adventure,fantasy,thriller,comedy,crime,mystery,crime,drama,thriller,war,action,drama,action,comedy,crime,thriller,action,drama,history,drama,comedy,fantasy,action,comedy,action,

In [68]:
def extract_unique_elements(data_list):
    # Combine all strings in the list into a single string
    combined_string = ','.join(data_list)

    # Split the combined string by comma (',') to separate the elements
    elements = [element.strip() for element in combined_string.split(',')]

    # Remove duplicates by converting to set and back to list
    unique_elements = list(set(elements))

    return unique_elements

In [65]:
extract_unique_elements(credits)

['baptiste fonck',
 'diane bellego',
 'kaali venkat',
 'tarina pouncy',
 'michael chacon',
 'john leisenring',
 'mark david fritsche',
 'bhawani sankar',
 'sammi "sweetheart" giancola',
 'settimo castagna',
 'erich finsches',
 'lizzie hopley',
 'louis paquette',
 'larry raben',
 'pawel godziak',
 'sergio díaz',
 'zhang',
 'carolyn sadowska',
 'violeta berrios',
 'izabella telezynska',
 'samuel mason paul',
 'ron orbach',
 'petri hiltunen',
 'freddie duke',
 'kevin rahm',
 'marc grapey',
 'michael fawcett',
 'j. michael jaynes',
 'markku huhtamo',
 'melany ochoa',
 'tyler nelson',
 'shaun noble',
 'lucas',
 'jeff shannon',
 'nina campana',
 'riccardo miniggio',
 'william murphy',
 'maki besea',
 'patrick horgan',
 'tedra millan',
 'kendra hesketh',
 'max white',
 'dani levine',
 'shobha mohan',
 'suzanne cryer',
 'adina kelly',
 'rhea sharma',
 'suzanne raffaelli',
 'g. m. sundar',
 'sule nayir',
 'jerry todisco',
 'noémie moncel',
 'maisy mazer',
 'antonio garcía',
 'cristina iosani',


In [69]:
extract_unique_elements(genres)

['science fiction',
 'thriller',
 'drama',
 'adventure',
 'war',
 'crime',
 'action',
 'documentary',
 'animation',
 'romance',
 'tv movie',
 'horror',
 'western',
 'fantasy',
 'mystery',
 'comedy',
 'history',
 'music',
 'family']

In [70]:
extract_unique_elements(keywords)

['narcism',
 'hate relationship',
 'osha',
 'narration',
 'teacher hero',
 'syndrom',
 'fear of intimacy',
 'francisco de goya',
 'renaissance painting',
 'tollywood',
 'edwardian',
 'voice recognition',
 'christmas magic',
 'police custody',
 'exorcist',
 'sexual experimentation',
 'pilgrim',
 'river of slime',
 'usa politics',
 'tug of war',
 'studio',
 'ukrainian war',
 'drowned child',
 'documentary',
 'vicious',
 'ranch',
 'female psychopath',
 'securities stocks and bonds',
 'dismemberment of human beings',
 'san andreas california',
 'body torn apart',
 'timeline',
 'sleeping around',
 'nonviolent resistance',
 'singing around campfire',
 'female serial killer',
 'insurgent',
 'melt',
 'appeal',
 'social media influencer',
 'fraternity house',
 'ancient civilization',
 'holiday movie',
 'stone age',
 'vacation home',
 'senegalese',
 'neat freak',
 'baby king',
 'madison square garden',
 'baseball player',
 'roof',
 'oklahoma city bombing',
 'vespa',
 'hood',
 'artifact',
 'nudis

Feature Engeneering. Create popularity_score feature using vote_average and vote_count

In [40]:
def calculate_popularity(row):
    vote_count = row['vote_count']
    vote_average = row['vote_average']
    # Example formula: Popularity score = (vote_average * vote_count) / (vote_count + m)
    m = movies_df['vote_count'].quantile(0.90)  # Set the threshold based on your preference
    popularity_score = (vote_average * vote_count) / (vote_count + m)
    return popularity_score

In [41]:
# Apply the calculate_popularity function to create the new feature
movies_df['popularity_score'] = movies_df.apply(calculate_popularity, axis=1)

In [42]:
# Print the first few rows of the dataframe with the new feature
movies_df.head()

Unnamed: 0_level_0,title,genres,original_language,overview,popularity,production_companies,release_date,runtime,vote_average,vote_count,credits,keywords,poster_path,recommendations,popularity_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
76600,Avatar: The Way of Water,"Science Fiction,Adventure,Action",en,Set more than a decade after the events of the...,9366.788,20th Century Studios-Lightstorm Entertainment,2022-12-14,192.0,7.751,6748.0,"Sam Worthington,Zoe Saldaña,Sigourney Weaver,S...","loss of loved one,dying and death,alien life,f...",/t6HIqrRAclMCA60NsSmeqe9RmNV.jpg,183392-111332-702432-505642-1064215-436270-874...,6.134474
758323,The Pope's Exorcist,"Horror,Mystery,Thriller",en,Father Gabriele Amorth Chief Exorcist of the V...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,103.0,7.433,545.0,"Russell Crowe,Daniel Zovatto,Alex Essoe,Franco...","spain,rome italy,vatican,pope,pig,possession,c...",/9JBEPLTPSm0d1mbEcLxULjJq9Eh.jpg,713704-296271-502356-1076605-1084225-1008005-9...,1.743709
594767,Shazam! Fury of the Gods,"Action,Comedy,Fantasy",en,Billy Batson and his foster siblings who trans...,5759.074,New Line Cinema-The Safran Company-DC Films-Wa...,2023-03-15,130.0,6.84,1355.0,"Zachary Levi,Asher Angel,Jack Dylan Grazer,Ada...","superhero,end of the world,super power,aftercr...",/2VK4d3mqqTc7LVZLnLPeRiPaJ71.jpg,700391-994751-948713-640146-502356-938992-7660...,2.958062
640146,Ant-Man and the Wasp: Quantumania,"Action,Adventure,Science Fiction",en,Super-Hero partners Scott Lang and Hope van Dy...,4704.903,Marvel Studios-Kevin Feige Productions,2023-02-15,125.0,6.448,1547.0,"Paul Rudd,Evangeline Lilly,Jonathan Majors,Kat...","hero,ant,sequel,superhero,based on comic,famil...",/ngl2FKBlU4fhbdsrtdom9LVLBXw.jpg,965839-734048-267805-1035806-823999-842942-772...,2.999836
677179,Creed III,"Drama,Action",en,After dominating the boxing world Adonis Creed...,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,2023-03-01,116.0,7.262,1129.0,"Michael B. Jordan,Tessa Thompson,Jonathan Majo...","philadelphia pennsylvania,husband wife relatio...",/cvsXj3I9Q2iyyIo95AecSd1tad7.jpg,965839-267805-943822-842942-1035806-823999-107...,2.82017


In [43]:
from sklearn.preprocessing import OneHotEncoder

In [44]:
# Create an instance of the OneHotEncoder
encoder = OneHotEncoder(sparse=False)

In [71]:
print(unique_genres)

['science fiction', 'thriller', 'drama', 'adventure', 'war', 'crime', 'action', 'documentary', 'animation', 'romance', 'tv movie', 'horror', 'western', 'fantasy', 'mystery', 'comedy', 'history', 'music', 'family']


In [72]:
# Reshape the genres data to a 2D array
unique_genres = np.array(unique_genres).reshape(-1, 1)

In [73]:
# Perform one-hot encoding
genres_encoded = encoder.fit_transform(unique_genres)

print(genres_encoded)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.

