In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from load_movie_data import *
from search_movie_data import *
%matplotlib inline

In [2]:
movies = load_movie_data()

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9556 entries, 0 to 9568
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             9556 non-null   int64  
 1   title               9556 non-null   object 
 2   genres              9556 non-null   object 
 3   Sci-Fi              9556 non-null   int32  
 4   Action              9556 non-null   int32  
 5   Fantasy             9556 non-null   int32  
 6   Children            9556 non-null   int32  
 7   Horror              9556 non-null   int32  
 8   (no genres listed)  9556 non-null   int32  
 9   Documentary         9556 non-null   int32  
 10  Mystery             9556 non-null   int32  
 11  Animation           9556 non-null   int32  
 12  War                 9556 non-null   int32  
 13  Adventure           9556 non-null   int32  
 14  Drama               9556 non-null   int32  
 15  Western             9556 non-null   int32  
 16  Musica

# Handle genome tags

The larger version of this dataset contains scores for how well each of 1128 tags applies to each movie. This may be useful for a content-based recommender, but the data need to be reformatted in order to be useful.

The file `genome-scores.csv` has a table of each movie's relevance score for each tag (shape: (num_tags * num_movies, 3)):

| movieId | tagId | relevance |
|---------|-------|-----------|
|       1 |     1 |     0.029 |
|       1 |     2 |   0.02375 |
|       1 |     3 |   0.05425 |
|       1 |     4 |   0.06875 |
|     ... |   ... |       ... |

The file `genome-tags.csv` maps tagId onto the content of the tag (shape: (num_tags, 2)):

| tagId |          tag |
|-------|--------------|
|     1 |          007 |
|     2 | 007 (series) |
|     3 | 18th century |
|     4 |        1920s |
|   ... |          ... |

A potentially useful format for this data would combine the tables as follows (shape: (num_movies, num_tags)):

| movieId |   007 | 007 (series) | 18th century |   1920s | ... |
|---------|-------|--------------|--------------|---------|-----|
|       1 | 0.029 |      0.02375 |      0.05425 | 0.06875 | ... |
|       2 |   ... |          ... |          ... |     ... | ... |
|       3 |   ... |          ... |          ... |     ... | ... |


In [4]:
def load_genome_tags(data_dir="C:\\Users\\nicol\\Google Drive\\Datasets\\MovieLens-Large\\"):
    """
    Load and format genome score and genome tag data.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing `genome-scores.csv` and `genome-tags.csv`.
        The default is the original author's local Windows folder; pass your
        own location to run the notebook on another machine.

    Returns
    -------
    pandas.DataFrame
        One row per `movieId` (the index) and one column per tag, named by the
        tag text; each cell holds that movie's relevance score for that tag.
    """
    # Local import so this cell does not depend on an edit to the imports cell
    from pathlib import Path

    data_dir = Path(data_dir)

    # Load csv files with genome scores and tag IDs
    genome_scores = pd.read_csv(data_dir / "genome-scores.csv")
    genome_tags = pd.read_csv(data_dir / "genome-tags.csv")

    # Pivot table to have each tag be one column and each movie be one row
    genome_tag_df = genome_scores.pivot(index='movieId', columns='tagId', values='relevance')

    # Name each column by looking its tagId up in the tag table. Assigning the
    # tag names positionally would mislabel columns whenever the row order of
    # genome-tags.csv differs from the sorted tagId order produced by the pivot.
    tag_by_id = genome_tags.set_index('tagId')['tag']
    genome_tag_df.columns = tag_by_id.reindex(genome_tag_df.columns).values

    return genome_tag_df

In [5]:
genome_tag_df = load_genome_tags()
genome_tag_df.head()

Unnamed: 0_level_0,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.029,0.02375,0.05425,0.06875,0.16,0.19525,0.076,0.252,0.2275,0.024,...,0.03775,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225
2,0.03625,0.03625,0.08275,0.08175,0.102,0.069,0.05775,0.101,0.08225,0.0525,...,0.04775,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185
3,0.0415,0.0495,0.03,0.09525,0.04525,0.05925,0.04,0.1415,0.04075,0.032,...,0.058,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175
4,0.0335,0.03675,0.04275,0.02625,0.0525,0.03025,0.02425,0.07475,0.0375,0.024,...,0.049,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015
5,0.0405,0.05175,0.036,0.04625,0.055,0.08,0.0215,0.07375,0.02825,0.02375,...,0.05375,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575


Now each movie's relevance score for each tag is readily accessible, but the genome data cover more movies than the `movies` dataframe contains.

In [6]:
print(f"There are {movies.shape[0]} movies in the 'movies' dataframe")
print(f"There are {genome_tag_df.shape[0]} movies in the 'genome_tag_df' dataframe")

There are 9556 movies in the 'movies' dataframe
There are 13176 movies in the 'genome_tag_df' dataframe


In [7]:
# Left join: every movie in `movies` keeps a row, with NaN scores where the
# genome data has no entry for that movieId.
movie_id_title = movies.loc[:, ['movieId', 'title']]
movie_genome_tags = pd.merge(
    movie_id_title,
    genome_tag_df.reset_index(),
    on='movieId',
    how='left',
)

In [8]:
movie_genome_tags

Unnamed: 0,movieId,title,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,1,Toy Story,0.02900,0.02375,0.05425,0.06875,0.16000,0.19525,0.07600,0.25200,...,0.03775,0.02250,0.04075,0.03175,0.12950,0.04550,0.02000,0.03850,0.09125,0.02225
1,2,Jumanji,0.03625,0.03625,0.08275,0.08175,0.10200,0.06900,0.05775,0.10100,...,0.04775,0.02050,0.01650,0.02450,0.13050,0.02700,0.01825,0.01225,0.09925,0.01850
2,3,Grumpier Old Men,0.04150,0.04950,0.03000,0.09525,0.04525,0.05925,0.04000,0.14150,...,0.05800,0.02375,0.03550,0.02125,0.12775,0.03250,0.01625,0.02125,0.09525,0.01750
3,4,Waiting to Exhale,0.03350,0.03675,0.04275,0.02625,0.05250,0.03025,0.02425,0.07475,...,0.04900,0.03275,0.02125,0.03675,0.15925,0.05225,0.01500,0.01600,0.09175,0.01500
4,5,Father of the Bride Part II,0.04050,0.05175,0.03600,0.04625,0.05500,0.08000,0.02150,0.07375,...,0.05375,0.02625,0.02050,0.02125,0.17725,0.02050,0.01500,0.01550,0.08875,0.01575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9551,189713,BlacKkKlansman,,,,,,,,,...,,,,,,,,,,
9552,190183,The Darkest Minds,,,,,,,,,...,,,,,,,,,,
9553,190209,Jeff Ross Roasts the Border,,,,,,,,,...,,,,,,,,,,
9554,190213,John From,,,,,,,,,...,,,,,,,,,,


# Get most relevant tags for each movie

In [74]:
# Names of all genome tag columns. The tag helpers and the soup-building loop
# in this notebook use it to select only the relevance-score columns.
tag_names = genome_tag_df.columns.values

def get_most_relevant_tags(df, title, num=10):
    """
    Return the `num` highest-scoring genome tags for one movie.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'title' column and one relevance column per entry of the
        notebook-level `tag_names`.
    title : str
        Exact title to look up.
    num : int, optional
        Number of tags to return (default 10).

    Returns
    -------
    pandas.Series or None
        Relevance scores indexed by tag name, highest first. Returns None
        (after printing a message) when the title matches no row or more than
        one row.
    """
    # Slice row(s) with movie
    movie = df.loc[df['title'] == title]

    # Check the match count up front instead of relying on a TypeError that
    # sort_values() happens to raise when the selection is not a single row.
    if movie.shape[0] == 0:
        print('Error: I do not recognize that title')
        return
    if movie.shape[0] > 1:
        print('Error: More than one movie with that title')
        return

    # .iloc[0] always yields a Series, even with a single tag column
    # (where .squeeze() would collapse to a scalar).
    scores = movie[tag_names].iloc[0].sort_values(ascending=False)
    return scores.iloc[:num]

In [76]:
def get_relevant_tag_soup(df, title, threshold=0.75):
    """
    Build a space-separated string of the genome tags that apply to one movie.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'title' column and one relevance column per entry of the
        notebook-level `tag_names`.
    title : str
        Exact title to look up.
    threshold : float, optional
        Minimum relevance score for a tag to be included (default 0.75).

    Returns
    -------
    str or None
        Qualifying tags, most relevant first, each passed through
        `process_tag` (defined outside this cell) and joined with spaces.
        Empty string if no tag reaches the threshold. Returns None (after
        printing a message) when the title matches no row or more than one.
    """
    # Slice row(s) with movie
    movie = df.loc[df['title'] == title]

    # Check the match count up front instead of relying on a TypeError that
    # sort_values() happens to raise when the selection is not a single row.
    if movie.shape[0] == 0:
        print('Error: I do not recognize that title')
        return
    if movie.shape[0] > 1:
        print('Error: More than one movie with that title')
        return

    # .iloc[0] always yields a Series, even with a single tag column
    scores = movie[tag_names].iloc[0].sort_values(ascending=False)
    relevant_tags = scores[scores >= threshold].index

    return ' '.join(process_tag(tag) for tag in relevant_tags)

In [77]:
# NOTE(review): 'FindingNemo' has no space, so this call hits the
# unrecognized-title branch; other outputs in this notebook list the title as
# 'Finding Nemo'. Confirm whether the error path is the intended demonstration.
get_most_relevant_tags(movie_genome_tags, 'FindingNemo')

Error: I do not recognize that title


In [79]:
# NOTE(review): 'FindingNemo' has no space, so this call hits the
# unrecognized-title branch; other outputs in this notebook list the title as
# 'Finding Nemo'. Confirm whether the error path is the intended demonstration.
get_relevant_tag_soup(movie_genome_tags, 'FindingNemo')

Error: I do not recognize that title


Add tag soup to movies dataframe.

In [18]:
# Tags scoring at least this relevance are treated as describing the movie
RELEVANCE_THRESHOLD = 0.75

soup = []

# `i` is a row position, so use .iloc: .loc[i] would look up the *label* i and
# only works while the frame happens to carry a default 0..n-1 index.
for i in range(movie_genome_tags.shape[0]):
    tag_scores = movie_genome_tags.iloc[i][tag_names].sort_values(ascending=False)
    relevant_tags = list(tag_scores[tag_scores >= RELEVANCE_THRESHOLD].index)
    tag_list = [process_tag(tag) for tag in relevant_tags]
    relevant_tag_soup = ' '.join(tag_list)
    soup.append(relevant_tag_soup)

In [19]:
soup[:3]

['toys computeranimation pixaranimation animation kidsandfamily kids pixar cartoon animated children friendship imdbtop250 story adventure childhood greatmovie light original unlikelyfriendships disneyanimatedfeature nostalgic good fun disney family cute heartwarming great cgi funmovie classic oscarbestanimatedfeature clever oscarwinner originalplot',
 'adventure children fantasy kids jungle specialeffects animals fantasyworld family funmovie lions childhood videogame bigbudget fun',
 'sequel goodsequel sequels comedy original']

In [20]:
len(soup)

9556

In [21]:
movie_genome_tags['relevant_tag_soup'] = soup
movie_genome_tags.head()

Unnamed: 0,movieId,title,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,...,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies,relevant_tag_soup
0,1,Toy Story,0.029,0.02375,0.05425,0.06875,0.16,0.19525,0.076,0.252,...,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225,toys computeranimation pixaranimation animatio...
1,2,Jumanji,0.03625,0.03625,0.08275,0.08175,0.102,0.069,0.05775,0.101,...,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185,adventure children fantasy kids jungle special...
2,3,Grumpier Old Men,0.0415,0.0495,0.03,0.09525,0.04525,0.05925,0.04,0.1415,...,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175,sequel goodsequel sequels comedy original
3,4,Waiting to Exhale,0.0335,0.03675,0.04275,0.02625,0.0525,0.03025,0.02425,0.07475,...,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015,women chickflick girliemovie romantic
4,5,Father of the Bride Part II,0.0405,0.05175,0.036,0.04625,0.055,0.08,0.0215,0.07375,...,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575,goodsequel sequel sequels pregnancy fatherdaug...


In [22]:
# Drop any `relevant_tag_soup` column left by an earlier run of this cell, so
# re-running it does not create `relevant_tag_soup_x` / `relevant_tag_soup_y`
# duplicates (the cell overwrites `movies` with a merge of itself).
movies = (
    movies
    .drop(columns=['relevant_tag_soup'], errors='ignore')
    .merge(movie_genome_tags[['movieId', 'relevant_tag_soup']],
           how='inner', on='movieId')
)
movies.head()

Unnamed: 0,movieId,title,genres,Sci-Fi,Action,Fantasy,Children,Horror,(no genres listed),Documentary,...,Crime,Film-Noir,Comedy,IMAX,mean_rating,num_ratings,weighted_rating,tag_soup,year,relevant_tag_soup
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,0,0,0,...,0,0,1,0,0.77733,68469,0.777122,animated buddymovie cartoon cgi comedy compute...,1995,toys computeranimation pixaranimation animatio...
1,2,Jumanji,Adventure|Children|Fantasy,0,0,1,1,0,0,0,...,0,0,0,0,0.649317,27143,0.649732,fantasy adaptedfrombook animals badcgi basedon...,1995,adventure children fantasy kids jungle special...
2,3,Grumpier Old Men,Comedy|Romance,0,0,0,0,0,0,0,...,0,0,1,0,0.634796,15585,0.6357,moldy old annmargaret burgessmeredith darylhan...,1995,sequel goodsequel sequels comedy original
3,4,Waiting to Exhale,Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,1,0,0.574908,2989,0.583135,characters girlmovie characters chickflick bas...,1995,women chickflick girliemovie romantic
4,5,Father of the Bride Part II,Comedy,0,0,0,0,0,0,0,...,0,0,1,0,0.615458,15474,0.616615,stevemartin stevemartin pregnancy remake aging...,1995,goodsequel sequel sequels pregnancy fatherdaug...


# Compute cosine similarity between movie tags

In [23]:
# Lookup from movie title to the row label in `movies`. `get_similar_title`
# uses the value both to index the similarity matrix and with `.iloc`, so it
# assumes the index is the default 0..n-1 — NOTE(review): confirm this holds.
# A title that occurs more than once returns several entries, not one integer.
movie_indices = pd.Series(movies.index, index=movies['title'])

In [24]:
movie_indices['Finding Dory']

9130

In [25]:
def compute_cosine_similarity(df, var='tag_soup'):
    """
    Build a pairwise cosine-similarity matrix from one text column.

    Each entry of `df[var]` is turned into a vector of token counts (English
    stop words removed), and every row is compared with every other row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    var : str, optional
        Name of the text column to vectorize (default 'tag_soup').

    Returns
    -------
    The value returned by `cosine_similarity` for the count matrix against
    itself: one similarity score per pair of rows of `df`.
    """
    vectorizer = CountVectorizer(stop_words='english')
    term_counts = vectorizer.fit_transform(df[var])
    return cosine_similarity(term_counts, term_counts)

In [26]:
def get_similar_title(title, cosine_sim, num=10):
    """
    Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up in the notebook-level `movie_indices` Series; a title
        that is not present raises KeyError from that lookup.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column order matches `movies`.
    num : int, optional
        Number of similar movies to return (default 10).

    Returns
    -------
    pandas.Series
        Titles from `movies['title']`, most similar first, never including the
        query movie itself.
    """
    idx = movie_indices[title]
    # A title shared by several movies yields a Series of positions; use the
    # first occurrence rather than failing later on an array-valued index.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank all movies by their similarity to the query movie. sorted() is
    # stable, so tied scores keep their original row order.
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie by position. Skipping the first ranked entry is
    # wrong when another movie ties with it (e.g. identical tags), because the
    # query may then not be ranked first.
    mov_ind = [i for i, _ in sim_scores if i != idx][:num]

    return movies['title'].iloc[mov_ind]

Compute cosine similarity between movies based on user-assigned tags.

In [27]:
cosine_sim_user_tags = compute_cosine_similarity(movies)
get_similar_title('Finding Dory', cosine_sim_user_tags)

4317           Finding Nemo
1739          Bug's Life, A
0                 Toy Story
7291            Toy Story 3
2334            Toy Story 2
8114    Monsters University
3533         Monsters, Inc.
3709                Ice Age
6154                   Cars
5323       Incredibles, The
Name: title, dtype: object

Compute cosine similarity between movies based on genome tag relevance scores.

In [28]:
cosine_sim_genome_tags = compute_cosine_similarity(movies, 'relevant_tag_soup')
get_similar_title('Finding Dory', cosine_sim_genome_tags)

4317         Finding Nemo
5990       Chicken Little
3865      Stuart Little 2
8823    The Good Dinosaur
2334          Toy Story 2
6840                 Bolt
1739        Bug's Life, A
2358        Stuart Little
4528      Beethoven's 2nd
6202       Ant Bully, The
Name: title, dtype: object

In [31]:
# TF-IDF weighting of the user-assigned tag soup, as an alternative to the raw
# token counts used by compute_cosine_similarity.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['tag_soup'])
# linear_kernel is a plain dot product between rows; it equals cosine
# similarity only when rows are unit-length, which is TfidfVectorizer's
# documented default (norm='l2').
cosine_sim_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix)

In [34]:
# Vocabulary size. `get_feature_names()` was removed in scikit-learn 1.2;
# the fitted `vocabulary_` mapping (term -> column index) has one entry per
# feature and is available in every version.
len(tfidf.vocabulary_)

48035

In [32]:
# Sample of feature names in column order. `get_feature_names()` was removed
# in scikit-learn 1.2; ordering the fitted `vocabulary_` terms by their column
# index gives the same list on every version.
sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)[5000:5010]

['bettywhite',
 'bettywhitetasticbuttherestisformula',
 'betweenfantasticrealistic',
 'betweenlifeanddeath',
 'betweentwopillows',
 'beulahbondi',
 'beverleyelliott',
 'beverlyanderson',
 'beverlydangelo',
 'beverlyhills']

In [41]:
get_similar_title('Notting Hill', cosine_sim_tfidf)

311                           Four Weddings and a Funeral
3129                                Bridget Jones's Diary
3794                                          About a Boy
401     Englishman Who Went Up a Hill But Came Down a ...
4597                                        Love Actually
507                                          Pretty Woman
156                                           Nine Months
6357                                     Music and Lyrics
4102                                     Two Weeks Notice
5329                    Bridget Jones: The Edge of Reason
Name: title, dtype: object

In [42]:
get_similar_title('Notting Hill', cosine_sim_user_tags)

3129                                Bridget Jones's Diary
311                           Four Weddings and a Funeral
507                                          Pretty Woman
4597                                        Love Actually
3794                                          About a Boy
401     Englishman Who Went Up a Hill But Came Down a ...
7663                                  What's Your Number?
6357                                     Music and Lyrics
2027                                        Runaway Bride
5926                                     Just Like Heaven
Name: title, dtype: object

In [43]:
get_similar_title('Notting Hill', cosine_sim_genome_tags)

8752                      Man Up
5822            Lot Like Love, A
4597               Love Actually
466         Sleepless in Seattle
6993               Proposal, The
8176                     What If
507                 Pretty Woman
1163    My Best Friend's Wedding
7367          Life as We Know It
6293                Holiday, The
Name: title, dtype: object