In [14]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from load_movie_data import *
from movies_EDA import *
%matplotlib inline

In [2]:
# Load the movie table used by every cell below.
# NOTE(review): load_movie_data is not defined in this notebook; presumably it
# comes from one of the star imports in the first cell -- confirm.
movies = load_movie_data()

In [3]:
# Preview the first rows to check the available columns.
movies.head()

Unnamed: 0,movieId,title,genres,Fantasy,Mystery,IMAX,Thriller,Romance,Crime,Film-Noir,...,(no genres listed),Sci-Fi,Animation,Adventure,War,Horror,mean_rating,num_ratings,weighted_rating,tag_soup
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0.784186,215.0,0.780458,animated buddymovie cartoon cgi comedy compute...
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0.686364,110.0,0.687526,fantasy adaptedfrombook animals badcgi basedon...
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0.651923,52.0,0.659728,moldy old annmargaret burgessmeredith darylhan...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0.471429,7.0,0.606066,characters girlmovie characters chickflick bas...
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.614286,49.0,0.628866,stevemartin stevemartin pregnancy remake aging...


In [4]:
def filter_genre(df, genres):
    """Keep only the rows flagged with every requested genre.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one indicator column per genre (value 1 when the movie
        belongs to that genre).
    genres : str or iterable of str
        A single genre name or several. Names are matched against the column
        names case-insensitively, so ``'sci-fi'``, ``'imax'`` and
        ``'film-noir'`` resolve to ``'Sci-Fi'``, ``'IMAX'`` and ``'Film-Noir'``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose indicator equals 1 for all requested genres.
        An empty iterable returns a copy of ``df``.

    Raises
    ------
    KeyError
        If a requested genre matches no column of ``df``.
    """
    requested = [genres] if isinstance(genres, str) else list(genres)

    # Case-insensitive lookup table: str.capitalize() alone lower-cases
    # everything after the first character and so can never produce
    # multi-capital column names such as 'Sci-Fi' or 'IMAX'.
    columns_by_key = {str(col).casefold(): col for col in df.columns}

    df_filtered = df.copy()
    for genre in requested:
        # Try the historical resolution first so previously working calls
        # keep selecting exactly the same column.
        column = genre.capitalize()
        if column not in df.columns:
            column = columns_by_key.get(genre.casefold())
            if column is None:
                raise KeyError(f"no genre column matching {genre!r}")
        df_filtered = df_filtered[df_filtered[column] == 1]
    return df_filtered

In [5]:
def highest_rated(df, n=5):
    """Return the titles of the ``n`` movies with the best weighted rating.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``weighted_rating`` and ``title`` columns.
    n : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Titles ordered by ``weighted_rating`` descending. When ``df`` has
        fewer than ``n`` rows, all of them are returned.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    ranked = df.sort_values(by=['weighted_rating'], ascending=False)

    # Slicing past the end of a frame is safe, so short inputs need no
    # separate branch.
    return ranked['title'].iloc[:n]


In [31]:
def get_similar_title(title, cosine_sim, n=10):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Relies on the notebook-level ``movie_indices`` (title -> row position)
    and ``movies`` objects; positions must line up with the rows and columns
    of ``cosine_sim``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movie_indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    n : int, default 10
        Number of similar titles to return.

    Returns
    -------
    pandas.Series
        Titles of the most similar movies, best match first. The query movie
        itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``movie_indices``.
    """
    idx = movie_indices[title]
    # Several movies can share a title, in which case the lookup yields a
    # Series; fall back to the first match instead of failing further down.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores (stable, so ties keep
    # their original row order)
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Drop the query movie explicitly rather than assuming it sorted first:
    # another movie can tie with it at the top, and a movie with an empty
    # tag soup scores 0 against everything, itself included.
    mov_ind = [i for i, _ in sim_scores if i != idx][:n]

    return movies['title'].iloc[mov_ind]

In [42]:
def compute_cosine_similarity(df):
    """Build the pairwise cosine-similarity matrix of the ``tag_soup`` column.

    Every ``tag_soup`` string is converted to a bag-of-words count vector
    (English stop words removed) and each row is then compared with every
    row, itself included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``tag_soup`` text column.

    Returns
    -------
    The matrix returned by ``cosine_similarity``, with one row and one column
    per row of ``df``, in the same order.
    """
    vectorizer = CountVectorizer(stop_words='english')
    tag_counts = vectorizer.fit_transform(df['tag_soup'])
    return cosine_similarity(tag_counts, tag_counts)

In [49]:
def filter_tag(df, tags):
    """Keep only the rows whose ``tag_soup`` contains every requested tag.

    Matching is a plain, case-sensitive substring test: a tag is taken
    literally (never as a regular expression) and rows with a missing
    ``tag_soup`` never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``tag_soup`` text column.
    tags : str or iterable of str
        A single tag or several; with several, a row must contain all of them.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``. An empty iterable returns a copy of
        ``df``.
    """
    requested = [tags] if isinstance(tags, str) else list(tags)

    df_filtered = df.copy()
    # If list of tags, filter iteratively
    for tag in requested:
        # regex=False: tags such as 'c++' or 'r.i.p' must not be parsed as
        # patterns. na=False: a missing tag soup would otherwise put NaN in
        # the mask and make the boolean indexing fail.
        mask = df_filtered['tag_soup'].str.contains(tag, regex=False, na=False)
        df_filtered = df_filtered[mask]
    return df_filtered

In [6]:
# Top titles by weighted rating across the whole catalogue.
highest_rated(movies)

273     Shawshank Redemption, The (1994)
651                Godfather, The (1972)
2206                   Fight Club (1999)
908       Godfather: Part II, The (1974)
46            Usual Suspects, The (1995)
Name: title, dtype: object

In [7]:
# Top titles among movies flagged with all three genres at once.
highest_rated(filter_genre(movies, ['Romance', 'Comedy', 'Crime']))

1378                                  Out of Sight (1998)
89                                   Bottle Rocket (1996)
6971                           Brothers Bloom, The (2008)
684                                        Charade (1963)
5871    Edukators, The (Die Fetten Jahre sind vorbei) ...
Name: title, dtype: object

In [10]:
# Reverse lookup: title -> index label of that movie in `movies`.
# get_similar_title uses the looked-up value as a *position* (row of the
# similarity matrix and `.iloc` on `movies`), so `movies` needs a default
# 0..n-1 index for this to be correct.
# NOTE(review): if a title occurs more than once, the lookup returns a Series
# rather than a single integer -- confirm titles are unique.
movie_indices = pd.Series(movies.index, index=movies['title'])
movie_indices.head()

title
Toy Story (1995)                      0
Jumanji (1995)                        1
Grumpier Old Men (1995)               2
Waiting to Exhale (1995)              3
Father of the Bride Part II (1995)    4
dtype: int64

In [44]:
# Pairwise tag-soup similarity over the whole catalogue; computed on `movies`
# itself so that matrix rows line up with the positions stored in
# `movie_indices`.
cosine_sim = compute_cosine_similarity(movies)

In [33]:
# Example: movies whose tag soup is closest to Finding Nemo's.
get_similar_title('Finding Nemo (2003)', cosine_sim)

1739          Bug's Life, A (1998)
0                 Toy Story (1995)
3533         Monsters, Inc. (2001)
9136           Finding Dory (2016)
2334            Toy Story 2 (1999)
3709                Ice Age (2002)
8115    Monsters University (2013)
6347            Ratatouille (2007)
7292            Toy Story 3 (2010)
6155                   Cars (2006)
Name: title, dtype: object

In [51]:
# Example: movies whose tag soup contains each of the three tag strings.
filter_tag(movies, ['fantasy', 'funny', 'animated'])

Unnamed: 0,movieId,title,genres,Fantasy,Mystery,IMAX,Thriller,Romance,Crime,Film-Noir,...,(no genres listed),Sci-Fi,Animation,Adventure,War,Horror,mean_rating,num_ratings,weighted_rating,tag_soup
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0.784186,215.0,0.780458,animated buddymovie cartoon cgi comedy compute...
223,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0.846215,251.0,0.840625,scifi spaceaction classicscifi harrisonford mu...
499,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0.75847,183.0,0.755457,wanttoown robinwilliams animation action adven...
773,1025,"Sword in the Stone, The (1963)",Animation|Children|Fantasy|Musical,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0.716,25.0,0.711518,enjoyable animation dull magic disney classic ...
3165,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,1,0,0,0,1,0,0,...,0,0,1,1,0,0,0.773529,170.0,0.769462,fairytale funny kids animation animation based...
3533,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0.774242,132.0,0.769036,funny pixar comedy funny pixar animated animat...
4305,6350,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,Action|Adventure|Animation|Children|Fantasy|Sc...,1,0,0,0,0,0,0,...,0,1,1,1,0,0,0.8125,24.0,0.779503,hayaomiyazaki robots steampunk studioghibli an...
5786,32456,"Pom Poko (a.k.a. Raccoon War, The) (Heisei tan...",Animation|Comedy|Drama|Fantasy,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0.85,2.0,0.725259,animalattacks animalsdie anthropomorphic antia...
5794,32587,Sin City (2005),Action|Crime|Film-Noir|Mystery|Thriller,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0.771429,84.0,0.763863,stylized blackcomedy brutality cannibalism dar...
6347,50872,Ratatouille (2007),Animation|Children|Drama,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0.773611,72.0,0.764672,pixar animation imagination inspirational pixa...
