# Import libraries and load data

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from load_movie_data import *
from movies_EDA import *
%matplotlib inline

In [2]:
# Load the movie table and preview its first rows.
# NOTE(review): load_movie_data is not defined in this notebook; presumably it
# arrives through the `from load_movie_data import *` star import -- confirm.
movies = load_movie_data()
movies.head()

Unnamed: 0,movieId,title,genres,Fantasy,Documentary,Crime,Film-Noir,(no genres listed),Musical,Thriller,...,War,Horror,Children,Sci-Fi,Animation,Drama,Comedy,mean_rating,num_ratings,weighted_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,0,0,0,0,...,0,0,1,0,1,0,1,0.784186,215.0,0.780458
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0.686364,110.0,0.687526
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.651923,52.0,0.659728
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0.471429,7.0,0.606066
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.614286,49.0,0.628866


# Functions to analyze data

In [3]:
def filter_genre(df, genres):
    """
    Filter movie dataframe by genre(s)

    Genre names are matched against the dataframe's column labels: an
    exact match is tried first, then a case-insensitive one, so labels
    such as 'Sci-Fi' or 'Film-Noir' resolve whether they are passed as
    'Sci-Fi', 'sci-fi' or 'SCI-FI'.

    Parameters:
        df (pd DataFrame): dataframe containing movie title, genre
                        information, and rating information; a genre
                        column holds 1 for the movies in that genre
        genres (str or list of str): genres to filter dataframe by; when
                        several are given, a movie must be in all of them

    Returns:
        df_filtered: select rows of df with specified genres (a copy of
                        df if an empty list is given)

    Raises:
        KeyError: if a genre matches no column of df
    """

    # Lower-cased label -> actual label, for the case-insensitive lookup.
    # str.capitalize() cannot be used for this: it lower-cases everything
    # after the first character, turning 'Sci-Fi' into the missing 'Sci-fi'.
    by_lower = {}
    for col in df.columns:
        by_lower.setdefault(str(col).lower(), col)

    def resolve(genre):
        # Prefer the label exactly as given, then fall back to ignoring case
        if genre in df.columns:
            return genre
        try:
            return by_lower[genre.lower()]
        except KeyError:
            raise KeyError(f"Unknown genre: {genre!r}") from None

    # If just one genre given, simple filter
    if isinstance(genres, str):
        return df[df[resolve(genres)] == 1]

    # If list of genres, filter iteratively (rows must match every genre)
    df_filtered = df.copy()
    for g in genres:
        df_filtered = df_filtered[df_filtered[resolve(g)] == 1]
    return df_filtered

In [4]:
def highest_rated(df, n=5):
    """
    Return highest-rated movies in dataframe

    Parameters:
        df (pd DataFrame): dataframe containing movie title and rating
                        information ('title' and 'weighted_rating'
                        columns are read)
        n (int): maximum number of titles to return (default 5)

    Returns:
        titles of the n highest-rated movies, best first, keeping the
        index labels of df; all titles if df has fewer than n rows
    """

    df_sorted = df.sort_values(by=['weighted_rating'], ascending=False)

    # A positional slice past the end just returns what is there, so no
    # special case is needed when fewer than n movies are available
    return df_sorted.iloc[:n]['title']


# Test functions with data

First, look at all rom-coms.

In [5]:
# A list of genres keeps only the movies flagged in every listed genre
# (here: Romance AND Comedy).
filter_genre(movies, ['Romance', 'Comedy'])

Unnamed: 0,movieId,title,genres,Fantasy,Documentary,Crime,Film-Noir,(no genres listed),Musical,Thriller,...,War,Horror,Children,Sci-Fi,Animation,Drama,Comedy,mean_rating,num_ratings,weighted_rating
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.651923,52.0,0.659728
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0.471429,7.0,0.606066
6,7,Sabrina (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.637037,54.0,0.646924
10,11,"American President, The (1995)",Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0.734286,70.0,0.730039
35,39,Clueless (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0.658654,104.0,0.662308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9529,172229,Plain Clothes (1988),Comedy|Mystery|Romance|Thriller,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0.600000,1.0,0.691192
9612,176621,Boniface's Holiday (1965),Animation|Children|Comedy|Romance,0,0,0,0,0,0,0,...,0,0,1,0,1,0,1,0.400000,1.0,0.673010
9628,178615,Front Cover (2016),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0.700000,1.0,0.700283
9691,184349,Elsa & Fred (2005),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0.700000,1.0,0.700283


Now, look at highest-rated movies across all genres.

In [6]:
# No genre filter: rank the whole table by weighted_rating and show the top titles.
highest_rated(movies)

277     Shawshank Redemption, The (1994)
659                Godfather, The (1972)
2226                   Fight Club (1999)
922       Godfather: Part II, The (1974)
46            Usual Suspects, The (1995)
Name: title, dtype: object

Look at highest-rated rom-coms.

In [7]:
# Compose the two helpers: restrict to Romance + Comedy first, then rank by weighted_rating.
highest_rated(filter_genre(movies, ['Romance', 'Comedy']))

899                            Princess Bride, The (1987)
314                                   Forrest Gump (1994)
3622    Amelie (Fabuleux destin d'Amélie Poulain, Le) ...
680                        Philadelphia Story, The (1940)
1730           Life Is Beautiful (La Vita è bella) (1997)
Name: title, dtype: object