# Import libraries and load data

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from load_movie_data import *
from movies_EDA import *
%matplotlib inline

In [2]:
# load_movie_data is brought into scope by the star import from the load_movie_data module
movies = load_movie_data()
# Preview the first rows to see which columns the loaded frame provides
movies.head()

Unnamed: 0,movieId,title,genres,Western,(no genres listed),Comedy,Horror,Documentary,War,Thriller,...,Adventure,Animation,Action,Children,Drama,Romance,Musical,mean_rating,num_ratings,weighted_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,0,0,0,0,...,1,1,0,1,0,0,0,0.77733,68469.0,0.777319
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0.649317,27143.0,0.649337
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0.634796,15585.0,0.634842
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0.574908,2989.0,0.575345
4,5,Father of the Bride Part II (1995),Comedy,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.615458,15474.0,0.615517


# Functions to analyze data

In [3]:
def filter_genre(df, genres):
    """
    Filter movie dataframe by genre(s)

    Genre names are matched against the dataframe's column names. The
    capitalized form (e.g. 'comedy' -> 'Comedy') is tried first, then the
    name exactly as given, then a case-insensitive match. The fallbacks
    matter for column names that str.capitalize() cannot reproduce, such
    as a hyphenated 'Sci-Fi' or an all-caps 'IMAX' column.

    Parameters:
        df (pd DataFrame): dataframe containing movie title, genre
                        information, and rating information; each genre
                        is a column in which the value 1 marks membership
        genres (str or list of str): genres to filter dataframe by; when
                        several are given, a row must be flagged with
                        every one of them

    Returns:
        df_filtered: select rows of df with specified genres (a new
                        dataframe; df itself is not modified)

    Raises:
        KeyError: if a requested genre does not match any column of df
    """

    # Treat a single genre name as a one-element list
    if isinstance(genres, str):
        genres = [genres]

    # Lower-cased lookup table, built once and shared by all requested genres
    by_lower = {}
    for col in df.columns:
        if isinstance(col, str):
            by_lower.setdefault(col.lower(), col)

    columns = []
    for g in genres:
        capitalized = g.capitalize()
        if capitalized in df.columns:
            columns.append(capitalized)
        elif g in df.columns:
            columns.append(g)
        elif g.lower() in by_lower:
            columns.append(by_lower[g.lower()])
        else:
            raise KeyError(f"No genre column matching {g!r} in dataframe")

    # Nothing to filter on: hand back an independent copy of the input
    if not columns:
        return df.copy()

    # One combined mask instead of re-filtering the frame once per genre
    mask = (df[columns] == 1).all(axis=1)
    return df[mask]

In [4]:
def highest_rated(df, n=5):
    """
    Return highest-rated movies in dataframe

    Parameters:
        df (pd DataFrame): dataframe containing at least the columns
                        'title' and 'weighted_rating'
        n (int): maximum number of titles to return (default 5)

    Returns:
        titles of the n highest-rated movies (pd Series ordered from
        highest to lowest weighted_rating, keeping df's index labels);
        if df has fewer than n rows, all titles are returned

    Raises:
        ValueError: if n is negative
    """

    # A negative n would make head() drop rows from the end instead
    if n < 0:
        raise ValueError("n must be non-negative")

    df_sorted = df.sort_values(by=['weighted_rating'], ascending=False)

    # head() already copes with frames shorter than n, so no size check needed
    return df_sorted['title'].head(n)


# Test functions with data

First, look at all rom-coms, i.e. movies flagged as both Romance and Comedy.

In [5]:
# Passing a list keeps only the rows flagged with every listed genre
filter_genre(movies, ['Romance', 'Comedy'])

Unnamed: 0,movieId,title,genres,Western,(no genres listed),Comedy,Horror,Documentary,War,Thriller,...,Adventure,Animation,Action,Children,Drama,Romance,Musical,mean_rating,num_ratings,weighted_rating
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0.634796,15585.0,0.634842
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0.574908,2989.0,0.575345
6,7,Sabrina (1995),Comedy|Romance,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0.674270,15301.0,0.674290
10,11,"American President, The (1995)",Comedy|Drama|Romance,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0.732056,19669.0,0.732042
38,39,Clueless (1995),Comedy|Romance,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0.683257,29422.0,0.683265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58043,193771,Little Italy (2018),Comedy|Romance,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0.666667,3.0,0.696992
58054,193795,Forgiving the Franklins (2006),Comedy|Drama|Romance,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,1.000000,1.0,0.732808
58060,193807,"Cor, Blimey! (2000)",Comedy|Documentary|Drama|Romance,0,0,1,0,1,0,0,...,0,0,0,0,1,1,0,0.400000,1.0,0.678263
58067,193821,Nappily Ever After (2018),Comedy|Romance,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0.366667,3.0,0.627761


Now, look at the five highest-rated movies across all genres.

In [6]:
# No genre filter here: the whole movies frame is ranked by weighted_rating
highest_rated(movies)

47937                  Planet Earth II (2016)
42845                     Planet Earth (2006)
315          Shawshank Redemption, The (1994)
47791                 Band of Brothers (2001)
49411    Black Mirror: White Christmas (2014)
Name: title, dtype: object

Look at highest-rated rom-coms.

In [7]:
# Narrow to rom-coms first, then rank that subset by weighted_rating
highest_rated(filter_genre(movies, ['Romance', 'Comedy']))

2240            Life Is Beautiful (La Vita è bella) (1997)
29189     Operation 'Y' & Other Shurik's Adventures (1965)
888                           It Happened One Night (1934)
1172                            Princess Bride, The (1987)
4878     Amelie (Fabuleux destin d'Amélie Poulain, Le) ...
Name: title, dtype: object