# FINAL PROJECT (Movie Recommendation System)

#### Group mates:
-   Ritchel Rey Etchorre - 101358231
-   James MacAloney - 101362896
-   Stefan Kepinski - 101356431
-   Vincent Nhar Calonzo - 101272540

In [956]:
import pandas as pd
import numpy as np
import ast

In [957]:
moviesDF = pd.read_csv('http://bit.ly/imdbratings')

In [958]:
moviesDF.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [959]:
# checks the datatype of each column
moviesDF.dtypes

star_rating       float64
title              object
content_rating     object
genre              object
duration            int64
actors_list        object
dtype: object

In [960]:
# Checks the type of the value in 'actors_list' column
print(moviesDF['actors_list'][0],' - ',type(moviesDF['actors_list'][0]))

[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunton']  -  <class 'str'>


In [961]:
# converts the actors_list's values from string to a list
def sanitize_actors_list(actors_string):
    actors_list = ast.literal_eval(actors_string) # converts the string representation of a list into an actual list object.
    sanitized_actors = [actor.replace("u'", "").replace("'", "") for actor in actors_list] # It iterates to the whole list cleaning up the unwanted characters from the list of strings to prepare data for further processing
    return sanitized_actors

In [962]:
# apply the sanitation of values for actors_list Series
moviesDF['actors_list'] = moviesDF['actors_list'].apply(sanitize_actors_list)

In [963]:
moviesDF.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[Tim Robbins, Morgan Freeman, Bob Gunton]"
1,9.2,The Godfather,R,Crime,175,"[Marlon Brando, Al Pacino, James Caan]"
2,9.1,The Godfather: Part II,R,Crime,200,"[Al Pacino, Robert De Niro, Robert Duvall]"
3,9.0,The Dark Knight,PG-13,Action,152,"[Christian Bale, Heath Ledger, Aaron Eckhart]"
4,8.9,Pulp Fiction,R,Crime,154,"[John Travolta, Uma Thurman, Samuel L. Jackson]"


In [964]:
# Checking the values in content_rating series to furthur simplify it
moviesDF['content_rating'].unique()

array(['R', 'PG-13', 'NOT RATED', 'PG', 'UNRATED', 'APPROVED', 'PASSED',
       'G', 'X', nan, 'TV-MA', 'GP', 'NC-17'], dtype=object)

In [965]:
# Drop movies that has a content rating of 'nan', 'UNRATED', 'NOT RATED' just it will be hard for algorithm to determine if content is allowed for audience under 17

# replace all movies with 'NOT RATED' to nan
# so we can easily drop all movies with content_rating 'nan'
moviesDF.replace('UNRATED', np.nan, inplace=True)
moviesDF.replace('NOT RATED', np.nan, inplace=True)


moviesDF.dropna(inplace=True)


#### Content Rating Description
<b>R:</b> Restricted. Persons under 17 require an accompanying parent or adult guardian.<br><br>
<b>PG-13:</b> Parents Strongly Cautioned. Some material may be inappropriate for children under 13.<br><br>
<b>PG:</b> Parental Guidance Suggested. Some material may not be suitable for children.<br><br>
<b>APPROVED:</b> Generally used in older films, this rating indicates the film was approved for all audiences.<br><br>
<b>PASSED:</b> Another older rating indicating the film was considered appropriate for all audiences.<br><br>
<b>G:</b> General Audiences. All ages admitted.<br><br>
<b>X:</b> This rating was used in the past to indicate adult content but has largely been replaced by the NC-17 rating.<br><br>
<b>TV-MA:</b> Suitable for mature audiences only. Content may be unsuitable for children under 17.<br><br>
<b>GP:</b> General audiences, parental guidance suggested.<br><br>
<b>NC-17:</b> No One 17 and Under Admitted. This rating restricts admission to adults 18 years and older due to the content's explicit nature.<br><br>

In [966]:
# Based on Content Rating Description, we can furthur simplify the values by making the similar content rating have the same values

# R and TV-MA both isn't suitable for children under 17
moviesDF.replace('TV-MA', 'R', inplace=True)

# GP movies and PG movies and PG-13 movies can also be merge under 1 content rating. 
# Children such as 17 to 13 will be able to watch with a warning of PG (Parental Guidance).
# Not suitable(Restricted) for children under 13 
moviesDF.replace("GP", "PG", inplace=True)
moviesDF.replace("PG-13", "PG", inplace=True)

# APPROVED, PASSED, G are all basically the same
moviesDF.replace("APPROVED", "G", inplace=True)
moviesDF.replace("PASSED", "G", inplace=True)

# NC-17 and X could be basically the same thing for only allowing adults to see these type of movies so they can share the same value
moviesDF.replace("NC-17", "X", inplace=True)


In [967]:
# Check if we successfully simplify our values for content_rating
moviesDF['content_rating'].unique()

array(['R', 'PG', 'G', 'X'], dtype=object)

In [968]:
# Checks for any missing values for any series in the dataframe
moviesDF.isnull().sum()

star_rating       0
title             0
content_rating    0
genre             0
duration          0
actors_list       0
dtype: int64

In [969]:
# genre and actors must be a list even if user only input 1 genre/actor

def recommendation_system(moviesDF,user_age,fav_genre_list,fav_actors_list):

    # Age Criteria
    if user_age < 13:
        # For users younger than 13,only include movies with a content rating of 'G' (General Audiences)
        moviesDF = moviesDF[moviesDF['content_rating'] == 'G']
    elif user_age < 18:
        # For users younger than 18, exclude movies with content ratings 'R' and 'X'
        # '~' negates the conditoin, selecting movies where the content_rating is NOT 'R' (Restricted) or 'X' (Adult Content)
        moviesDF = moviesDF[~moviesDF['content_rating'].isin(['R', 'X'])]
    

    # Genre Criteria
    # Assign a score based on the presence of favorite genre
    moviesDF.loc[:,'genre_score'] = moviesDF['genre'].apply(lambda x: x in fav_genre_list)

    # Actors/Actresses Criteria
    # Check for the presence of favorite actors
    moviesDF.loc[:,'actors_present'] = moviesDF['actors_list'].apply(lambda x: any(actor in x for actor in fav_actors_list))


    # Sorting based genre score and actors present
    moviesDF = moviesDF.sort_values(['genre_score', 'actors_present'], ascending=[False, False])

    # Dropping temporary columns
    moviesDF.drop(['genre_score', 'actors_present'], axis=1, inplace=True)

    # Reset index
    moviesDF.reset_index(drop=True, inplace=True)
    
    return moviesDF.head(10)

In [970]:
user_age = 22
favourite_genre = ['Crime']
favourite_actor = ['Tom Hanks']

recommendation_system(moviesDF,user_age,favourite_genre,favourite_actor)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,8.5,The Green Mile,R,Crime,189,"[Tom Hanks, Michael Clarke Duncan, David Morse]"
1,7.7,Road to Perdition,R,Crime,117,"[Tom Hanks, Tyler Hoechlin, Rob Maxey]"
2,9.3,The Shawshank Redemption,R,Crime,142,"[Tim Robbins, Morgan Freeman, Bob Gunton]"
3,9.2,The Godfather,R,Crime,175,"[Marlon Brando, Al Pacino, James Caan]"
4,9.1,The Godfather: Part II,R,Crime,200,"[Al Pacino, Robert De Niro, Robert Duvall]"
5,8.9,Pulp Fiction,R,Crime,154,"[John Travolta, Uma Thurman, Samuel L. Jackson]"
6,8.7,City of God,R,Crime,130,"[Alexandre Rodrigues, Matheus Nachtergaele, Le..."
7,8.7,The Usual Suspects,R,Crime,106,"[Kevin Spacey, Gabriel Byrne, Chazz Palminteri]"
8,8.6,Leon: The Professional,R,Crime,110,"[Jean Reno, Gary Oldman, Natalie Portman]"
9,8.6,American History X,R,Crime,119,"[Edward Norton, Edward Furlong, Beverly DAngelo]"


In [971]:
user_age = 17
favourite_genre = ['Adventure']
favourite_actor = ['Tom Hanks']

recommendation_system(moviesDF,user_age,favourite_genre,favourite_actor)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,7.7,Cast Away,PG,Adventure,143,"[Tom Hanks, Helen Hunt, Paul Sanchez]"
1,7.6,Apollo 13,PG,Adventure,140,"[Tom Hanks, Bill Paxton, Kevin Bacon]"
2,8.9,The Lord of the Rings: The Return of the King,PG,Adventure,201,"[Elijah Wood, Viggo Mortensen, Ian McKellen]"
3,8.8,The Lord of the Rings: The Fellowship of the Ring,PG,Adventure,178,"[Elijah Wood, Ian McKellen, Orlando Bloom]"
4,8.8,The Lord of the Rings: The Two Towers,PG,Adventure,179,"[Elijah Wood, Ian McKellen, Viggo Mortensen]"
5,8.7,Interstellar,PG,Adventure,169,"[Matthew McConaughey, Anne Hathaway, Jessica C..."
6,8.5,Back to the Future,PG,Adventure,116,"[Michael J. Fox, Christopher Lloyd, Lea Thompson]"
7,8.4,North by Northwest,G,Adventure,136,"[Cary Grant, Eva Marie Saint, James Mason]"
8,8.4,Lawrence of Arabia,PG,Adventure,216,"[Peter OToole, Alec Guinness, Anthony Quinn]"
9,8.3,Monty Python and the Holy Grail,PG,Adventure,91,"[Graham Chapman, John Cleese, Eric Idle]"


In [973]:
user_age = 12
favourite_genre = ['Animation']
favourite_actor = ['Tom Hanks']

recommendation_system(moviesDF,user_age,favourite_genre,favourite_actor)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,8.4,Toy Story 3,G,Animation,103,"[Tom Hanks, Tim Allen, Joan Cusack]"
1,8.3,Toy Story,G,Animation,81,"[Tom Hanks, Tim Allen, Don Rickles]"
2,7.9,Toy Story 2,G,Animation,92,"[Tom Hanks, Tim Allen, Joan Cusack]"
3,8.5,The Lion King,G,Animation,89,"[Matthew Broderick, Jeremy Irons, James Earl J..."
4,8.4,WALL-E,G,Animation,98,"[Ben Burtt, Elissa Knight, Jeff Garlin]"
5,8.3,My Neighbor Totoro,G,Animation,86,"[Hitoshi Takagi, Noriko Hidaka, Chika Sakamoto]"
6,8.2,Finding Nemo,G,Animation,100,"[Albert Brooks, Ellen DeGeneres, Alexander Gould]"
7,8.1,"Monsters, Inc.",G,Animation,92,"[Billy Crystal, John Goodman, Mary Gibbs]"
8,8.1,Beauty and the Beast,G,Animation,84,"[Paige OHara, Robby Benson, Richard White]"
9,8.0,Ratatouille,G,Animation,111,"[Brad Garrett, Lou Romano, Patton Oswalt]"
