# IMDb Movie Recommender
The full dataset can be downloaded from https://www.imdb.com/interfaces/

In [1]:
# import necessary libraries and ML tools
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load and preview the dataset

In [2]:
# read the title id, type, name and genre columns from the tab-separated basics file
title_basics = pd.read_csv(
    'title_basics.tsv',
    sep='\t',
    header=0,
    usecols=[0, 1, 2, 8],
)

In [3]:
# preview the loaded titles (the rendered output shows only the first and last rows)
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,genres
0,tt0000001,short,Carmencita,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,"Comedy,Short"
...,...,...,...,...
7557508,tt9916848,tvEpisode,Episode #3.17,"Action,Drama,Family"
7557509,tt9916850,tvEpisode,Episode #3.19,"Action,Drama,Family"
7557510,tt9916852,tvEpisode,Episode #3.20,"Action,Drama,Family"
7557511,tt9916856,short,The Wind,Short


In [4]:
# read the directors and writers of every title from the tab-separated crew file
title_crew = pd.read_csv(
    'title_crew.tsv',
    sep='\t',
    header=0,
)

In [5]:
# preview the crew table; directors/writers are comma-separated name ids, '\N' marks a missing value
title_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N
...,...,...,...
7560243,tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
7560244,tt9916850,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
7560245,tt9916852,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
7560246,tt9916856,nm10538645,nm6951431


In [6]:
# read the average rating and vote count of every rated title
title_ratings = pd.read_csv(
    'title_ratings.tsv',
    sep='\t',
    header=0,
)

In [7]:
# preview the ratings table (one row per rated title: averageRating and numVotes)
title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1677
1,tt0000002,6.1,208
2,tt0000003,6.5,1404
3,tt0000004,6.2,123
4,tt0000005,6.1,2198
...,...,...,...
1116390,tt9916580,7.2,5
1116391,tt9916690,6.6,5
1116392,tt9916720,6.2,72
1116393,tt9916766,6.9,15


In [8]:
# build the movie table in one chain:
#   1. keep only rows whose titleType is 'movie'
#   2. left-join the crew, so movies without a crew row are kept
#   3. inner-join the ratings, so movies without a rating are dropped
#   4. keep the columns used later and turn '\N' / NaN into empty strings
movies_df = (
    title_basics[title_basics['titleType'] == 'movie']
    .merge(title_crew, on='tconst', how='left')
    .merge(title_ratings, on='tconst')
    [['primaryTitle', 'genres', 'directors', 'writers', 'averageRating', 'numVotes']]
    .replace(r'\N', '')
    .fillna('')
)

In [9]:
# preview the joined movie table (missing genres/writers now appear as empty strings)
movies_df

Unnamed: 0,primaryTitle,genres,directors,writers,averageRating,numVotes
0,Bohemios,,nm0063413,"nm0063413,nm0657268,nm0675388",4.5,8
1,The Story of the Kelly Gang,"Biography,Crime,Drama",nm0846879,nm0846879,6.1,613
2,Robbery Under Arms,Drama,nm0533958,"nm0092809,nm0533958",4.5,14
3,Hamlet,Drama,nm0143333,nm0000636,3.2,11
4,Don Quijote,Drama,nm0194088,nm0148859,4.9,9
...,...,...,...,...,...,...
258992,Safeguard,"Action,Adventure,Thriller",nm7308376,nm7308376,3.2,133
258993,Il talento del calabrone,Thriller,nm1480867,"nm1480867,nm10538402",5.7,848
258994,Coven,"Action,Adventure,Drama",nm1893148,"nm1893148,nm3471432",5.9,198
258995,The Secret of China,"Adventure,History,War",nm0910951,,3.8,12


In [10]:
# keep the better-known movies: rated at least 5/10 and with at least 10k votes;
# the index is renumbered from 0 so row labels equal row positions
popular_movies_df = movies_df.loc[
    movies_df['averageRating'].ge(5) & movies_df['numVotes'].ge(10000)
].reset_index(drop=True)

In [11]:
# preview the filtered table of popular movies that recommendations are drawn from
popular_movies_df

Unnamed: 0,primaryTitle,genres,directors,writers,averageRating,numVotes
0,The Birth of a Nation,"Drama,History,War",nm0000428,"nm0228746,nm0000428,nm0940488",6.3,22720
1,Intolerance: Love's Struggle Throughout the Ages,"Drama,History",nm0000428,"nm0048512,nm0115218,nm0000428,nm0002616,nm0640...",7.7,14226
2,The Cabinet of Dr. Caligari,"Fantasy,Horror,Mystery",nm0927468,"nm0562346,nm0417917",8.1,57353
3,The Kid,"Comedy,Drama,Family",nm0000122,nm0000122,8.3,113099
4,The Phantom Carriage,"Drama,Fantasy,Horror",nm0803705,"nm0481248,nm0803705",8.1,10664
...,...,...,...,...,...,...
8460,I Lost My Body,"Animation,Drama,Fantasy",nm3021346,"nm3021346,nm0491011",7.6,26467
8461,Falling Inn Love,"Comedy,Romance",nm0474955,"nm4306620,nm1194570",5.6,15738
8462,Holidate,"Comedy,Romance",nm0925870,nm0667330,6.1,43613
8463,Coffee & Kareem,"Action,Comedy,Crime",nm0236226,nm3377973,5.1,11616


## Take user input and give movie recommendations

In [12]:
# take full or partial movie titles from user input
input_movies = input('Enter one or more of your favorite movies, separated by commas:\n').split(',')

try:
    # collect data for each movie
    full_input_movie_titles = [] # list of full movie titles from user input
    input_feature_parts = [] # genres, directors and writers of every matched movie

    for movie in input_movies:
        query = movie.strip()
        if not query:
            continue # a blank entry (e.g. a trailing comma) would match every title

        # regex=False matches the text literally, so titles containing '(', '?', '*', ...
        # can neither raise a regex error nor match unintended movies
        attributes = popular_movies_df[
            popular_movies_df['primaryTitle'].str.contains(query, case=False, regex=False)
        ]
        # the first match wins; .values[0] raises IndexError when nothing matched
        full_input_movie_titles.append(attributes['primaryTitle'].values[0])
        input_feature_parts.extend(attributes[['genres', 'directors', 'writers']].values[0])

    if not full_input_movie_titles:
        # only blank entries were typed: report it through the "unknown title" handler below
        raise IndexError('no movie titles were entered')

    combined_movie_features = ' '.join(input_feature_parts) # combined features of input movies

    # build the features series: one row per popular movie plus a final row for the user's input
    # (pd.concat replaces Series.append, which was removed in pandas 2.0)
    features = popular_movies_df['genres'] + ' ' + popular_movies_df['directors'] + ' ' + popular_movies_df['writers']
    features = pd.concat([features, pd.Series([combined_movie_features])], ignore_index=True)

    # calculate the cosine similarity using movie features
    cv = CountVectorizer()
    cv_matrix = cv.fit_transform(features)
    # compare only the input row (last) against the movie rows: a 1 x N result
    # instead of the full (N+1) x (N+1) matrix
    cos_sim = cosine_similarity(cv_matrix[-1], cv_matrix[:-1])

    # obtain a list of (row position, similarity) pairs, one per popular movie
    recommended_movies = list(enumerate(cos_sim[0]))

    # sort movies by decreasing similarity
    recommended_movies_sorted = sorted(recommended_movies, key=lambda x:x[1], reverse=True)

    # return a list of movies, ordered from most to least recommended
    max_recs = int(input('\nHow many recommendations would you like to see?\n')) # max number of recommendations
    if max_recs < 1:
        # zero or a negative count is reported through the "valid number" handler below
        raise ValueError('the number of recommendations must be at least 1')

    print("\nIf you liked:")
    for movie in full_input_movie_titles:
        print('"' + movie + '"')
    print('\nYou might also like:')

    # positions in `features` line up with rows of popular_movies_df, so look titles up by position
    all_titles = popular_movies_df['primaryTitle'].values
    num_recs = 0 # recommendations printed so far
    for movie in recommended_movies_sorted:
        if num_recs >= max_recs:
            break
        title = all_titles[movie[0]]
        if title not in full_input_movie_titles: # don't recommend a movie the user already watched!
            num_recs += 1
            print(str(num_recs) + '.', title)

# error handling
except IndexError:
    print('\nSorry, one or more of the titles you entered do not exist in our records. Please try again.')
except ValueError:
    print('\nPlease enter a valid number.')

Enter one or more of your favorite movies, separated by commas:
infinity war, star wars, jurassic park

How many recommendations would you like to see?
20

If you liked:
"Avengers: Infinity War"
"Star Wars: Episode IV - A New Hope"
"Jurassic Park"

You might also like:
1. The Lost World: Jurassic Park
2. Avengers: Endgame
3. Captain America: The First Avenger
4. Captain America: Civil War
5. Captain America: The Winter Soldier
6. Iron Man 2
7. Spider-Man
8. The Incredible Hulk
9. Solo: A Star Wars Story
10. Star Wars: Episode I - The Phantom Menace
11. Star Wars: Episode III - Revenge of the Sith
12. Ready Player One
13. The Last Starfighter
14. Stealth
15. Transformers: Dark of the Moon
16. Transformers: Age of Extinction
17. Bumblebee
18. Avengers: Age of Ultron
19. Timeline
20. Jurassic World: Fallen Kingdom
