# IMDb Movie Recommender
Full dataset can be found at https://developer.imdb.com/non-commercial-datasets/

Steps:
1. Run `pip install -r requirements.txt`
2. Download these compressed files from the link above and place them, still compressed, in the same folder as this notebook:
    - `title.basics.tsv.gz`
    - `title.crew.tsv.gz`
    - `title.ratings.tsv.gz`
3. Run the notebook

In [1]:
# import necessary libraries and ML tools
import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Load and preview the dataset

In [2]:
# load title metadata (id, type, name, genres) from the gzipped TSV
# - columns are selected by name so a change in column order cannot silently
#   pick up the wrong fields
# - quoting=csv.QUOTE_NONE: the file is plain tab-separated text without CSV-style
#   quoting, so a title that starts with a double quote must be read literally
#   instead of being parsed as the start of a quoted field
# - keep_default_na=False + na_values: only the `\N` marker means "missing";
#   titles that happen to be spelled like "NA" or "null" are kept as text
title_basics = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    header=0,
    usecols=['tconst', 'titleType', 'primaryTitle', 'genres'],
    quoting=csv.QUOTE_NONE,
    na_values=[r'\N'],
    keep_default_na=False,
)

In [3]:
# display the loaded frame as the cell output to sanity-check the columns and row count
title_basics

Unnamed: 0,tconst,titleType,primaryTitle,genres
0,tt0000001,short,Carmencita,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,"Comedy,Short"
...,...,...,...,...
10747307,tt9916848,tvEpisode,Episode #3.17,"Action,Drama,Family"
10747308,tt9916850,tvEpisode,Episode #3.19,"Action,Drama,Family"
10747309,tt9916852,tvEpisode,Episode #3.20,"Action,Drama,Family"
10747310,tt9916856,short,The Wind,Short


In [4]:
# directors and writers for every title, read from the gzipped tab-separated file;
# the first line of the file supplies the column names
title_crew = pd.read_csv(
    'title.crew.tsv.gz',
    header=0,
    sep='\t',
)

In [5]:
# display the crew frame as the cell output; `\N` is the file's marker for a missing value
title_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N
...,...,...,...
10110205,tt9916848,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10110206,tt9916850,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
10110207,tt9916852,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
10110208,tt9916856,nm10538645,nm6951431


In [6]:
# average rating and vote count (popularity) per title, read from the gzipped
# tab-separated file; the first line of the file supplies the column names
title_ratings = pd.read_csv(
    'title.ratings.tsv.gz',
    header=0,
    sep='\t',
)

In [7]:
# display the ratings frame as the cell output to sanity-check the load
title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2052
1,tt0000002,5.7,275
2,tt0000003,6.5,2006
3,tt0000004,5.4,179
4,tt0000005,6.2,2772
...,...,...,...
1434343,tt9916730,7.0,12
1434344,tt9916766,7.1,23
1434345,tt9916778,7.2,36
1434346,tt9916840,7.0,9


In [8]:
# Build one row per rated movie:
#   1. keep only titles whose type is 'movie'
#   2. left-join the crew table, so movies without a crew row are kept
#   3. join the ratings table (default inner join), so unrated movies are dropped
#   4. keep the columns the recommender needs
#   5. turn both `\N` markers and NaNs into empty strings
movies_df = (
    title_basics.loc[title_basics['titleType'].eq('movie')]
    .merge(title_crew, on='tconst', how='left')
    .merge(title_ratings, on='tconst')
    [['primaryTitle', 'genres', 'directors', 'writers', 'averageRating', 'numVotes']]
    .replace(r'\N', '')
    .fillna('')
)

In [9]:
# display the joined movie table as the cell output; missing values now appear as empty strings
movies_df

Unnamed: 0,primaryTitle,genres,directors,writers,averageRating,numVotes
0,Miss Jerry,Romance,nm0085156,nm0085156,5.4,211
1,The Corbett-Fitzsimmons Fight,"Documentary,News,Sport",nm0714557,,5.2,512
2,Bohemios,,nm0063413,"nm0063413,nm0657268,nm0675388",4.4,17
3,The Story of the Kelly Gang,"Action,Adventure,Biography",nm0846879,nm0846879,6.0,895
4,The Prodigal Son,Drama,nm0141150,nm0141150,5.4,24
...,...,...,...,...,...,...
309325,Coven,"Drama,History",nm1893148,"nm1893148,nm3471432",6.4,5788
309326,The Secret of China,"Adventure,History,War",nm0910951,,3.4,18
309327,Kuambil Lagi Hatiku,Drama,nm4457074,"nm4843252,nm4900525,nm2679404",8.6,7
309328,Dankyavar Danka,Comedy,nm7764440,nm7933903,7.6,5


In [10]:
# restrict the catalog to movies with at least 10,000 votes and renumber the rows
# from 0, so that a row's label equals its position in the frame
popular_movies_df = (
    movies_df
    .loc[movies_df['numVotes'].ge(10000)]
    .reset_index(drop=True)
)

In [11]:
# display the filtered catalog as the cell output; this is the frame the recommender searches
popular_movies_df

Unnamed: 0,primaryTitle,genres,directors,writers,averageRating,numVotes
0,The Birth of a Nation,"Drama,History,War",nm0000428,"nm0228746,nm0000428,nm0940488",6.1,26348
1,Intolerance,"Drama,History",nm0000428,"nm0048512,nm0115218,nm0000428,nm0002616,nm0640...",7.7,16727
2,Broken Blossoms,"Drama,Romance",nm0000428,"nm0121885,nm0000428",7.2,11064
3,The Cabinet of Dr. Caligari,"Horror,Mystery,Thriller",nm0927468,"nm0562346,nm0417917",8.0,70086
4,The Kid,"Comedy,Drama,Family",nm0000122,nm0000122,8.2,134886
...,...,...,...,...,...,...
11136,Moonage Daydream,"Biography,Documentary,History",nm0605137,nm0605137,7.6,14463
11137,I Care a Lot,"Comedy,Crime,Drama",nm2128335,nm2128335,6.4,146207
11138,Coffee & Kareem,"Action,Comedy,Crime",nm0236226,nm3377973,5.2,14581
11139,Kaithi,"Action,Crime,Drama",nm7992231,"nm7992231,nm7807469,nm8325456",8.4,42491


### Take user input and give movie recommendations

In [12]:
# Content-based recommender cell.
#
# Flow:
#   1. read full or partial movie titles from the user (comma separated)
#   2. resolve each one to the first catalog title that contains it
#   3. merge the genres/directors/writers of those movies into one "query" document
#   4. rank every catalog movie by cosine similarity between its own
#      genres/directors/writers token counts and the query document
#   5. ask how many recommendations to show and build `recs_df`
#
# `recs_df` stays None (so the cell shows no table) when the input is invalid.

# take full or partial movie titles from user input
raw_titles = input('Enter one or more of your favorite movies, separated by commas:\n').split(',')
# ignore blank entries (empty input, trailing comma): an empty search string is
# contained in every title and would silently select the first movie in the catalog
input_movies = [entry.strip() for entry in raw_titles if entry.strip()]
recs_df = None

try:
    if not input_movies:
        raise LookupError('no movie titles were entered')

    # collect data for each movie
    full_input_movie_titles = []  # list of full movie titles from user input
    input_feature_parts = []      # genres/directors/writers strings of the input movies

    for query in input_movies:
        # regex=False: the user's text is matched literally, so titles containing
        # characters such as "(", ")", "?", "*" or "." are found as typed instead
        # of being interpreted as (possibly invalid) regular expressions
        matches = popular_movies_df[
            popular_movies_df['primaryTitle'].str.contains(query, case=False, regex=False)
        ]
        if matches.empty:
            raise LookupError('no title contains {!r}'.format(query))

        first_match = matches.iloc[0]  # first catalog row whose title contains the query
        full_input_movie_titles.append(first_match['primaryTitle'])
        input_feature_parts.extend(first_match[['genres', 'directors', 'writers']])

    combined_movie_features = ' '.join(input_feature_parts)  # combined features of input movies

    # build the features series (one text document per catalog movie)
    features = popular_movies_df['genres'] + ' ' + popular_movies_df['directors'] + ' ' + popular_movies_df['writers']

    # calculate the cosine similarity using movie features
    # Only the similarity between the query and each catalog movie is needed, so the
    # query vector is compared against the catalog matrix directly instead of building
    # the full movie-by-movie similarity matrix. Every query token comes from a catalog
    # row, so the vocabulary learned from the catalog already covers the query.
    cv = CountVectorizer()
    cv_matrix = cv.fit_transform(features)
    query_vector = cv.transform([combined_movie_features])
    cos_sim = cosine_similarity(query_vector, cv_matrix)[0]

    # sort movies by decreasing similarity (stable: ties keep catalog order)
    recommended_movies_sorted = sorted(enumerate(cos_sim), key=lambda pair: pair[1], reverse=True)

    # return a list of movies, ordered from most to least recommended
    max_recs = int(input('\nHow many recommendations would you like to see?\n'))  # max number of recommendations
    if max_recs <= 0:
        # explicit check instead of `assert`, which is removed when Python runs with -O
        raise ValueError('the number of recommendations must be positive')

    print("\nIf you liked:")
    for movie in full_input_movie_titles:
        print('•', movie)
    print('\nYou might also like these movies:')

    recs_arr = []
    for position, score in recommended_movies_sorted:
        if len(recs_arr) >= max_recs:
            break

        # similarity scores are positional, so look the movie up by position
        movie_attributes = popular_movies_df.iloc[position]
        title = movie_attributes['primaryTitle']
        if title in full_input_movie_titles:  # don't recommend a movie the user already watched!
            continue

        recs_arr.append([
            title,
            movie_attributes['averageRating'],
            '{:,}'.format(movie_attributes['numVotes']),
            round(score, 2),
        ])

    recs_df = pd.DataFrame(recs_arr, columns=['🎥 Title', '⭐ Average Rating', '👥 Number of Votes', '📊 Recommender Score'])

# error handling
except LookupError:
    print('\nSorry, one or more of the titles you entered do not exist in our records. Please try again.')
except ValueError:
    print('\nPlease enter a valid number.')

recs_df


If you liked:
• Star Wars: Episode IV - A New Hope
• Jurassic Park
• Avengers: Infinity War

You might also like these movies:


Unnamed: 0,🎥 Title,⭐ Average Rating,👥 Number of Votes,📊 Recommender Score
0,The Lost World: Jurassic Park,6.5,444228,0.71
1,Avengers: Endgame,8.4,1266815,0.7
2,Captain America: The First Avenger,6.9,897202,0.67
3,Captain America: The Winter Soldier,7.7,899545,0.64
4,Captain America: Civil War,7.8,850544,0.63
5,Spider-Man,7.4,880321,0.61
6,War of the Worlds,6.5,475459,0.61
7,The Incredible Hulk,6.6,523724,0.61
8,Solo: A Star Wars Story,6.9,379390,0.61
9,Star Wars: Episode I - The Phantom Menace,6.5,857056,0.6
