In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Read the two TMDB 5000 CSV exports (credits first, then movies) from the
# current working directory.
credits, movies_df = (
    pd.read_csv(csv_name)
    for csv_name in ('tmdb_5000_credits.csv', 'tmdb_5000_movies.csv')
)

In [5]:
# Rename the credits key column 'movie_id' to 'id' so both frames share the
# join key (index labels are also converted to strings by index=str).
credits_column_renamed = credits.rename(columns={'movie_id': 'id'}, index=str)

# Join movies with their credits on the shared 'id' column.
movies_df_merge = pd.merge(movies_df, credits_column_renamed, on='id')

In [6]:
# Discard the columns that are not used further down in this notebook.
movies_cleaned_df = movies_df_merge.drop(
    ['homepage', 'title_x', 'title_y', 'status', 'production_countries'],
    axis='columns',
)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer for the overview text: word-level tokens, 1- to 3-grams,
# English stop words removed, terms kept only if they occur in >= 3 documents.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=None,
)

In [6]:
# Replace missing overviews with an empty string so every row holds text
# before the column is vectorized.
movies_cleaned_df['overview'] = movies_cleaned_df['overview'].where(
    movies_cleaned_df['overview'].notna(), ''
)

In [7]:
# Learn the vocabulary from the overviews and build the document-term matrix
# in one step (one row per movie).
tfv_matrix = tfv.fit_transform(raw_documents=movies_cleaned_df['overview'])

In [14]:
from sklearn.metrics.pairwise import sigmoid_kernel

In [15]:
# Pairwise sigmoid-kernel similarity of every overview against every other
# overview; sig[i][j] is the score between movie i and movie j.
sig = sigmoid_kernel(X=tfv_matrix, Y=tfv_matrix)

In [30]:
# Reverse mapping: original_title -> row label in movies_cleaned_df.
indices = pd.Series(movies_cleaned_df.index, index=movies_cleaned_df['original_title'])

# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. Filter on the index
# instead, keeping the first movie for each title, so that indices[title]
# always yields a single scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [33]:
def give_rec(title, sig=sig, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Value to look up in ``indices`` (built from the ``original_title``
        column of ``movies_cleaned_df``).
    sig : 2-D array-like, optional
        Pairwise similarity matrix; ``sig[i]`` holds the scores of movie ``i``
        against every movie. Defaults to the notebook-level ``sig``.
    top_n : int, optional
        Number of recommendations to return. Defaults to 10.

    Returns
    -------
    pandas.Series
        ``original_title`` of the ``top_n`` highest-scoring movies, best
        first, never including the queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.

    Notes
    -----
    Assumes the labels stored in ``indices`` equal the row positions used by
    ``sig`` (i.e. ``movies_cleaned_df`` has a default 0..n-1 index) -- TODO
    confirm if the frame is ever re-indexed.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the index corresponding to original_title
    idx = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first match so sig[idx] stays a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its score. The queried movie is removed
    # explicitly: relying on it sorting first is wrong whenever scores tie
    # (the stable sort would then drop whichever movie happens to come first).
    candidates = [(pos, score) for pos, score in enumerate(sig[idx]) if pos != idx]

    # Highest score first; ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the best matches
    movie_indices = [pos for pos, _ in candidates[:top_n]]

    return movies_cleaned_df['original_title'].iloc[movie_indices]

In [36]:
# Testing our content-based recommendation system with the film Apollo 18
give_rec('Apollo 18')

1275                        Sunshine
311     The Adventures of Pluto Nash
847                         Semi-Pro
0                             Avatar
487                       Red Planet
770                    Event Horizon
635                        Apollo 13
4108       In the Shadow of the Moon
3624                            Moon
2626                      Idle Hands
Name: original_title, dtype: object