In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import scale

## Load Data

In [2]:
df_movies = pd.read_csv('./movies_metadata_clean.csv', encoding='ISO-8859-1')
df_users = pd.read_csv('./movies_ratings_small.csv', encoding='ISO-8859-1')

## Build recommendations

1. content based filtering

In [3]:
df_movies.shape

# Remove duplicates
df_movies.drop_duplicates(subset='title',keep='first',inplace=True)
df_movies.shape

(45463, 24)

(42277, 24)

In [4]:
# Prepare description column

df_movies['tagline'] = df_movies['tagline'].fillna('')
df_movies['description'] = df_movies['overview'] + df_movies['tagline']
df_movies['description'] = df_movies['description'].fillna('')
df_movies['description']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
45456    It's the year 3000 AD. The world's most danger...
45458    Rising and falling between a man and woman.Ris...
45459    An artist struggles to finish his work while a...
45461    In a small town live two brothers, one a minis...
45462    50 years after decriminalisation of homosexual...
Name: description, Length: 42277, dtype: object

## Build model

In [5]:
## Generate a matrix of common terms that show up in each movie

from sklearn.feature_extraction.text import TfidfVectorizer

model = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')

tfidf_matrix = model.fit_transform(df_movies['description'])
tfidf_matrix.shape

(42277, 1047434)

In [6]:
# Calculate the cosine similarity between each pair of movies

from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
cosine_sim.shape

(42277, 42277)