In [13]:
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
import textblob
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Location of the raw IMDB export, relative to this notebook.
url = '../data/imdb_raw.csv'

# Read the CSV into the working frame that every later cell builds on.
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as a quick sanity check of the load.
df.head(5)

In [None]:
# Build one free-text field per movie from its title, director and genre.
# Missing values are replaced with empty strings first: with plain `+`
# concatenation a single NaN in any of the three columns turns the whole
# result into NaN, which later fails in remove_stopwords (NaN has no .split()).
df['data'] = (
    df[['title', 'director', 'genre']]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)
df['data']

In [None]:
# Remove punctuation: drop every character that is neither a word character
# nor whitespace. `regex=True` must be passed explicitly because pandas >= 2.0
# treats the pattern as a literal string by default, in which case nothing
# would be removed. The raw string avoids the invalid-escape warning for `\w`.
df['data'] = df['data'].str.replace(r'[^\w\s]', '', regex=True)
# Lower-case so that the stop-word filter and the vectorizer are case-insensitive.
df['data'] = df['data'].str.lower()

In [None]:
@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is dropped, and the remaining tokens are re-joined with
    single spaces. Matching is exact (case-sensitive), so callers should
    lower-case the text beforehand.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The filtered text; an empty string if no tokens remain.
    """
    # Look the stop words up once per call (and load them once per kernel)
    # instead of re-reading the corpus for every single word.
    stop_words = _english_stopwords()
    return " ".join(word for word in text.split() if word not in stop_words)
# Run the stop-word filter over every row of the combined text column.
df['data'] = df['data'].map(remove_stopwords)
df['data']

In [11]:
#vectorize the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['data']).toarray()
X.shape

(1000, 2327)

In [14]:
# Pairwise cosine similarity between all rows of X; with a single argument
# scikit-learn compares X against itself.
similarity = cosine_similarity(X)
similarity.shape

(1000, 1000)

In [15]:
def get_index_from_title(title):
    """Return the index label of the first row of `df` whose title equals `title`.

    Parameters
    ----------
    title : str
        Exact (case-sensitive) movie title to look up in `df['title']`.

    Returns
    -------
    The index label of the first matching row, or None when no row matches.
    """
    matches = df.index[(df['title'] == title).to_numpy()]
    # Check for "no match" explicitly rather than with a bare `except`, so that
    # real problems (missing `df`, missing 'title' column, interrupts) surface
    # instead of being reported as "title not found".
    if len(matches) == 0:
        return None
    return matches[0]

def recommend_movie(title, Limit = 10):
    """Return the movies most similar to `title`, best match first.

    Parameters
    ----------
    title : str
        Exact title of the movie to find recommendations for.
    Limit : int, default 10
        Maximum number of recommendations to return; values <= 0 yield [].

    Returns
    -------
    list of (title, score) tuples
        Other movies ordered by descending similarity score. Returns an
        empty list when `title` is not found.

    Notes
    -----
    The value returned by get_index_from_title is used directly as the row
    number into `similarity`, so this assumes `df` still has its default
    0..n-1 index (NOTE(review): confirm if `df` is ever filtered/re-indexed).
    """
    index = get_index_from_title(title)
    if index is None:
        return []

    titles = df['title'].to_numpy()
    scores = similarity[index]

    # Exclude the queried movie by its row position. Dropping "whatever sorts
    # first" is wrong when another movie ties with it at the top score: the
    # stable sort may then rank the query itself second and return it.
    candidates = [
        (other_title, score)
        for position, (other_title, score) in enumerate(zip(titles, scores))
        if position != index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:max(Limit, 0)]

In [16]:
# Example: the five closest matches for one title.
recommend_movie('Harry Potter and the Goblet of Fire', Limit=5)

[('Harry Potter and the Prisoner of Azkaban', 0.6666666666666669),
 ("Harry Potter and the Sorcerer's Stone", 0.6092717958449424),
 ('Harry Potter and the Deathly Hallows: Part 2', 0.5833333333333335),
 ('Harry Potter and the Deathly Hallows: Part 1', 0.5833333333333335),
 ('The Wizard of Oz', 0.4811252243246882)]