In [15]:
import pandas as pd
import pickle
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

In [2]:
# Only these four columns are used by the rest of the notebook. `.copy()`
# makes the subset an independent frame, so the in-place cleaning and column
# assignments performed on `df` afterwards do not raise SettingWithCopyWarning.
df = pd.read_csv("data/movies.csv")
df = df[['genres','overview','keywords','title']].copy()
df.head()

Unnamed: 0,genres,overview,keywords,title
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",Spectre
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",The Dark Knight Rises
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",John Carter


In [3]:
# Count the missing (NaN) values in each column.
df.isna().sum()

genres      0
overview    3
keywords    0
title       0
dtype: int64

In [4]:
# Discard every row that has a missing value, then renumber the rows 0..n-1
# without keeping the old index as a column.
df = df.dropna(axis=0).reset_index(drop=True)

In [5]:
nlp = spacy.load("en_core_web_sm")


def _clean_overview(doc):
    """Return the lemmas of ``doc`` joined by single spaces.

    Punctuation tokens are skipped, and so is any token whose lower-cased
    lemma appears in the pipeline's default stop-word set.
    """
    stop_words = nlp.Defaults.stop_words
    return " ".join(
        token.lemma_
        for token in doc
        if not token.is_punct and token.lemma_.lower() not in stop_words
    )


# nlp.pipe streams the texts through the pipeline in batches, which avoids the
# per-call overhead of running nlp() once per row. The docs come back in input
# order, so the resulting list lines up with the rows of `df`.
df['overview'] = [
    _clean_overview(doc) for doc in nlp.pipe(df['overview'].astype(str))
]

In [6]:
def extract_data(data_frame):
    """Collapse genres, keywords and overview into a single ``data`` column.

    ``genres`` and ``keywords`` hold JSON-encoded lists of objects that each
    carry a ``name`` key. Every name has its inner spaces replaced by hyphens
    and is followed by one space. For each row the genre names, the keyword
    names and the overview text are concatenated, in that order.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with ``genres``, ``keywords`` and ``overview`` columns. It is
        modified in place: ``data`` is added and the three source columns are
        dropped.

    Returns
    -------
    pandas.DataFrame
        The same ``data_frame`` object that was passed in.
    """
    def tags(raw_json):
        # The trailing space after every name separates the tags from each
        # other and from the text appended after them.
        return "".join(
            entry['name'].replace(" ", "-") + " " for entry in json.loads(raw_json)
        )

    # Series addition aligns on the index, so this depends only on the frame
    # that was passed in (not on a global `df`) and works for any index labels.
    data_frame['data'] = (
        data_frame['genres'].map(tags)
        + data_frame['keywords'].map(tags)
        + data_frame['overview']
    )
    data_frame.drop(['genres', 'overview', 'keywords'], inplace=True, axis=1)
    return data_frame

In [7]:
# extract_data modifies `df` in place and returns it, so `modified_movies_data`
# and `df` refer to the same frame from here on.
modified_movies_data = extract_data(df)

In [8]:
# Write the processed frame to CSV, leaving out the row index.
modified_movies_data.to_csv("data/modified_movies_data.csv",index=False)

In [9]:
# Fit a TF-IDF vocabulary on the combined text of every movie and vectorise it
# in one step.
# NOTE(review): with default settings TfidfVectorizer tokenises on word
# characters, so hyphen-joined tags such as "culture-clash" are presumably
# indexed as separate words - confirm this is intended.
movie_texts = modified_movies_data['data']
tfidf = TfidfVectorizer()
tf_mat = tfidf.fit_transform(movie_texts)

In [10]:
# Pairwise cosine similarity of every movie vector against every other one.
cos_sim = cosine_similarity(X=tf_mat, Y=tf_mat)

In [11]:
# Maps each movie title to its row label in `df`; recommend_movie uses that
# label as the row number into `cos_sim`.
indices = pd.Series(df.index,index=df.title)


def recommend_movie(title,how_many_movies = 5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``. If the title occurs more than once,
        the first occurrence is used.
    how_many_movies : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    list
        Titles ordered from most to least similar according to ``cos_sim``.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    matches = indices[indices.index == title]
    if matches.empty:
        raise KeyError(f"Movie title not found: {title!r}")
    # A title can occur more than once; `indices[title]` would then be a
    # Series rather than a scalar, so take the first occurrence explicitly.
    index_title = matches.iloc[0]
    sim = pd.Series(cos_sim[index_title]).sort_values(ascending=False)
    # Remove the queried movie by label instead of assuming it sorted first:
    # another movie with an equal score could otherwise push it into the result.
    sim = sim.drop(index_title).head(how_many_movies)
    # `sim.index` holds row numbers of `cos_sim`. This assumes they match the
    # order of `indices` (true when `df` has a 0..n-1 index), which turns the
    # title lookup into a direct positional access instead of a full scan.
    return [indices.index[i] for i in sim.index]

In [12]:
# Example: recommendations for "Avatar" using the default result count.
recommend_movie("Avatar")

['Aliens', 'Mission to Mars', 'Alien³', 'Moonraker', 'Silent Running']

In [13]:
# Example: recommendations for a title that belongs to a film series.
recommend_movie("Pirates of the Caribbean: At World's End")

["Pirates of the Caribbean: Dead Man's Chest",
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'Pirates of the Caribbean: On Stranger Tides',
 'The Pirates! In an Adventure with Scientists!',
 "Nim's Island"]

In [17]:
# Persist the similarity matrix. The context manager closes the file handle
# (flushing buffered bytes to disk) even if pickling raises.
with open("similarity.pkl", 'wb') as similarity_file:
    pickle.dump(cos_sim, similarity_file)