In [42]:
import numpy as np
import pandas as pd

In [43]:
data = pd.read_csv('movie_metadata.csv')

In [44]:
data.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [45]:
# Keeping the columns that are useful in recommendation system
data = data.loc[:,['actor_1_name','actor_2_name','actor_3_name','director_name','genres','movie_title']]

In [46]:
data.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name,genres,movie_title
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron,Action|Adventure|Fantasy|Sci-Fi,Avatar
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski,Action|Adventure|Fantasy,Pirates of the Caribbean: At World's End
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes,Action|Adventure|Thriller,Spectre
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan,Action|Thriller,The Dark Knight Rises
4,Doug Walker,Rob Walker,,Doug Walker,Documentary,Star Wars: Episode VII - The Force Awakens ...


In [47]:
# dealing with null values
data.isnull().sum(axis=0)

actor_1_name       7
actor_2_name      13
actor_3_name      23
director_name    104
genres             0
movie_title        0
dtype: int64

In [48]:
# replacing null values in the columns using 'unknown'
data['actor_1_name'] = data['actor_1_name'].replace(np.nan, 'unknown')
data['actor_2_name'] = data['actor_2_name'].replace(np.nan, 'unknown')
data['actor_3_name'] = data['actor_3_name'].replace(np.nan, 'unknown')
data['director_name'] = data['director_name'].replace(np.nan, 'unknown')

In [49]:
# In 'genres' columns replacing '|' with whitespace
# so that genres would be considered as different strings
data['genres'] = data['genres'].replace('|', ' ')

In [50]:
data['genres'][0]

'Action|Adventure|Fantasy|Sci-Fi'

In [51]:
# Now converting the movies titles to lowercase for searching simplicity
data['movie_title'] = data['movie_title'].str.lower()

In [52]:
# all movies titles have added special character at the end we need to remove that
data['movie_title'][0]

'avatar\xa0'

In [53]:
data['movie_title'] = data['movie_title'].str[:-1]
data['movie_title'][0]

'avatar'

In [54]:
# save this csv file
data.to_csv('data.csv', index=False)

**Create a recommendation model**

In [55]:
import pandas as pd
import numpy as np
#Libraries for count matrix and cosine matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
# Reading the data file from preprocessed data
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name,genres,movie_title
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron,Action|Adventure|Fantasy|Sci-Fi,avatar
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski,Action|Adventure|Fantasy,pirates of the caribbean: at world's end
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes,Action|Adventure|Thriller,spectre
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan,Action|Thriller,the dark knight rises
4,Doug Walker,Rob Walker,unknown,Doug Walker,Documentary,star wars: episode vii - the force awakens ...


In [57]:
#Making a new column containing combination to all features
data['comb'] = data['actor_1_name']+ ''+ data['actor_2_name']+ ''+ data['actor_3_name']+ '' + data['director_name']+ '' +data['genres']


In [58]:
data['comb'][0]

'CCH PounderJoel David MooreWes StudiJames CameronAction|Adventure|Fantasy|Sci-Fi'

In [60]:
#Creating a Count Matrix
cv = CountVectorizer()
count_matrix = cv.fit_transform(data['comb'])

In [61]:
#creating a similarity score matrix
sim = cosine_similarity(count_matrix)

# Saving the similarity score matrix in a file
np.save('similarity_matrix', sim)

In [62]:
sim

array([[1.        , 0.23904572, 0.11952286, ..., 0.        , 0.        ,
        0.        ],
       [0.23904572, 1.        , 0.14285714, ..., 0.        , 0.        ,
        0.        ],
       [0.11952286, 0.14285714, 1.        , ..., 0.14285714, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.14285714, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [63]:
data.to_csv('data.csv',index=False)