In [30]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits, on='title')

In [4]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [5]:
movies.isnull().sum()

movies.dropna(inplace=True)

In [6]:
movies.duplicated().sum()

np.int64(0)

In [7]:
def convert(genres):
    genres = ast.literal_eval(genres)
    L_Genre = []
    for i in genres:
        L_Genre.append(i['name'])
    return L_Genre

In [8]:
movies['genres'] = movies['genres'].apply(convert)

In [9]:
movies['keywords'] = movies['keywords'].apply(convert)

In [10]:
def convertThree(obj):
    L_Cast = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L_Cast.append(i['name'])
            counter += 1
        else:
            break
    return L_Cast

In [11]:
movies['cast'] = movies['cast'].apply(convertThree)

In [12]:
def fetch_director(obj):
    L_Crew = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L_Crew.append(i['name'])
            break
    return L_Crew

In [13]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [14]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [15]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [16]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [17]:
movies['genres'] = movies['genres'].apply(lambda x:[ i.replace(' ', '') for i in x])

movies['keywords'] = movies['keywords'].apply(lambda x:[ i.replace(' ', '') for i in x])

movies['cast'] = movies['cast'].apply(lambda x:[ i.replace(' ', '') for i in x])

movies['crew'] = movies['crew'].apply(lambda x:[ i.replace(' ', '') for i in x])

In [18]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


In [19]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [20]:
new_movies = movies[['movie_id', 'title', 'tags']]

In [21]:
new_movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [22]:
new_movies['tags'] = new_movies['tags'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(lambda x: ' '.join(x))


In [23]:
new_movies['tags'] = new_movies['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(lambda x:x.lower())


In [24]:
new_movies['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [31]:
ps = PorterStemmer()

In [33]:
def stem(text):
    L = []
    for i in text.split():
        L.append(ps.stem(i))
    return ' '.join(L)

new_movies['tags'] = new_movies['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(stem)


In [34]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [35]:
vectors = cv.fit_transform(new_movies['tags']).toarray()

In [36]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])