In [87]:
import numpy as np
import pandas as pd
import ast 

In [88]:
# read csv files of movies and credits. 
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [89]:
# let merge our dataframe into one. 
movies = movies.merge(credits, on='title')


In [90]:
# let do data preprocessing and remove unwanted columns from our movies dataframe and keep the related one for our recommendation system
# id, overview, title, genere, keywords, cast and crew

movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [91]:
#lets remove the null data
movies.dropna(inplace=True)

In [92]:
#Let clean our data columns of genres by fetching the name values and make it list. 
def FetchListFromObject(object):
    list = []
    for obj in ast.literal_eval(object):
        list.append(obj['name'])
    
    return list

def FetchListFromObjectForCast(object):
    list = []
    counter = 0
    for obj in ast.literal_eval(object):
        if counter < 3:
            list.append(obj['name'])
            counter+=1

    return list

def FetchListFromObjectForCrew(object):
    list = []
    for obj in ast.literal_eval(object):
        if obj['job'] == 'Director':
            list.append(obj['name'])
    return list

In [93]:
movies['genres']= movies['genres'].apply(FetchListFromObject)
movies['keywords']= movies['keywords'].apply(FetchListFromObject)
movies['cast']= movies['cast'].apply(FetchListFromObjectForCast)
movies['crew']= movies['crew'].apply(FetchListFromObjectForCrew)

In [94]:
#split the overview string into the list of strings
movies['overview']= movies['overview'].apply(lambda x:x.split())

In [95]:
# let apply transformation on words which have two words in a name such as Sam Worthington, so we need to remove space
# between them so our recommendation system consider it a single entity and not confuse it with another person with the name sam.

movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [96]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [97]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [105]:
final_movies = movies[['movie_id','title','tags']]
final_movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [106]:
# finally we now convert our list of tags into single string
final_movies['tags'] = final_movies['tags'].apply(lambda x:" ".join(x))

final_movies

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_movies['tags'] = final_movies['tags'].apply(lambda x:" ".join(x))


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."
...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic..."
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...


In [107]:
final_movies['tags'] = final_movies['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_movies['tags'] = final_movies['tags'].apply(lambda x:x.lower())


In [109]:
final_movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [115]:

#lets first to the stem of each word in our tags
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def Stem(text):
    stemList = []
    for i in text.split():
        stemList.append(ps.stem(i))
        
    return " ".join(stemList)

final_movies['tag'] = final_movies['tags'].apply(Stem)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_movies['tag'] = final_movies['tags'].apply(Stem)


In [128]:
# we will build our model
# Step 1: convert our 'tags' column to vectors with the help of text vectorization (bag of words).
# Step 2: find the cosine distance between each of the vector to find the similarity.
# Step 3: Create a Recommend function which will return the 5 recommended movies.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer(max_features=5000, stop_words='english')

#Step1
movies_vector = cv.fit_transform(final_movies['tag']).toarray()

#Step2
similarity_matrix = cosine_similarity(movies_vector)

#Step3
def Recommend(movie):
    movie_idx = final_movies[final_movies['title'] == movie].index[0]
    distance_similarity = similarity_matrix[movie_idx]
    # we use enumerate function, so that our indexes not get corrupt while sorting.
    # then we sort the data and fetch top 5 similar movies.
    recommend_movies_list = sorted(list(enumerate(distance_similarity)), reverse=True, key=lambda x:x[1])[1:6] 
    for i in recommend_movies_list:
        print(final_movies.iloc[i[0]].title)



In [132]:
# Now we use pickle to send our data for external use
import pickle

pickle.dump(final_movies,open('movies.pkl','wb'))
pickle.dump(similarity_matrix,open('similarity_matrix.pkl','wb'))

In [133]:
final_movies

Unnamed: 0,movie_id,title,tags,tag
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...","in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...","captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,following the death of district attorney harve...,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former military ca...","john carter is a war-weary, former militari ca..."
...,...,...,...,...
4804,9367,El Mariachi,el mariachi just wants to play his guitar and ...,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlywed couple's honeymoon is upended by th...,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduces a dedic...","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambitious new york attorney sam is sent t...,when ambiti new york attorney sam is sent to s...
