In [102]:
import warnings
warnings.filterwarnings('ignore')

In [103]:
import numpy as np
import pandas as pd

In [104]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [105]:
movies = movies.merge(credits, on = 'title')

In [106]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [107]:
# Genres
# id
# keywords
# title
# Overview
# ast
# crew

movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [108]:
movies.dropna(inplace = True)

In [109]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [110]:
import ast
ast.literal_eval(movies.iloc[0].genres)

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [111]:
def convert(obj):
    l = []
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return  l

In [112]:
movies['genres'] = movies['genres'].apply(convert)

In [113]:
movies['keywords'] = movies['keywords'].apply(convert)

In [114]:
def convert3(obj):
    l = []
    counter =0
    for i in ast.literal_eval(obj):
        if counter !=3:
            l.append(i['name'])
            counter +=1
        else:
            break
    return  l

In [115]:
movies['cast'] = movies['cast'].apply(convert3)

In [116]:
def fetch_director(obj):
    l = []
    counter =0
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            l.append(i['name'])
            break
    return  l

In [117]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [118]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [119]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ", "") for i in x])

In [120]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [121]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [122]:
new_df = movies[['movie_id', 'title', 'tags']]

In [123]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

In [124]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

In [125]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [126]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [127]:
import nltk

In [128]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [129]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [130]:
new_df['tags'] = new_df['tags'].apply(stem)

In [131]:
from sklearn.metrics.pairwise import cosine_similarity

In [132]:
similarity = cosine_similarity(vectors)

In [142]:
sorted(list(enumerate(similarity[0])),reverse=True, key = lambda x:x[1])[1:6]

[(539, 0.26089696604360174),
 (1194, 0.2581988897471611),
 (507, 0.25302403842552984),
 (260, 0.25110592822973776),
 (1216, 0.24944382578492943)]

In [143]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x:x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [146]:
recommend('Inception')

Duplex
The Helix... Loaded
Star Trek II: The Wrath of Khan
Timecop
Chicago Overcoat


In [147]:
import pickle

In [148]:
pickle.dump(new_df, open('movies.pkl', 'wb'))

In [149]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [150]:
movies = pd.read_pickle('movies.pkl')