In [1]:
import pandas as pd
import numpy as np
import ast
import warnings 
warnings.filterwarnings("ignore")

In [2]:
pd.set_option('display.max_columns',None)

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')

In [4]:
movies.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
855,56000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 36, ""name...",,16072,"[{""id"": 6091, ""name"": ""war""}, {""id"": 14643, ""n...",en,Gods and Generals,The film centers mostly around the personal an...,4.362535,"[{""name"": ""Warner Bros. Pictures"", ""id"": 174},...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2003-02-21,12923936,214.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The nations heart was touched by...,Gods and Generals,6.1,48


In [5]:
movies.shape

(4803, 20)

In [6]:
credits  = pd.read_csv('tmdb_5000_credits.csv')

In [7]:
credits.sample()

Unnamed: 0,movie_id,title,cast,crew
4708,354624,Heroes of Dirt,"[{""cast_id"": 0, ""character"": ""Phineas Cooper"",...","[{""credit_id"": ""5694d6329251414b6e001c8a"", ""de..."


In [8]:
credits.shape

(4803, 4)

In [9]:
movies = movies.merge(credits,on='title')

In [10]:
movies.shape

(4809, 23)

In [11]:
movies.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
2963,10800000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 53, ""nam...",,11917,"[{""id"": 8636, ""name"": ""blood splatter""}, {""id""...",en,Saw V,Detective Hoffman is seemingly the last person...,32.943848,"[{""name"": ""Lionsgate"", ""id"": 1632}, {""name"": ""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2008-10-23,113857533,92.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,You Won't Believe How It Ends,Saw V,5.9,820,11917,"[{""cast_id"": 5, ""character"": ""John Kramer"", ""c...","[{""credit_id"": ""536b9e6e0e0a2647c400c3d8"", ""de..."


In [12]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

### Remove Unwanted columns and select only useful columns

In [13]:
movies = movies[['movie_id','title','overview','genres','keywords','cast', 'crew']]

In [14]:
movies.sample()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
3015,9794,Employee of the Month,When he hears that the new female employee dig...,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...","[{""id"": 1361, ""name"": ""salesclerk""}, {""id"": 15...","[{""cast_id"": 1, ""character"": ""Zack"", ""credit_i...","[{""credit_id"": ""52fe452ec3a36847f80c0d9b"", ""de..."


### Checking for null values

In [15]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

- Since there are only three null values in the overview column, we will drop these rows.

In [16]:
movies.dropna(inplace=True)

In [17]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

### Checking for Duplicate values

In [18]:
movies.duplicated().sum()

0

In [19]:
movies.sample()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
558,35,The Simpsons Movie,After Homer accidentally pollutes the town's w...,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 35, ""...","[{""id"": 494, ""name"": ""father son relationship""...","[{""cast_id"": 2, ""character"": ""Homer / Itchy / ...","[{""credit_id"": ""52fe4211c3a36847f8001521"", ""de..."


### genres

In [20]:
movies['genres'] =  movies['genres'].apply(lambda x : [genre['name'] for genre in ast.literal_eval(x)])

In [21]:
movies.sample()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
3475,92591,Bernie,"In this true story in the tiny, rural town of ...","[Comedy, Crime, Drama]","[{""id"": 17989, ""name"": ""prison visit""}, {""id"":...","[{""cast_id"": 13, ""character"": ""Bernie Tiede"", ...","[{""credit_id"": ""52fe49039251416c750baba5"", ""de..."


### keywords

In [22]:
movies.iloc[1].keywords

'[{"id": 270, "name": "ocean"}, {"id": 726, "name": "drug abuse"}, {"id": 911, "name": "exotic island"}, {"id": 1319, "name": "east india trading company"}, {"id": 2038, "name": "love of one\'s life"}, {"id": 2052, "name": "traitor"}, {"id": 2580, "name": "shipwreck"}, {"id": 2660, "name": "strong woman"}, {"id": 3799, "name": "ship"}, {"id": 5740, "name": "alliance"}, {"id": 5941, "name": "calypso"}, {"id": 6155, "name": "afterlife"}, {"id": 6211, "name": "fighter"}, {"id": 12988, "name": "pirate"}, {"id": 157186, "name": "swashbuckler"}, {"id": 179430, "name": "aftercreditsstinger"}]'

In [23]:
movies['keywords'] =  movies['keywords'].apply(lambda x : [keyword['name'] for keyword in ast.literal_eval(x)])

In [24]:
movies.sample(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
3272,23570,The Pallbearer,Aspiring architect Tom Thompson is told by mys...,"[Comedy, Romance]","[independent film, mistaken identity]","[{""cast_id"": 1, ""character"": ""Tom Thompson"", ""...","[{""credit_id"": ""561e23c69251410d8f001946"", ""de..."
693,210577,Gone Girl,With his wife's disappearance having become th...,"[Mystery, Thriller, Drama]","[based on novel, marriage crisis, disappearanc...","[{""cast_id"": 7, ""character"": ""Nick Dunne"", ""cr...","[{""credit_id"": ""56d1e21e9251413e5f00981a"", ""de..."


### cast

In [25]:
movies.iloc[1].cast

'[{"cast_id": 4, "character": "Captain Jack Sparrow", "credit_id": "52fe4232c3a36847f800b50d", "gender": 2, "id": 85, "name": "Johnny Depp", "order": 0}, {"cast_id": 5, "character": "Will Turner", "credit_id": "52fe4232c3a36847f800b511", "gender": 2, "id": 114, "name": "Orlando Bloom", "order": 1}, {"cast_id": 6, "character": "Elizabeth Swann", "credit_id": "52fe4232c3a36847f800b515", "gender": 1, "id": 116, "name": "Keira Knightley", "order": 2}, {"cast_id": 12, "character": "William \\"Bootstrap Bill\\" Turner", "credit_id": "52fe4232c3a36847f800b52d", "gender": 2, "id": 1640, "name": "Stellan Skarsg\\u00e5rd", "order": 3}, {"cast_id": 10, "character": "Captain Sao Feng", "credit_id": "52fe4232c3a36847f800b525", "gender": 2, "id": 1619, "name": "Chow Yun-fat", "order": 4}, {"cast_id": 9, "character": "Captain Davy Jones", "credit_id": "52fe4232c3a36847f800b521", "gender": 2, "id": 2440, "name": "Bill Nighy", "order": 5}, {"cast_id": 7, "character": "Captain Hector Barbossa", "credit_

In [26]:
movies['cast'] =  movies['cast'].apply(lambda x: [name['name'] for name in ast.literal_eval(x)][:5])

In [27]:
movies.sample(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
1333,4944,Burn After Reading,When a disc containing memoirs of a former CIA...,"[Comedy, Drama]","[blackmail, paranoia, fitness-training, plasti...","[George Clooney, Frances McDormand, Brad Pitt,...","[{""credit_id"": ""569a8307c3a36872bb000465"", ""de..."
3505,35944,Lucky Break,Half-way through his 12-year prison sentence f...,"[Action, Comedy]",[],"[James Nesbitt, Olivia Williams, Timothy Spall...","[{""credit_id"": ""578d08adc3a3686efa002328"", ""de..."
2820,10611,Barbershop,A day in the life of a barbershop on the south...,"[Comedy, Drama]","[hairdresser, shop, neighbor, debt, meeting, h...","[Ice Cube, Anthony Anderson, Cedric the Entert...","[{""credit_id"": ""52fe43939251416c75015c7f"", ""de..."


### crew

In [28]:
movies.iloc[1].crew

'[{"credit_id": "52fe4232c3a36847f800b579", "department": "Camera", "gender": 2, "id": 120, "job": "Director of Photography", "name": "Dariusz Wolski"}, {"credit_id": "52fe4232c3a36847f800b4fd", "department": "Directing", "gender": 2, "id": 1704, "job": "Director", "name": "Gore Verbinski"}, {"credit_id": "52fe4232c3a36847f800b54f", "department": "Production", "gender": 2, "id": 770, "job": "Producer", "name": "Jerry Bruckheimer"}, {"credit_id": "52fe4232c3a36847f800b503", "department": "Writing", "gender": 2, "id": 1705, "job": "Screenplay", "name": "Ted Elliott"}, {"credit_id": "52fe4232c3a36847f800b509", "department": "Writing", "gender": 2, "id": 1706, "job": "Screenplay", "name": "Terry Rossio"}, {"credit_id": "52fe4232c3a36847f800b57f", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "52fe4232c3a36847f800b585", "department": "Editing", "gender": 2, "id": 1722, "job": "Editor", "name": "Craig Wood"}, {"credit_id": "52f

In [29]:
movies['crew'] = movies['crew'].apply(lambda x: [name['name'] for name in ast.literal_eval(x) if name['job'] == 'Director'])

In [30]:
movies.sample(3)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
3768,6537,The Orphanage,A woman brings her family back to her childhoo...,"[Horror, Drama, Thriller]","[schizophrenia, suppressed past, wife]","[Belén Rueda, Fernando Cayo, Roger Príncep, Ma...",[Juan Antonio Bayona]
3177,6521,The Contender,Political thriller about Laine Hanson's nomina...,"[Drama, Thriller]","[politics, suspense]","[Gary Oldman, Joan Allen, Jeff Bridges, Christ...",[Rod Lurie]
995,9923,Domino,"The daughter of actor, Laurence Harvey turns a...","[Action, Crime]","[bounty hunter, fbi, weapon, spectacle]","[Keira Knightley, Mickey Rourke, Edgar Ramírez...",[Tony Scott]


### overview 

- We can't merge a string column with a list column, so we need to convert the overview column from a string to a list.

In [31]:
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [39]:
movies['genres'] = movies['genres'].apply(lambda x : [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x : [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x : [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x : [i.replace(" ","") for i in x])

##### Now, merge the overview, genres, keywords, cast, and crew columns to create a tag column.

In [40]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [42]:
movies.drop(columns=['overview','genres','keywords','cast','crew'],inplace=True)

In [43]:
movies

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [45]:
movies['tags'] = movies['tags'].apply(lambda x : ' '.join(x))

In [46]:
movies

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."
...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic..."
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...


#### Now, we use PorterStemmer to get words in their root form.

In [56]:
from nltk.stem  import PorterStemmer
ps = PorterStemmer()

In [60]:
def word_stem(tag): 
    L = []
    for i in tag.split():
        stem_word = ps.stem(i)
        L.append(stem_word)
    
    return " ".join(L)

In [62]:
movies['tags'] = movies['tags'].apply(word_stem)

In [63]:
movies.sample(3)

Unnamed: 0,movie_id,title,tags
3863,252360,In the Name of the King III,"hazen kaine, an american contract killer live ..."
1625,11137,The Prince & Me,a fairi tale love-stori about pre-m student pa...
1904,18147,Unaccompanied Minors,five dispar kid snow in at the airport on chri...


#### Now, we use CountVectorizer to convert text into a matrix of token counts.

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

In [65]:
cv = CountVectorizer(stop_words='english',max_features=5000)

In [68]:
vectors = cv.fit_transform(movies['tags']).toarray()

In [71]:
vectors[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [73]:
vectors.shape

(4806, 5000)

#### Now, we use cosine_similarity to check the angle between the vectors we obtained previously, which helps in recommending movies.

In [74]:
from sklearn.metrics.pairwise import cosine_similarity

In [75]:
CS = cosine_similarity(vectors)

In [95]:
movies_list = sorted(list(enumerate(CS[0])),reverse=True,key= lambda x : x[1])[1:6]

In [None]:
movies.iloc[1216].title

In [151]:
for i in movies_list:
    print(movies.iloc[i[0]].title)
    print(i)
    print(movies.iloc[i[0]].movie_id)

Aliens vs Predator: Requiem
(1216, 0.2926585541394632)
440
Independence Day
(507, 0.26130213378560535)
602
Falcon Rising
(3730, 0.25732511773283273)
270938
Battle: Los Angeles
(582, 0.2484013136974297)
44943
Titan A.E.
(539, 0.24659848095803594)
7450


In [118]:
def recommend_movie(movie_name):
    index = movies[movies['title'] == movie_name].index[0]
    movies_list = sorted(list(enumerate(CS[index])),reverse=True,key= lambda x : x[1])[1:6]
    for title in movies_list:
        print(movies.iloc[title[0]].title)

In [128]:
recommend_movie('Hulk')

Godzilla 2000
Transformers: Age of Extinction
The Incredible Hulk
The Island of Dr. Moreau
Super


In [129]:
import pickle 

In [131]:
pickle.dump(movies,open('movies.pkl','wb'))

In [139]:
movies = pickle.load(open('movies.pkl','rb'))

In [144]:
pickle.dump(CS,open('CS.pkl','wb'))