In [1]:
import numpy as np
import pandas as pd
import ast
import nltk
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

# Loading the Dataset

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# movies data

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


# credits data

In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# merging credits and movies on basis of title

In [5]:
movies = movies.merge(credits, on='title')

In [6]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
movies = movies.head(500)

In [8]:
movies.shape

(500, 23)

In [9]:
# genres
# id
# keywords
# title
# overview
# cast
# crew

movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords','cast', 'crew']]

# filtered data

In [10]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# data pre-processing

# missing data


In [11]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [12]:
movies.dropna(inplace=True)

# checking duplicate data

In [13]:
movies.duplicated().sum()

0

# applying preprocessing

In [14]:
def convert_to_list(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [16]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [17]:
def f(items, movies):
    for col in items:
        movies[col] = movies[col].apply(convert_to_list)
    return movies
    

In [18]:
change = ['cast','genres','keywords']
movies = f(change, movies)

In [19]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [20]:
movies['cast']

0      [Sam Worthington, Zoe Saldana, Sigourney Weave...
1      [Johnny Depp, Orlando Bloom, Keira Knightley, ...
2      [Daniel Craig, Christoph Waltz, Léa Seydoux, R...
3      [Christian Bale, Michael Caine, Gary Oldman, A...
4      [Taylor Kitsch, Lynn Collins, Samantha Morton,...
                             ...                        
495    [Dwayne Johnson, Josh Hutcherson, Kristin Davi...
496    [Bill Hader, Anna Faris, James Caan, Will Fort...
497    [Anthony Hopkins, Edward Norton, Ralph Fiennes...
498    [Viggo Mortensen, Zuleikha Robinson, Omar Shar...
499    [Adam Sandler, Katie Holmes, Al Pacino, Eugeni...
Name: cast, Length: 500, dtype: object

In [21]:
def trunc(obj):
    newObj = obj[1:3]
    return newObj
    

In [22]:
 movies['cast'] = movies['cast'].apply(trunc)

In [None]:
#function to fetch the movie_director

In [23]:
def get_director(obj):
    director = []
    for item in ast.literal_eval(obj):
        if item['job'] == 'Director':
            director.append(item['name'])
            break
    return director
    

In [24]:
movies['crew'] = movies['crew'].apply(get_director)

In [25]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [26]:
def split_theory(obj):
    return obj.split()

In [27]:
movies['overview'] = movies['overview'].apply(split_theory)

In [28]:
movies['overview']

0      [In, the, 22nd, century,, a, paraplegic, Marin...
1      [Captain, Barbossa,, long, believed, to, be, d...
2      [A, cryptic, message, from, Bond’s, past, send...
3      [Following, the, death, of, District, Attorney...
4      [John, Carter, is, a, war-weary,, former, mili...
                             ...                        
495    [Sean, Anderson, partners, with, his, mom's, b...
496    [After, the, disastrous, food, storm, in, the,...
497    [Former, FBI, Agent, Will, Graham,, who, was, ...
498    [Set, in, 1890,, this, is, the, story, of, a, ...
499    [Jack, Sadelstein,, a, successful, advertising...
Name: overview, Length: 500, dtype: object

In [29]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [None]:
#function to format the words used in formation of tags

In [30]:
def formatting_tagWords(obj, movies):
    for col in obj:
        movies[col] = movies[col].apply(lambda x:[i.replace(" ","") for i in x])
    return movies

In [31]:
movies = formatting_tagWords(['genres', 'keywords', 'cast', 'crew'], movies)

In [34]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [None]:
#finally created new-movie-df with movie-id, title, tags as cols

In [35]:
new_movie_df = movies[['movie_id', 'title', 'tags']]

In [36]:

new_movie_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [37]:
import warnings
warnings.filterwarnings('ignore')
warnings.warn('DelftStack')
warnings.warn('Do not show this message')

new_movie_df['tags'] = new_movie_df['tags'].apply(lambda x: " ".join(x))

In [38]:
new_movie_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."


In [39]:
new_movie_df['tags'] = new_movie_df['tags'].apply(lambda x: x.lower())

In [40]:
new_movie_df['tags'].head(1)


0    in the 22nd century, a paraplegic marine is di...
Name: tags, dtype: object

In [41]:

cv = CountVectorizer(max_features=5000, stop_words = 'english')

In [42]:
vectors = cv.fit_transform(new_movie_df['tags']).toarray()

In [43]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [45]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [46]:

ps = PorterStemmer()

In [None]:
#stemming words

In [47]:
def stem(text):
    newList = []
    for word in text.split():
        newList.append(ps.stem(word))
    stemmed_string = " ".join(newList)
    return stemmed_string

In [48]:
new_movie_df['tags'] = new_movie_df['tags'].apply(stem)

## applying consine-similarity

In [50]:
similarity_matrix = cosine_similarity(vectors)

In [51]:
def recommend_Movies(movie):
    index = new_movie_df[new_movie_df['title'] == movie].index[0]
    set_limit = 6
    distances = sorted(list(enumerate(similarity_matrix[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:set_limit]:
        print(new_movie_df.iloc[i[0]].title)
        

In [52]:
recommend_Movies('Avatar')

Ender's Game
Jupiter Ascending
Edge of Tomorrow
Journey 2: The Mysterious Island
The Lovers


In [54]:
pickle.dump(new_movie_df, open('movies.pkl', 'wb'))

In [55]:
pickle.dump(similarity_matrix, open('similarity_matrix.pkl', 'wb'))

In [56]:
pickle.dump(new_movie_df.to_dict(), open('movie_dict_format.pkl', 'wb'))