In [3]:
import numpy as np
import pandas as pd

In [4]:
import seaborn as sns

In [5]:
# Load the two TMDB 5000 CSV files (movies and credits, per the file names).
# Paths are relative, so both files must sit in the notebook's working directory.
df1 = pd.read_csv('tmdb_5000_movies.csv')
df2 = pd.read_csv('tmdb_5000_credits.csv')

In [6]:
df1.shape

(4803, 20)

In [7]:
df2.shape

(4803, 4)

In [8]:
# Join the movie and credit tables on their shared 'title' column.
# `copy` expects a bool; the string 'None' previously passed here was merely
# truthy (i.e. the default behaviour), so the argument is dropped.
# NOTE(review): titles are not unique, so this join yields more rows than
# either input (4803 -> 4809 in the recorded outputs); joining on
# id/movie_id would avoid that but changes the resulting column names.
df = pd.merge(df1, df2, how='outer', on='title')

In [9]:
df.shape

(4809, 23)

In [10]:
df.iloc[1,:]

budget                                                          300000000
genres                  [{"id": 12, "name": "Adventure"}, {"id": 14, "...
homepage                     http://disney.go.com/disneypictures/pirates/
id                                                                    285
keywords                [{"id": 270, "name": "ocean"}, {"id": 726, "na...
original_language                                                      en
original_title                   Pirates of the Caribbean: At World's End
overview                Captain Barbossa, long believed to be dead, ha...
popularity                                                     139.082615
production_companies    [{"name": "Walt Disney Pictures", "id": 2}, {"...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2007-05-19
revenue                                                         961000000
runtime                               

## Irrelevant Features

In [11]:
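# Columns treated as irrelevant for the recommender and dropped in the next cell:
# budget, homepage, original_language, original_title, popularity, production_companies, production_countries,
# release_date, revenue, runtime, spoken_languages, status, tagline, vote_average, vote_count, movie_id
# ('Released' from the earlier note is not in the drop list; it looks like a value of `status`, not a column — TODO confirm.)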

In [12]:
# Remove the columns that are not used to build the recommendation tags.
# Each label is listed exactly once ('status' used to appear twice), and the
# result is reassigned rather than mutated in place.
df = df.drop(
    columns=[
        'budget',
        'homepage',
        'original_language',
        'original_title',
        'popularity',
        'production_companies',
        'production_countries',
        'release_date',
        'revenue',
        'runtime',
        'spoken_languages',
        'status',
        'tagline',
        'vote_average',
        'vote_count',
        'movie_id',
    ]
)

In [13]:
df.iloc[0,-1]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

#### Missing Data

In [14]:
df.isnull().sum()

genres      0
id          0
keywords    0
overview    3
title       0
cast        0
crew        0
dtype: int64

### Dropping the missing rows

In [15]:
# Remove every row that contains a missing value (only 'overview' has any,
# per the null counts shown above). The index is not reset here, so the row
# labels keep gaps where rows were removed.
df = df.dropna()

In [16]:
df.isnull().sum()

genres      0
id          0
keywords    0
overview    0
title       0
cast        0
crew        0
dtype: int64

### Duplicate Data

In [17]:
df.duplicated().sum()

0

## Data Preprocessing

### genres

In [18]:
import ast

In [19]:
def convert(obj):
    """Parse a stringified list of dicts and return every entry's 'name' value.

    obj: string holding a Python-literal list of dicts, each with a 'name' key.
    Returns a list of the 'name' values in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [20]:
# Turn each stringified genre list into a plain list of genre names.
df['genres'] = df['genres'].map(convert)

In [21]:
# Remove spaces inside each genre name so a multi-word name stays one token.
df['genres'] = df['genres'].apply(
    lambda names: [name.replace(" ", "") for name in names]
)

### keywords	

In [22]:
# Turn each stringified keyword list into a plain list of keyword names.
df['keywords'] = df['keywords'].map(convert)

In [23]:
# Remove spaces inside each keyword so a multi-word keyword stays one token.
df['keywords'] = df['keywords'].apply(
    lambda names: [name.replace(" ", "") for name in names]
)

### Cast

In [24]:
def top3(obj):
    """Return the 'name' values of the first three entries of a stringified list.

    obj: string holding a Python-literal list of dicts, each with a 'name' key.
    Returns a list with at most three names, in their original order.
    """
    leading_entries = ast.literal_eval(obj)[:3]
    return [entry['name'] for entry in leading_entries]

In [25]:
# Keep only the first three cast names for every movie.
df['cast'] = df['cast'].map(top3)

In [26]:
# Remove spaces inside each cast name so first and last name form one token.
df['cast'] = df['cast'].apply(
    lambda names: [name.replace(" ", "") for name in names]
)

In [27]:
df.iloc[10,-2]

['BrandonRouth', 'KevinSpacey', 'KateBosworth']

### crew

In [28]:
def fetch_director(obj):
    """Return the name of the first crew entry whose job is 'Director'.

    obj: string holding a Python-literal list of dicts with 'job' and 'name' keys.
    Returns a one-element list with that name, or an empty list when no entry
    has the job 'Director'.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first director is kept.
            return [member['name']]
    return []

In [29]:
# Reduce each crew list to (at most) the first director's name.
df['crew'] = df['crew'].map(fetch_director)

In [30]:
# Remove spaces inside the director name so it forms one token.
df['crew'] = df['crew'].apply(
    lambda names: [name.replace(" ", "") for name in names]
)

### overview

In [31]:
# Split each overview on whitespace so it becomes a list of words,
# matching the list shape of the other tag columns.
df['overview'] = df['overview'].apply(lambda text: text.split())

### Creating new column named Tags

In [32]:
# Concatenate the per-movie token lists into one list per row,
# in the order: overview, keywords, genres, cast, crew.
df['tags'] = (
    df['overview']
    + df['keywords']
    + df['genres']
    + df['cast']
    + df['crew']
)

In [33]:
# The source columns are now folded into 'tags', so they are removed.
df = df.drop(columns=['overview', 'crew', 'cast', 'keywords', 'genres'])

In [34]:
# Collapse each token list into a single space-separated string.
df['tags'] = df['tags'].apply(" ".join)

In [35]:
# Lower-case the tag strings so matching is case-insensitive.
df['tags'] = df['tags'].apply(lambda text: text.lower())

In [36]:
df.iloc[0,-1]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d action adventure fantasy sciencefiction samworthington zoesaldana sigourneyweaver jamescameron'

## Stemming

In [37]:
import nltk

In [38]:
from nltk.stem.porter import PorterStemmer

In [39]:
ps=PorterStemmer()

In [40]:
def stem(str):
    """Stem every whitespace-separated token of `str` and re-join with single spaces.

    Uses the notebook-level stemmer `ps`. The parameter name shadows the
    built-in `str` inside this function only.
    """
    return " ".join(ps.stem(token) for token in str.split())

In [41]:
# Replace every word in the tag strings with its stem.
df['tags'] = df['tags'].map(stem)

## Text vectorization (bag of words)

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
# Bag-of-words vectorizer: English stop words are removed and the vocabulary
# is capped at 5000 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [44]:
# Learn the vocabulary from the tag strings and convert the resulting sparse
# count matrix to a dense array (one row per row of `df`, in positional order).
vectors = cv.fit_transform(df['tags']).toarray()

In [45]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Model Development

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

In [47]:
# Pairwise cosine similarity between all rows of `vectors`; entry [i, j]
# compares the movies at positional rows i and j of `df`.
similarity =cosine_similarity(vectors)

In [71]:
def recommmend(str, n=5):
    """Print the titles of the `n` movies most similar to the given title.

    str: exact movie title to look up in `df['title']` (the name shadows the
         built-in `str` inside this function only).
    n:   number of recommendations to print (default 5).

    Raises IndexError when no row of `df` has that title.
    Reads the notebook-level `df` and `similarity`; returns None.
    """
    # `similarity` is indexed by row *position*, while `df.index` holds labels
    # that have gaps after the merge and dropna. Using a label here would pick
    # the wrong row or run off the end of the matrix.
    matches = (df['title'] == str).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"no movie titled {str!r} found in df")
    position = int(matches[0])

    distances = similarity[position]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])
    # Exclude the query movie explicitly instead of assuming it sorts first
    # (an identical tag vector elsewhere would tie with it at 1.0).
    neighbours = [row for row, _ in ranked if row != position][:n]
    for row in neighbours:
        print(df['title'].iloc[row])

In [72]:
recommmend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [73]:
df

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


# Deployment

In [50]:
import pickle

In [51]:
# Persist the processed frame. The context manager closes the file handle
# deterministically, so the pickle is fully flushed before the cell returns.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(df, movies_file)

In [54]:
# Persist the similarity matrix. The context manager closes the file handle
# deterministically, so the pickle is fully flushed before the cell returns.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)