# Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Reading the dataset

In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [3]:
movies = movies.merge(credits, on = "title")

In [4]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [5]:
movies= movies[['movie_id', 'title', 'genres', 'original_language', 'keywords', 'overview', 'cast', 'crew']]
movies.shape

(4809, 8)

# Cleaning the dataset

In [6]:
movies.isnull().sum()

movie_id             0
title                0
genres               0
original_language    0
keywords             0
overview             3
cast                 0
crew                 0
dtype: int64

In [7]:
movies.dropna(inplace = True)
movies.isnull().sum()

movie_id             0
title                0
genres               0
original_language    0
keywords             0
overview             0
cast                 0
crew                 0
dtype: int64

In [8]:
#checking duplicate values
movies.duplicated().sum()

0

In [9]:
#converting string to list to clean columns

In [10]:
import ast

In [11]:
def convert(text):
    l=[]
    for i in ast.literal_eval(text):
        l.append(i['name'])
    return l

In [12]:
#cleaning genres
movies['genres']= movies['genres'].apply(convert)

In [13]:
#cleaning keywords
movies['keywords'] = movies['keywords'].apply(convert)

In [14]:
#cleaning cast 
def convert(text):
    l=[]
    counter = 0
    for i in ast.literal_eval(text):
        if counter<5:
            l.append(i['name'])
        counter+=1
    return l
movies['cast']=movies['cast'].apply(convert)

In [15]:
#cleaning crew
def convert(text):
    l=[]
    for i in ast.literal_eval(text):
        if i['job']=='Director':
            l.append(i['name'])
            break
    return l

In [16]:
movies['crew']=movies['crew'].apply(convert)

In [17]:
#cleaning overview
movies['overview']=movies['overview'].apply(lambda x: x.split())

In [18]:
movies.head()

Unnamed: 0,movie_id,title,genres,original_language,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",en,"[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",en,"[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]",en,"[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",en,"[dc comics, crime fighter, terrorist, secret i...","[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]",en,"[based on novel, mars, medallion, space travel...","[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [19]:
def removing_space(word):
    l = []
    for i in word:
        l.append(i.replace(" ",""))
    return l

In [20]:
movies['cast']=movies['cast'].apply(removing_space)
movies['crew']=movies['crew'].apply(removing_space)
movies['genres']=movies['genres'].apply(removing_space)
movies['keywords']=movies['keywords'].apply(removing_space)

In [21]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['cast'] + movies['crew'] + movies['keywords']

In [22]:
df = movies[['movie_id', 'title', 'tags', 'original_language']]

In [23]:
df['tags'] = df['tags'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: ' '.join(x))


In [24]:
df['tags'] = df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: x.lower())


# Model making 

In [25]:
import nltk
from nltk.stem import PorterStemmer

In [26]:
ps = PorterStemmer()

In [27]:
def stems(text):
    l=[]
    for i in text.split():
        l.append(ps.stem(i))
    return ' '.join(l)

In [28]:
df['tags'] = df['tags'].apply(stems)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stems)


In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [30]:
vector = cv.fit_transform(df['tags']).toarray()

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
similarity = cosine_similarity(vector)

In [33]:
similarity

array([[1.        , 0.08006408, 0.08492078, ..., 0.04441156, 0.        ,
        0.        ],
       [0.08006408, 1.        , 0.05892557, ..., 0.02311251, 0.        ,
        0.02541643],
       [0.08492078, 0.05892557, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04441156, 0.02311251, 0.02451452, ..., 1.        , 0.0418121 ,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.0418121 , 1.        ,
        0.0919601 ],
       [0.        , 0.02541643, 0.        , ..., 0.04229549, 0.0919601 ,
        1.        ]])

In [34]:
df[df["title"] == "Spider-Man"].index[0]

159

In [36]:
def recommend(movie):
    index = df[df["title"] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key= lambda x: x[1])
    for i in distances[1:6]:
        print(df.iloc[i[0]].title)

In [37]:
recommend("Spider-Man")

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
Arachnophobia
The Amazing Spider-Man


In [38]:
import pickle

In [39]:
pickle.dump(df, open("artifacts/movies_list.pkl", "wb"))
pickle.dump(similarity, open("artifacts/similarity.pkl", "wb"))