In [3]:
import numpy as np 
import pandas as pd 

In [4]:
movies = pd.read_csv("dataset/tmdb_5000_movies.csv")
credit = pd.read_csv("dataset/tmdb_5000_credits.csv")

In [5]:
movies = movies.merge(credit,on='title')

In [6]:
# the columns we have to take:
# genres
# id
# keywords
# overview 
# title 
# cast
# crew

movies = movies[['movie_id','title','genres','keywords','overview','cast','crew']]

In [7]:
movies.isnull().sum()

movie_id    0
title       0
genres      0
keywords    0
overview    3
cast        0
crew        0
dtype: int64

In [8]:
movies.dropna(inplace=True)

In [9]:
movies.duplicated().sum()

np.int64(0)

In [10]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [11]:
import ast

In [12]:
def convert(obj):
    l = []
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l

In [13]:
movies['genres'] = movies['genres'].apply(convert)

In [14]:
movies['keywords'] = movies['keywords'].apply(convert)

In [15]:
def convert3(obj):
    l = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            l.append(i['name'])
            counter += 1
        else:
            break
    return l

In [16]:
movies['cast'] = movies['cast'].apply(convert3)

In [17]:
def convert4(obj):
    l = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director' :
            l.append(i['name'])
        else:
            continue

    return l


In [18]:
movies['crew'] = movies['crew'].apply(convert4)

In [19]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [20]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [21]:
movies['tags'] = movies['overview'] + movies['genres'] +  movies['keywords'] +  movies['cast'] +  movies['crew']

In [22]:
df = movies[['movie_id','title','tags']]

In [23]:
df['tags'] = df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x:" ".join(x))


In [24]:
df['tags'] = df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x:x.lower())


In [25]:
df.head() 

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [26]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [27]:
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return  " ".join(y)

In [28]:
df['tags'] = df['tags'].apply(stem)    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem)


In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [30]:
vectors = cv.fit_transform(df['tags']).toarray()

In [31]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      shape=(5000,), dtype=object)

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
similarity = cosine_similarity(vectors)

In [34]:
def recommend(movie):
    movie_index = df[df['title'] == movie].index[0]
    distance = similarity[movie_index]
    movie_list = sorted(list(enumerate(distance)),reverse=True,key= lambda x:x[1])[1:6]

    for i in movie_list:
        print(df.iloc[i[0]].title)


In [35]:
import pickle

In [38]:
pickle.dump(df,open('data.pkl','wb'))

In [39]:
pickle.dump(similarity,open('similarity.pkl','wb'))