In [4]:
import numpy as np  
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [5]:
movies = pd.read_csv('tmdb_5000_movies.csv') 
credits = pd.read_csv('tmdb_5000_credits.csv')  

In [6]:
movies = movies.merge(credits, on='title')

In [7]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]


In [8]:
movies.dropna(inplace=True)


In [9]:
def convert(text):
    return [i['name'] for i in ast.literal_eval(text)]

In [10]:
def fetch_director(text):
    return [i['name'] for i in ast.literal_eval(text) if i['job'] == 'Director']

In [11]:
def collapse(L):
    return [i.replace(" ", "") for i in L]

In [12]:
movies['genres'] = movies['genres'].apply(convert).apply(collapse)
movies['keywords'] = movies['keywords'].apply(convert).apply(collapse)
movies['cast'] = movies['cast'].apply(convert).apply(lambda x: x[:3]).apply(collapse)
movies['crew'] = movies['crew'].apply(fetch_director).apply(collapse)
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [13]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [14]:
new = movies[['movie_id', 'title']]
new['tags'] = movies['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['tags'] = movies['tags'].apply(lambda x: " ".join(x))


In [15]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new['tags']).toarray()

In [16]:
similarity = cosine_similarity(vector)


In [17]:
pickle.dump(new, open('movie_list.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [18]:
def recommend(movie, top_n=5):
    try:
        index = new[new['title'].str.lower() == movie.lower()].index[0]
        distances = sorted(
            list(enumerate(similarity[index])),
            reverse=True,
            key=lambda x: x[1]
        )
        recommendations = [new.iloc[i[0]].title for i in distances[1:top_n + 1]]
        return recommendations
    except IndexError:
        return f"Movie '{movie}' not found in the dataset!"

In [19]:
recommendations = recommend('Gandhi', top_n=5)
print(recommendations)


['Gandhi, My Father', 'The Wind That Shakes the Barley', 'A Passage to India', 'Guiana 1838', 'Ramanujan']


In [20]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_vector = tfidf.fit_transform(new['tags']).toarray()
tfidf_similarity = cosine_similarity(tfidf_vector)

In [21]:
pickle.dump(tfidf_similarity, open('tfidf_similarity.pkl', 'wb'))