In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the two raw tables from the working directory. The credits file is
# parsed with the python engine, and malformed lines are reported as warnings
# (on_bad_lines='warn') rather than raising.
movies = pd.read_csv(r'tmdb_5000_movies.csv')
credits = pd.read_csv(
    r'tmdb_5000_credits.csv',
    engine='python',
    on_bad_lines='warn',
)

In [None]:
# Preview the first five rows of the credits table.
credits.head(n=5)

In [None]:
# Join the credits columns onto the movies table using the shared `title` column.
# NOTE(review): a join on title yields extra rows whenever a title occurs more
# than once in either table; consider joining on a unique id instead — confirm
# which key columns the two files provide.
movies = pd.merge(movies, credits, on='title')

In [None]:
# (rows, columns) of the merged table.
movies.shape

In [None]:
# (rows, columns) of the raw credits table, for comparison with the merged table.
credits.shape

In [None]:
# Keep the identifier, the title, and the five descriptive columns that are
# later combined into the `tags` column.
movies = movies.loc[
    :,
    ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew'],
]

In [None]:
# Preview the first five rows after the column selection.
movies.head(n=5)

In [None]:
import ast

def convert(obj):
  """Return the 'name' value of every entry in a stringified list of dicts.

  `obj` is parsed with `ast.literal_eval`, so it must be the text of a Python
  literal whose items can be indexed with the key 'name'.
  """
  return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# Rebuild `movies` from the raw CSV files so that this cell starts from
# unparsed columns regardless of what earlier cells did to the frame.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv(
    'tmdb_5000_credits.csv',
    engine='python',
    on_bad_lines='warn',
)
# Merge on title, keep the columns of interest, and discard every row that has
# a missing value in any of them.
movies = (
    movies
    .merge(credits, on='title')
    .loc[:, ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
)

# Helper used below to turn the stringified genre/keyword lists into lists of names.
def convert(obj):
  """Parse `obj` with `ast.literal_eval` and collect each item's 'name' value."""
  parsed_items = ast.literal_eval(obj)
  return [item['name'] for item in parsed_items]

# Replace the stringified genre and keyword lists with plain lists of names.
for col in ('genres', 'keywords'):
  movies[col] = movies[col].apply(convert)

In [None]:
# Preview the first five rows with genres and keywords parsed into lists.
movies.head(n=5)

In [None]:
# Drop rows containing missing values, mutating `movies` in place. The cell
# that reloads the CSVs already ends with the same call, so when the cells are
# run in order this removes nothing further.
movies.dropna(inplace=True)

In [None]:
def convert3(obj):
  """Return the 'name' values of at most the first three parsed entries.

  `obj` is parsed with `ast.literal_eval`; entries after the third are never
  inspected.
  """
  # zip stops as soon as range(3) is exhausted, which caps the result at three.
  return [member['name'] for _, member in zip(range(3), ast.literal_eval(obj))]

In [None]:
# Reduce each cast entry to the names of its first three listed members.
movies['cast'] = movies['cast'].map(convert3)

In [None]:
def fetch_director(obj):
  """Return a one-element list with the first 'Director' name, or [] if none.

  `obj` is parsed with `ast.literal_eval`; each entry is expected to expose
  'job' and 'name' keys. Scanning stops at the first entry whose job is
  exactly 'Director'.
  """
  for member in ast.literal_eval(obj):
    if member['job'] == 'Director':
      return [member['name']]
  return []

In [None]:
# Replace each crew entry with a list holding the director's name (or an empty list).
movies['crew'] = movies['crew'].map(fetch_director)

In [None]:
# Turn each overview string into a list of whitespace-separated words so it can
# be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].map(lambda text: text.split())

In [None]:
# Remove the spaces inside every name so that a multi-word name becomes a
# single token.
for col in ('genres', 'keywords', 'cast', 'crew'):
  movies[col] = movies[col].apply(
      lambda names: [name.replace(" ", "") for name in names]
  )

In [None]:
# Concatenate the five list-valued columns row by row into one list of tags.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

In [None]:
# Keep only the columns the recommender needs.
# reset_index(drop=True) returns a new frame whose labels run 0..n-1, so
#   * index labels equal row positions (the earlier dropna calls can leave gaps
#     in the labels, while the similarity matrix built later is positional), and
#   * the later `new_df['tags'] = ...` assignments write to an independent frame
#     instead of a slice of `movies`.
new_df = movies[['movie_id', 'title', 'tags']].reset_index(drop=True)

In [None]:
# Preview the first five rows of the reduced table.
new_df.head(n=5)

In [None]:
# Collapse each list of tags into one space-separated string.
new_df['tags'] = new_df['tags'].apply(" ".join)

In [None]:
# Lowercase every tag string.
new_df['tags'] = new_df['tags'].apply(str.lower)

In [None]:
import nltk

from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused by the `stem` helper defined below.
ps = PorterStemmer()

In [None]:
def stem(text):
  """Stem every whitespace-separated token of `text` with the module-level
  `ps` stemmer and return the stems joined by single spaces."""
  return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Replace every word in the tag strings with its stem.
new_df['tags'] = new_df['tags'].map(stem)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer configured with the built-in English stop-word list
# and a vocabulary capped at 5000 features.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [None]:
# Fit the vectorizer on the tag strings and convert the result to a dense
# array with one row per row of new_df, in the same order.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# Inspect the vocabulary terms the fitted vectorizer kept.
cv.get_feature_names_out()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `vectors`; entry [i, j]
# compares the i-th and j-th rows, addressed by position.
similarity = cosine_similarity(vectors)

In [None]:
# Display the similarity matrix.
similarity

In [None]:
def recommend(movie, top_n=5, frame=None, sim=None):
  """Print the titles of the movies most similar to `movie`.

  Parameters
  ----------
  movie : str
      Exact title to look up in the `title` column.
  top_n : int, default 5
      Number of recommendations to print.
  frame : pandas.DataFrame, optional
      Table with a `title` column whose row order matches the rows of the
      similarity matrix. Defaults to the notebook-level `new_df`.
  sim : array-like, optional
      Square similarity matrix addressed by row position. Defaults to the
      notebook-level `similarity`.

  Raises
  ------
  IndexError
      If no row of `frame` has the requested title.
  """
  frame = new_df if frame is None else frame
  sim = similarity if sim is None else sim

  # Look up the row *position*, not the index label: the similarity matrix is
  # positional, while the frame's labels may have gaps after rows were dropped.
  matches = np.flatnonzero((frame['title'] == movie).to_numpy())
  if matches.size == 0:
    raise IndexError(f"movie {movie!r} not found in the title column")
  movie_pos = int(matches[0])

  # Rank every row by its similarity to the requested movie's own row.
  distances = sim[movie_pos]
  ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

  # Exclude the query movie explicitly rather than assuming it sorts first
  # (another row can tie with it at the maximum similarity).
  neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]
  for pos in neighbours:
    print(frame['title'].iloc[pos])


In [None]:
# Example call: print the recommendations for the title "Avatar".
recommend("Avatar")

In [None]:
import pickle

# Persist the movie table (as a plain dict) and the similarity matrix.
# The context managers make sure each file is flushed and closed as soon as
# its dump finishes, instead of leaving the handles open.
with open('movies_df.pkl', 'wb') as movies_file:
  pickle.dump(new_df.to_dict(), movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
  pickle.dump(similarity, similarity_file)

print("new_df and similarity matrix saved successfully as 'movies_df.pkl' and 'similarity.pkl' respectively.")