In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
import numpy as np
import pandas as pd
import ast

In [None]:
# Folder on the mounted Drive that holds the TMDB 5000 CSV files. It is defined
# once so both reads below stay in sync if the dataset is ever moved.
DATA_DIR = '/content/drive/My Drive/Machine Learning Projects/Movie Recommendation System'

movies = pd.read_csv(f'{DATA_DIR}/tmdb_5000_movies.csv')
credits = pd.read_csv(f'{DATA_DIR}/tmdb_5000_credits.csv')

# Preprocessing the data

In [None]:
# Peek at the first row to see which columns the movies table provides.
movies.head(n=1)

In [None]:
# Peek at the first row of the credits table.
# To see an untruncated `cast` cell, run: credits.head(1)["cast"].values
credits.head(n=1)

In [None]:
# Join the two DataFrames on their shared 'title' column

In [None]:
# Dry run: shape the join on `title` would produce, before overwriting `movies`.
pd.merge(movies, credits, on='title').shape

In [None]:
# (rows, columns) of the movies table, to compare against the merged shape.
movies.shape

In [None]:
# (rows, columns) of the credits table, to compare against the merged shape.
credits.shape

In [None]:
# Attach the credits columns to each movie by matching on `title`.
# NOTE(review): if a title is not unique in both tables this join multiplies
# rows — confirm, or consider joining on the movie id instead.
movies = pd.merge(movies, credits, on='title')
movies.head(n=1)

## Remove irrelevant columns
> Columns such as budget are mostly irrelevant, while genres can be crucial for recommending movies. Because this is a content-based recommendation system, numerical data is
 not considered.  
> Keep these columns --> [movie_id, title, overview, genres, keywords, cast, crew]

In [None]:
# Columns that feed the content-based tags (plus id and title); the rest is dropped.
columns_to_keep = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = movies[columns_to_keep]
movies.head(n=3)

In [None]:
# Count missing values per column. Only a few overviews (3 at the time of
# writing) are missing, so those rows are simply removed afterwards.
movies.isna().sum()

In [None]:
# Drop rows with missing values and renumber the index 0..n-1.
# Without the reset, the index keeps gaps where rows were removed, so an index
# label no longer equals the row position. Later cells use the index of a
# title match as a row number into the similarity matrix, which needs the two
# to agree. Reassigning instead of inplace=True also keeps the step explicit.
movies = movies.dropna().reset_index(drop=True)

In [None]:
# Count rows that exactly repeat an earlier row.
duplicate_mask = movies.duplicated()
duplicate_mask.sum()

In [None]:
# Raw `genres` value of the first movie: a string that encodes a list of dictionaries.
movies['genres'].iloc[0]

> We want  
  [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]  
  in the following format  
  ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [None]:
def convert(obj):
  """Return the 'name' of every entry in a stringified list of dictionaries.

  `obj` is text such as '[{"id": 28, "name": "Action"}]'. It is parsed with
  ast.literal_eval and the names are returned in their original order.
  """
  return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# Replace each stringified genre list with a plain list of genre names.
# Run once: the column no longer holds strings afterwards.
movies['genres'] = movies['genres'].map(convert)

In [None]:
# The genres column should now hold lists of names.
movies.head(n=2)

In [None]:
# Same conversion for keywords: stringified list of dicts -> list of names.
movies['keywords'] = movies['keywords'].map(convert)
movies.head(n=2)

In [None]:
# Only the leading cast entries are wanted --> the first 3 dictionaries by default

def convert3(obj, limit=3):
  """Return the 'name' of the first `limit` entries of a stringified list of dicts.

  Args:
    obj: text encoding a list of dictionaries that each carry a 'name' key;
      parsed with ast.literal_eval.
    limit: maximum number of names to return (default 3, the original
      behaviour). Must be non-negative.

  Returns:
    A list with at most `limit` names, in their original order.

  Raises:
    ValueError: if `limit` is negative.
  """
  if limit < 0:
    raise ValueError("limit must be non-negative")
  # Slicing first means entries past the limit are never touched.
  return [entry['name'] for entry in ast.literal_eval(obj)[:limit]]

In [None]:
# Reduce each cast entry to the names of its first three members.
movies['cast'] = movies['cast'].map(convert3)
movies.head(n=3)

In [None]:
# For the crew, only the director is kept

def fetch_director(obj):
  """Return a one-element list with the first director's name, or [] if none.

  `obj` is a stringified list of crew dictionaries (parsed with
  ast.literal_eval); the first entry whose 'job' equals 'Director' wins.
  """
  for member in ast.literal_eval(obj):
    if member['job'] == 'Director':
      return [member['name']]
  return []

In [None]:
# Reduce each crew entry to a list holding just the director's name.
movies['crew'] = movies['crew'].map(fetch_director)

In [None]:
# Check the cast and crew columns after conversion.
movies.head(n=2)

In [None]:
# Turn each overview into a list of words so it can be concatenated with the
# other list-valued columns.

movies['overview'] = movies['overview'].map(str.split)
movies.head(n=5)

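> Collapse multi-word names into a single token, e.g. Johnny Depp --> JohnnyDepp. Tags are later split on whitespace, so two different people who share a first name would otherwise both contribute the same token (johnny) and make unrelated movies look similar. Removing the space keeps each name unique.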

In [None]:
# Remove the spaces inside every genre, keyword and person name so each one
# becomes a single token.
for column in ('genres', 'keywords', 'cast', 'crew'):
  movies[column] = movies[column].apply(
      lambda names: [name.replace(' ', '') for name in names])

In [None]:
# Names should now appear without internal spaces.
movies.head(n=5)

> Now make a tags column combining everything

In [None]:
# Concatenate all list-valued feature columns into one list of tokens per movie.
# `keywords` was cleaned above like the other columns but was missing from this
# sum, so it never reached the tags; it is included here.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

In [None]:
# Keep only what the recommender needs. The explicit copy makes `new_df` an
# independent frame, so the column assignments in later cells modify it
# directly and do not raise SettingWithCopyWarning.
new_df = movies[['movie_id', 'title', 'tags']].copy()

In [None]:
# Join each token list into one space-separated string.
# Run once: joining an already-joined string would separate its characters.
new_df['tags'] = new_df['tags'].map(" ".join)

In [None]:
# First row of the slimmed-down frame.
new_df.head(n=1)

In [None]:
# Full tag string of the row labelled 0.
new_df.loc[0, 'tags']

In [None]:
# convert to lowercase

new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())
new_df.head(2)

# Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: at most 5000 features, English stop words removed.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)

In [None]:
# Fit the vocabulary on the tags, then densify the count matrix (one row per movie).
tag_counts = cv.fit_transform(new_df['tags'])
vectors = tag_counts.toarray()
vectors.shape

In [None]:
# Count vector of the first movie (one entry per vocabulary term).
vectors[0]

In [None]:
# Show the whole learned vocabulary to spot near-duplicate terms.
feature_names = cv.get_feature_names_out()
print(list(feature_names))

> The vocabulary contains near-duplicates such as action and actions, which should count as the same word.  
> Stemming fixes this: [loved, loving, love] --> [love, love, love]

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance, used by stem() below.
ps = PorterStemmer()

In [None]:
def stem(text):
  """Stem every whitespace-separated word of `text` with the shared `ps`
  stemmer and return the stems joined by single spaces."""
  return " ".join(ps.stem(word) for word in text.split())

In [None]:
new_df['tags'] = new_df['tags'].apply(stem)

# Re-fit the vectorizer on the stemmed tags. `vectors` was first built before
# stemming, so without this step the stemming would have no effect on the
# vocabulary or on the similarity matrix computed from `vectors`.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# Inspect the vocabulary currently held by the fitted vectorizer.
cv.get_feature_names_out()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the rows of `vectors`:
# similarity[i, j] compares movie i with movie j.
similarity = cosine_similarity(vectors)  # 4806 x 4806

In [None]:
similarity  # cosine-similarity matrix (pairwise between movies, not a correlation matrix)


# Recommender System

In [None]:
def recommend(movie):
  """Print the titles of the five movies most similar to `movie`.

  The title is looked up in the global `new_df`; that movie's row of the
  global `similarity` matrix is then ranked and the best-scoring other titles
  are printed, most similar first.

  Args:
    movie: exact title to look up in new_df['title'].

  Raises:
    IndexError: if no row of `new_df` has this title.
  """
  # `similarity` is addressed by row position, so locate the movie's position
  # in new_df rather than its index label (the two differ when the index has
  # gaps, e.g. after rows were dropped).
  positions = (new_df['title'] == movie).to_numpy().nonzero()[0]
  if len(positions) == 0:
    raise IndexError(f"movie {movie!r} not found in new_df['title']")
  movie_pos = int(positions[0])

  scores = similarity[movie_pos]
  # Rank every other movie by score, highest first. The queried movie is
  # excluded explicitly rather than assumed to sort first: another movie with
  # an identical score could otherwise take its place and the query itself
  # would be printed as a recommendation.
  ranked = sorted(
      ((pos, score) for pos, score in enumerate(scores) if pos != movie_pos),
      key=lambda pair: pair[1],
      reverse=True,
  )
  for pos, _ in ranked[:5]:
    print(new_df.iloc[pos].title)

In [None]:
# Smoke test: recommendations for a well-known title.
recommend(movie="Batman Begins")

In [None]:
# Title stored at row position 1192.
new_df.iloc[1192]['title']

# Frontend

In [None]:
import pickle

In [None]:
file_path = '/content/drive/My Drive/Machine Learning Projects/Movie Recommendation System/movies_dict.pkl'

# Save the movie table as a plain dict. The `with` block guarantees the file
# is flushed and closed even if pickling fails; the bare open() it replaces
# left the handle open.
with open(file_path, 'wb') as pkl_file:
  pickle.dump(new_df.to_dict(), pkl_file)

In [None]:
file_path = '/content/drive/My Drive/Machine Learning Projects/Movie Recommendation System/similarity.pkl'

# Save the similarity matrix. The `with` block guarantees the file is flushed
# and closed even if pickling fails; the bare open() it replaces left the
# handle open.
with open(file_path, 'wb') as pkl_file:
  pickle.dump(similarity, pkl_file)