In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
# Load the movies and credits CSV files from the Google Drive folder.
# NOTE(review): these absolute paths only resolve once drive.mount('/content/drive') has run (Colab).
movies_db = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/tmdb_5000_movies.csv')
credits_db = pd.read_csv('/content/drive/MyDrive/Movie Recommendation System/tmdb_5000_credits.csv')
movies_db.head(2)  # preview the first two rows of the movies table

In [None]:
credits_db.head(2)

In [None]:
# credits_db.head(1)['cast'].values

In [None]:
# Join the two tables on the movie identifier instead of the title: titles are not
# unique, so a title-based join pairs every movie with the credits of every other
# movie sharing its name and silently adds wrong, duplicated rows.
# Assumes the standard TMDB 5000 schema (movies_db has `id`; credits_db has `movie_id`
# and its own `title` column) -- TODO confirm against the loaded CSVs.
# The credits copy of `title` is dropped so the merged frame keeps a single `title` column.
movies = movies_db.merge(
    credits_db.drop(columns=['title']),
    left_on='id',
    right_on='movie_id',
)
movies.head(4)

In [None]:
# movies.shape

In [None]:
# movies_db.shape

In [None]:
# credits_db.shape

In [None]:
# movies_db['original_language'].value_counts()

In [None]:
# movies_db.info()

In [None]:
# Keep only the columns that are used to build the recommendation tags.
# .copy() makes `movies` an independent frame, so the dropna() and the column
# assignments that follow do not operate on a slice of the merged table
# (which triggers pandas' SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

In [None]:
movies.head(3)

## **Preprocessing**

In [None]:
movies.isnull().sum()   # checking for the total missing data, and removing if very small amount of it are present

In [None]:
movies.isnull().sum().sum()  # total of missing values in the dataframe

In [None]:
# Remove the rows that contain missing values.
# Reassigning instead of inplace=True keeps the cell safe to re-run, and reset_index
# keeps row labels equal to row positions, which the label-based lookups further down
# (e.g. movies['crew'][0]) and the position-based similarity matrix both rely on.
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies.duplicated().sum()   # Checking for the total duplicated data

In [None]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

The genres are stored as a string that looks like a list of dictionaries, so we first have to convert that string into a real Python list; only then can we extract the important data from it.

## The **ast module** has a function, ***literal_eval***, which does exactly this job

In [None]:
import ast

#convert function will extract the tags from the genres or other features and return a list of tags
def convert(obj):
  """Parse a stringified list of dicts and return the 'name' value of every entry."""
  return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# Applying convert function on genres to convert string to list
movies['genres'] = movies['genres'].apply(convert)
movies['genres']

In [None]:
# Applying convert function on keywords to convert string to list
movies['keywords'] = movies['keywords'].apply(convert)
movies['keywords']

In [None]:
import ast

# The cast feature lists every cast member, including minor roles, so we keep only the first 3 names in order to recommend movies based on the main cast rather than the supporting cast
def convert_cast(obj, limit=3):
  """Return the names of the first `limit` entries of a stringified cast list.

  Args:
    obj: String holding a Python-literal list of dicts, each with a 'name' key.
    limit: Maximum number of names to keep (default 3, the previous fixed value).

  Returns:
    A list with at most `limit` names, in the order they appear in `obj`.
  """
  names = []
  for position, member in enumerate(ast.literal_eval(obj)):
    # Stop before touching entries beyond the limit.
    if position >= limit:
      break
    names.append(member['name'])
  return names

In [None]:
# Applying convert_cast on 'cast' keyword to extract only first 3 cast names
movies['cast'] = movies['cast'].apply(convert_cast)
movies['cast']

In [None]:
# Seeing the updated movies dataframe
movies.head(4)
# Check for the updated values in features 'genres', 'keywords' and 'cast'

In [None]:
# Now as we see in crew feature, all the crew members are given, but recommendation is done on the basis of the director only. 
movies['crew'][0]

In [None]:
# So we extract the name of the director under the key: job
import ast

def convert_crew(obj):
  """Return a one-element list with the name of the first crew entry whose job is 'Director', or [] if there is none."""
  for member in ast.literal_eval(obj):
    if member['job'] == 'Director':
      return [member['name']]
  return []

In [None]:
# Applying convert_crew on crew feature to extract only the name of director
movies['crew'] = movies['crew'].apply(convert_crew)
movies['crew']

In [None]:
# Now we can see that the overview field is in form of string, so we have to convert to list to parse easily during recommendation
movies['overview'][0]

In [None]:
# Applying python's string split() function on each 'overview' feature's data
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['overview']

In [None]:
# Seeing the updated movies dataframe
movies.head(4)
# Check for the updated values in features 'crew', 'overview'

In [None]:
# As seen above, some values contain spaces: 'genres' (Science Fiction), 'keywords' (space war), 'cast' (Sam Worthington), 'crew' (James Cameron).
# These spaces are a problem: once the tags are split into words, 'Sam Worthington' and 'Sam Mendes' would share the word 'Sam' and the model would treat them as partly the same person, so we remove the spaces to turn each name into a single word.
# Remove the spaces inside every entry of the four list-valued columns
for column in ('genres', 'keywords', 'cast', 'crew'):
  movies[column] = movies[column].apply(
      lambda values: [value.replace(" ", "") for value in values]
  )

In [None]:
# Now changes are reflected in the genres, keywords, cast, crew (spaces between the words)
movies.head(3)

In [None]:
# Creating new column 'tags' and appending all the updated columns that will be further used together for recommendation
movies['tags'] = movies['genres'] + movies['keywords'] + movies['overview'] + movies['cast'] + movies['crew']

In [None]:
# Keep only the columns needed for recommending.
# .copy() gives new_df its own data, so the later assignments to new_df['tags'] do not
# write into a slice of `movies` (SettingWithCopyWarning). reset_index makes the row
# labels equal to the row positions, which is how the rows of the similarity matrix
# and the keys of the pickled new_df.to_dict() are expected to line up.
new_df = movies[['movie_id', 'title', 'tags']].copy().reset_index(drop=True)
new_df.head(3)

In [None]:
# Collapse each list of tags into one space-separated string
new_df['tags'] = new_df['tags'].apply(" ".join)
new_df['tags']

In [None]:
# Lowercase every tag string so that words are matched regardless of their case
new_df['tags'] = new_df['tags'].apply(str.lower)
new_df['tags']

In [None]:
# The same word can appear in several different forms, e.g. 'love' can also appear as 'loving', 'lovable', etc.
# Each form would be counted as a different word, so we have to reduce all these related forms to one common form (the stem).
# For this we use the PorterStemmer class from nltk.stem.porter
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
# This function converts all the same meaning different expression into one expression
def stem(text):
  """Stem every whitespace-separated word of `text` with `ps` and join the results with single spaces."""
  return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Applying stem function on 'tags' column 
new_df['tags'] = new_df['tags'].apply(stem)
new_df['tags']

## **Prediction**

In [None]:
# The tags of all movies together contain a very large number of distinct words (let's say almost 100000, including uninformative ones). For a good recommendation we keep only the words that occur most frequently across the tags.
# We use CountVectorizer, which counts how often each word occurs in each movie's tags; with max_features=5000 it keeps only the 5000 most frequent words and represents every movie as a vector of those counts.
# Stop words are words of the given language that do not contribute to the prediction, e.g. in English 'are', 'or', 'a', 'the', 'and', 'because', etc. Setting stop_words='english' removes them.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [None]:
# fit_transform learns the vocabulary from the 'tags' column and returns the word counts as a sparse matrix, so we also convert that into a dense array with toarray().
# Note: no scaling, mean or variance is learned here; CountVectorizer only counts word occurrences.
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors

In [None]:
# Returns an array of all the 5000 features(columns) and the vectors are the values below this features. Which features has what count numbers 
cv.get_feature_names_out()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# To deal with high dimensional data, we don't use Euclidean distance as it is not appropriate.
# Hence, we use cosine similarity: if the angle between two movie vectors is small, the movies are very similar (cos 0° = 1), and the larger the angle, the lower the similarity (cos 90° = 0).
# cosine_similarity returns a matrix holding the similarity of every movie to every other movie
similarities = cosine_similarity(vectors)
similarities.shape   # not displayed: a notebook cell only shows its last expression
similarities[0]   # the similarity values of the first movie with all the movies (including itself)

In [None]:
new_df.head(3)

In [None]:
# How to find the index of the particular movie
# new_df[new_df['title'] == 'Avatar'].movie_id[0]
# new_df[new_df['title'] == 'Avatar'].index[0]

In [None]:
# A row of the similarities array holds only the scores; the position of each score is the index of the movie it belongs to. If we sorted the row directly, those positions (and therefore which movie each score belongs to) would be lost.
# To keep every score paired with its movie index we use enumerate(), so that the index is preserved even after sorting.
# We then use sorted(). Because we want the most similar movies first, we sort in reverse (descending) order.
# key selects the value to sort by: by default the (index, score) tuples would be sorted by their first element (the index); here we sort by the second element (the score).
# sorted(list(enumerate(similarities[0])), reverse=True, key = lambda x:x[1])[1:6]

In [None]:
# Now we will recommend movie based on the tags
def recommend(movie, top_n=5):
  """Print the titles of the `top_n` movies most similar to `movie`.

  Args:
    movie: Exact title to look up in new_df['title'].
    top_n: Number of recommendations to print (default 5, the previous fixed value).

  If the title is not present in new_df, a message is printed and nothing is recommended.
  """
  # The rows of `similarities` are positional, so we need the movie's row *position*,
  # not its index label: the two differ as soon as rows have been dropped from the frame.
  positions = [pos for pos, title in enumerate(new_df['title']) if title == movie]
  if not positions:
    print(f"Movie '{movie}' was not found in the dataset.")
    return
  movie_pos = positions[0]

  scores = similarities[movie_pos]
  ranked = sorted(enumerate(scores), reverse=True, key=lambda pair: pair[1])

  # Exclude the queried movie explicitly instead of assuming it is ranked first
  # (another movie with identical tags ties with it).
  recommended = [pos for pos, _ in ranked if pos != movie_pos][:top_n]
  for pos in recommended:
    print(new_df['title'].iloc[pos])

In [None]:
recommend('Avatar')

In [None]:
import pickle
# Save the movie table as a plain dictionary.
# The context manager guarantees the file is flushed and closed; passing a bare
# open(...) to pickle.dump left the handle open.
with open('movie_dict.pkl', 'wb') as movie_file:
  pickle.dump(new_df.to_dict(), movie_file)

In [None]:
# Save the similarity matrix.
# The context manager guarantees the file is flushed and closed; passing a bare
# open(...) to pickle.dump left the handle open.
with open('similarity.pkl', 'wb') as similarity_file:
  pickle.dump(similarities, similarity_file)

In [None]:
new_df['title'].values