<a href="https://colab.research.google.com/github/VidyasriAsarla/Mini_project/blob/main/MovieRecommenderSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# `files` is Colab-specific and is only used by the optional upload step below.
from google.colab import files
# Uncomment to upload the two CSV files into the Colab session before reading them:
# uploaded = files.upload()
# Read the movies table and the credits table from the working directory.
df1 = pd.read_csv("tmdb_5000_movies.csv")
df2 = pd.read_csv("tmdb_5000_credits.csv")


**Data Preprocessing Starts**

In [None]:
# Preview the first rows of the movies table.
df1.head()

In [None]:
# Preview the first rows of the credits table.
df2.head()


In [None]:
# Combine movies and credits into one frame, matching rows on the shared "title" column.
# NOTE(review): titles are not guaranteed to be unique; if two different movies share a
# title, this join pairs each of them with both credit rows. Consider joining on a
# unique movie identifier instead - confirm which id columns both files provide.
df = df1.merge(df2, on = "title")
df.head()

In [None]:
# (rows, columns) of the merged frame.
df.shape

In [None]:
# Number of movies per original language.
df["original_language"].value_counts()

In [None]:
# Column dtypes and non-null counts of the merged frame.
df.info()

In [None]:
# Keep only the columns that are used to build the recommendation tags.
features = ["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]

# .copy() makes df an independent frame, so the in-place edits and column
# assignments in the following cells do not trigger pandas' SettingWithCopyWarning.
df = df[features].copy()

In [None]:
# Remove rows with NaN values and renumber the index 0..n-1.
# The similarity matrix computed later is addressed by row position, so index labels
# must equal row positions; dropping rows without resetting leaves gaps in the labels.
# Reassigning (instead of inplace=True) also avoids mutating a slice of the merged frame.
df = df.dropna().reset_index(drop = True)

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

In [None]:
# Inspect the raw genres value of the first row before it is parsed.
df.iloc[0].genres

In [None]:
import ast
def convert(obj):
  """Extract the "name" of every entry in a stringified list of dicts.

  obj is a string containing a Python literal (a list of dicts that each have a
  "name" key). ast.literal_eval parses it without executing arbitrary code.
  Returns the names as a list, in their original order.
  """
  return [entry["name"] for entry in ast.literal_eval(obj)]

In [None]:
# Replace each raw genres string with the plain list of genre names.
# Not re-runnable: once the column holds lists, convert() can no longer parse it.
df["genres"] = df["genres"].apply(convert)

In [None]:
# Replace each raw keywords string with the plain list of keyword names.
# Not re-runnable: once the column holds lists, convert() can no longer parse it.
df["keywords"] = df["keywords"].apply(convert)

In [None]:
# Inspect the raw cast value stored under index label 0.
df["cast"][0]

In [None]:
# Only the leading actors of the cast are kept
def get_top_actors(obj):
  """Return the names of at most the first three entries of a stringified cast list.

  obj is a string containing a Python literal (a list of dicts that each have a
  "name" key); entries are taken in the order they appear.
  """
  # zip stops as soon as range(3) is exhausted, so later entries are never touched.
  return [member["name"] for _, member in zip(range(3), ast.literal_eval(obj))]

In [None]:
# Replace each raw cast string with the names of its first three actors.
# Not re-runnable: once the column holds lists, get_top_actors() can no longer parse it.
df["cast"] = df["cast"].apply(get_top_actors)
df["cast"]

In [None]:
# Inspect the raw crew value stored under index label 0.
df["crew"][0]

In [None]:
# Only the director is kept from the crew
def get_director(obj):
  """Return a one-element list with the first crew member whose job is "Director".

  obj is a string containing a Python literal (a list of dicts with "job" and
  "name" keys). An empty list is returned when no entry has the job "Director".
  """
  for member in ast.literal_eval(obj):
    if member["job"] == "Director":
      return [member["name"]]
  return []

In [None]:
# Build a new "directors" column holding a list with the director's name
# (an empty list when the crew has no entry with the job "Director").
df["directors"] = df["crew"].apply(get_director)
df["directors"]

In [None]:
# Inspect the overview text stored under index label 0.
df["overview"][0]

In [None]:
# Split each overview into a list of whitespace-separated words, so it can be
# concatenated with the other list-valued columns when the tags are built.
df["overview"] = df["overview"].apply(lambda x: x.split())
df["overview"]

In [None]:
# Preview the frame after the list conversions.
df.head()

In [None]:
# Remove the spaces inside every entry of the list-valued columns, so that a multi-word
# name becomes a single token (two different names sharing a first name then no longer share a token).
for column in ["genres", "keywords", "cast", "directors"]:
  df[column] = df[column].apply(lambda entries : [entry.replace(" ", "") for entry in entries])


In [None]:
# Build a "tags" column by concatenating the word lists of four columns:
# overview, keywords, cast and directors.
# NOTE(review): "genres" is parsed and cleaned above but is not part of the tags -
# confirm whether leaving it out is intentional.
df["tags"] = df["overview"] + df["keywords"] + df["cast"] + df["directors"]
df["tags"]

In [None]:
# Keep only the three columns the recommender needs.
# .copy() makes new_df an independent frame, so the later assignments to
# new_df["tags"] do not act on a slice of df (SettingWithCopyWarning).
new_df = df[["movie_id", "title", "tags"]].copy()
# Look at the first row's tags by position rather than by index label.
new_df["tags"].iloc[0]

In [None]:
# Collapse each list of tags into a single space-separated string
new_df["tags"] = new_df["tags"].apply(" ".join)
new_df.head()

In [None]:
# Lowercase every tags string
new_df["tags"] = new_df["tags"].str.lower()

**Using stemming to reduce words to their root form**

In [None]:
# Different forms of the same word would otherwise be counted as separate words
# when the tags are vectorized below; stemming removes that ambiguity by reducing
# every word to its root form.
from nltk.stem.porter import PorterStemmer

# Stemmer instance shared by the stem() helper.
ps = PorterStemmer()

In [None]:
def stem(text, stemmer=None):
  """Stem every whitespace-separated word of text and re-join the words with spaces.

  text: the string to process.
  stemmer: optional object with a stem(word) method; when omitted, the
    notebook-level PorterStemmer instance `ps` is used.
  Returns the stemmed words joined by single spaces ("" for empty input).
  """
  # Look up the global stemmer only when none was passed in.
  active_stemmer = ps if stemmer is None else stemmer
  # Stem each word individually. Stemming the whole text once per word would
  # repeat the entire text as many times as it has words.
  return " ".join(active_stemmer.stem(word) for word in text.split())

In [None]:
# Run the stem() helper over every tags string.
new_df["tags"] = new_df["tags"].apply(stem)

**Using CountVectorizer to convert strings into vectors**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer turns the tags into a matrix of word counts:
# one row per movie, one column per vocabulary word.

# max_features caps the vocabulary (and so the vector length) at 5000 words
# stop_words = "english" drops common English words such as "and", "to", "be", ...

vectorizer = CountVectorizer(max_features = 5000, stop_words = "english")

# fit_transform returns a sparse matrix; toarray() converts it to a dense array.
count_matrix = vectorizer.fit_transform(new_df["tags"]).toarray()

In [None]:
# Vocabulary learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
vectorizer.get_feature_names_out()

**Calculating Cosine Similarity**

In [None]:
# The count vectors are high-dimensional, where Euclidean distance does not give
# meaningful results, so the angle between vectors (cosine similarity) is used instead.

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the count vectors of all movies
cosine_sim = cosine_similarity(count_matrix)

# Expected to be square: one row and one column per movie
cosine_sim.shape

**Recommending Movies**

In [None]:
# Helper Functions
# cosine_sim is a plain array addressed by row position, so both helpers work with
# row positions in new_df rather than index labels (the two differ whenever the
# index is not a clean 0..n-1 range).

def get_title_from_index(index):
    """Return the title stored at row position `index` of new_df."""
    return new_df["title"].iloc[index]

def get_index_from_title(title):
    """Return the row position in new_df of the first movie whose title equals `title`.

    Raises:
        IndexError: if no row of new_df has that title.
    """
    positions = np.flatnonzero((new_df["title"] == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"No movie titled {title!r} was found")
    return int(positions[0])

def recommend(movie, top_n = 5):
  """Print the titles of the `top_n` movies most similar to `movie`.

  movie: exact title of the movie to start from.
  top_n: how many recommendations to print (default 5).
  Raises IndexError (from get_index_from_title) when the title is unknown.
  """
  # Row position of the query movie in new_df / cosine_sim
  movie_index = get_index_from_title(movie)
  # Similarity of the query movie to every movie (itself included)
  similarities = cosine_sim[movie_index]
  # Pair every row position with its similarity and sort, most similar first
  ranked = sorted(enumerate(similarities), reverse = True, key = lambda pair : pair[1])
  # Exclude the query movie by position instead of assuming it sorts first:
  # a movie with identical tags can tie with it and take the first slot.
  similar_indices = [index for index, _ in ranked if index != movie_index][:top_n]
  for index in similar_indices:
    print(get_title_from_index(index))

In [None]:
# Example: print the recommendations for "Avatar".
recommend("Avatar")

In [None]:
import pickle

In [None]:
# Save the movie table as a plain dict.
# The with-block closes the file even if pickling fails.
with open("movie_dict.pkl", "wb") as movie_dict_file:
  pickle.dump(new_df.to_dict(), movie_dict_file)

In [None]:
# Save the similarity matrix.
# The with-block closes the file even if pickling fails.
with open("cosine_similarity.pkl", "wb") as similarity_file:
  pickle.dump(cosine_sim, similarity_file)