#Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer  #CountVectorizer is used to convert a collection of text documents to a vector of token counts
from sklearn.metrics.pairwise import cosine_similarity

## Read CSV files

In [3]:
from google.colab import drive
drive.mount ('/content/drive')

Mounted at /content/drive


In [4]:
import os
work_dir = "/content/drive/My Drive/SKILLIT Courses/AI Level 2/Clustering/Movie_recommendation"
os.chdir(work_dir)

In [5]:
df = pd.read_csv("movie_dataset.csv")
print(df.columns)

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')


## Helper functions

In [6]:
def get_title_from_index(index):
  return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
  return df[df.title == title]["index"].values[0]

## Select Features

In [7]:
features = ["keywords", "cast", "genres", "director"]

for feature in features:
  df[feature] = df[feature].fillna("")

## Create column in df combining all selected features

In [8]:
def combine_features(row):
  try:
    text = row['keywords'] + ' ' + row['cast'] + ' ' + row['genres'] + ' ' + row['director']
    return text
  except:
    print("Error: ", row['index'])

In [9]:
df["combined_features"] = df.apply(combine_features, axis = 1) #df.apply helps to apply a function to all he rows

In [10]:
df['combined_features']

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 Dan...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799     Edward Burns Kerry Bish\u00e9 Marsha Dietlein...
4800    date love at first sight narration investigati...
4801     Daniel Henney Eliza Coupe Bill Paxton Alan Ru...
4802    obsession camcorder crush dream girl Drew Barr...
Name: combined_features, Length: 4803, dtype: object

## Create count matrix from combined features

In [11]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

## Compute the similarities between various movies

In [12]:
cosine_sim = cosine_similarity(count_matrix)

## Define movie finder function

In [13]:
def movie_finder():
  movie_user_likes = input("Which is your favorite movie? ")
  movie_index = get_index_from_title(movie_user_likes)
  similar_movies = list(enumerate(cosine_sim [movie_index]))
  sorted_similar_movies = sorted(similar_movies, key = lambda x:x[1], reverse = True) #lambda creates inline function - refer PPT
  i=0
  print("You also might like these movies: ")
  for movie in sorted_similar_movies:
    print (get_title_from_index(movie[0]))
    i=i+1
    if i>5:
     break
     
movie_finder()

Which is your favorite movie? Batman
You also might like these movies: 
Batman
Batman Returns
Batman & Robin
The Dark Knight Rises
The Dark Knight
Batman Begins
