# Content-Based Movie Recommendation System

### Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Data Collection and Pre-Processing

In [2]:
# Load the movie dataset from 'movies.csv' (a path relative to the current
# working directory) into the pandas DataFrame `movies_data`.
movies_data = pd.read_csv('movies.csv')

In [3]:
# Preview the first 2 rows to inspect the column names and some sample values.
movies_data.head(2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


In [4]:
# Show the dimensions of the DataFrame as a (rows, columns) tuple.
movies_data.shape

(4803, 24)

In [5]:
# Feature selection: the columns of movies_data whose text is used to
# describe each movie when building recommendations.
selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]

In [6]:
# Print the list of selected feature columns to confirm the selection.
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [7]:
# Replace missing values with empty strings in all selected feature columns
# at once, writing the filled columns back into movies_data.
movies_data[selected_features] = movies_data[selected_features].fillna('')

In [8]:
# Combine the selected feature columns into one space-separated string per movie.
# The columns are taken from `selected_features` rather than being listed again
# by hand, so the combined text always follows the feature selection. The
# concatenation is built left to right, one column at a time.
combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
    combined_features = combined_features + ' ' + movies_data[feature]

In [9]:
# Print the combined text per movie (pandas abbreviates long Series and long strings).
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [10]:
# Convert the combined text into numerical TF-IDF feature vectors.
# The vectorizer is created with its default settings; fit_transform is called
# on combined_features and its result is kept as feature_vectors.
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)

In [11]:
# Print the feature vectors; each printed entry is a (row, column) pair followed by its value.
print(feature_vectors)

  (0, 2432)	0.17272411194153
  (0, 7755)	0.1128035714854756
  (0, 13024)	0.1942362060108871
  (0, 10229)	0.16058685400095302
  (0, 8756)	0.22709015857011816
  (0, 14608)	0.15150672398763912
  (0, 16668)	0.19843263965100372
  (0, 14064)	0.20596090415084142
  (0, 13319)	0.2177470539412484
  (0, 17290)	0.20197912553916567
  (0, 17007)	0.23643326319898797
  (0, 13349)	0.15021264094167086
  (0, 11503)	0.27211310056983656
  (0, 11192)	0.09049319826481456
  (0, 16998)	0.1282126322850579
  (0, 15261)	0.07095833561276566
  (0, 4945)	0.24025852494110758
  (0, 14271)	0.21392179219912877
  (0, 3225)	0.24960162956997736
  (0, 16587)	0.12549432354918996
  (0, 14378)	0.33962752210959823
  (0, 5836)	0.1646750903586285
  (0, 3065)	0.22208377802661425
  (0, 3678)	0.21392179219912877
  (0, 5437)	0.1036413987316636
  :	:
  (4801, 17266)	0.2886098184932947
  (4801, 4835)	0.24713765026963996
  (4801, 403)	0.17727585190343226
  (4801, 6935)	0.2886098184932947
  (4801, 11663)	0.21557500762727902
  (4801, 1672

In [12]:
# Compute the cosine similarity scores from the feature vectors.
# Only one matrix is passed, so its rows are compared against each other.
similarity = cosine_similarity(feature_vectors)

In [13]:
# Print the similarity matrix (NumPy abbreviates large arrays with "...").
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [14]:
# Print the dimensions of the similarity matrix.
print(similarity.shape)

(4803, 4803)


In [15]:
# Prompt the user three times and keep the entered movie names in order
movie_names = [input(f'Enter your {i+1} favorite movie name: ') for i in range(3)]

Enter your 1 favorite movie name: Interstellar
Enter your 2 favorite movie name: Moon Fall
Enter your 3 favorite movie name: Avatar


In [16]:
# Collect every value of the 'title' column of movies_data into a plain Python list.
list_of_all_titles = movies_data['title'].tolist()

In [17]:
# Map each name entered by the user to its closest title in the dataset using difflib.
# get_close_matches returns an empty list when no title is similar enough, so the
# names that fail to match are collected and reported in a clear error instead of
# failing with an IndexError on `[0]`.
find_close_matches = []
unmatched_names = []
for name in movie_names:
    # n=1: only the single best match is used
    candidates = difflib.get_close_matches(name, list_of_all_titles, n=1)
    if candidates:
        find_close_matches.append(candidates[0])
    else:
        unmatched_names.append(name)

if unmatched_names:
    raise ValueError(f'No close title match found for: {unmatched_names}')

In [18]:
# Display the dataset titles that were matched to the user's input.
find_close_matches

['Interstellar', 'Moneyball', 'Avatar']

In [19]:
# Look up the 'index' column value for each matched title; when several rows
# share a title, the first matching row is used.
indices_of_movies = []
for close_match in find_close_matches:
    title_rows = movies_data.loc[movies_data['title'] == close_match, 'index']
    indices_of_movies.append(title_rows.values[0])

In [20]:
# Display the index values found for the matched titles.
indices_of_movies

[95, 928, 0]

In [21]:
# For each chosen movie, take its row of the similarity matrix and pair every
# score with its position, giving one list of (position, score) tuples per movie.
similarity_scores = []
for movie_index in indices_of_movies:
    scores_for_movie = similarity[movie_index]
    similarity_scores.append(list(enumerate(scores_for_movie)))

In [22]:
# Average the (position, score) pairs across the chosen movies. Every chosen
# movie lists the same positions, so averaging leaves the position unchanged
# (now stored as a float) while the score becomes the mean similarity.
average_similarity_scores = np.asarray(similarity_scores).mean(axis=0)

# Rank the pairs from the highest average score to the lowest.
sorted_similar_movies = list(average_similarity_scores)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

In [23]:
# Print the titles of the highest-ranked movies, numbered from 1.
# The ranked list is sliced to the 19 entries that are displayed (the same
# count the previous `i < 20` check produced), so the loop no longer walks
# every ranked movie and filters the whole DataFrame on each pass.
print('Movies suggested for you: \n')
for i, movie in enumerate(sorted_similar_movies[:19], start=1):
    # movie[0] is the movie's row position; it can be a float after averaging,
    # so cast it to int before the positional lookup
    index = int(movie[0])
    title_from_index = movies_data['title'].iloc[index]
    print(i, '.', title_from_index)


Movies suggested for you: 

1 . Interstellar
2 . Avatar
3 . Moneyball
4 . Guardians of the Galaxy
5 . Megamind
6 . Aliens
7 . Star Trek Into Darkness
8 . Contact
9 . The Terminator
10 . Terminator Salvation
11 . Alien
12 . The Savages
13 . The Martian
14 . Terminator Genisys
15 . The Helix... Loaded
16 . Legends of the Fall
17 . Star Trek Beyond
18 . The Matrix
19 . The Ice Storm
