In [1]:
# Importing necessary libraries
import pandas as pd

In [2]:
# Reading the dataset
titles = pd.read_csv('Data/titles.csv')

# Creating a column "index" in the dataset that corresponds to the number of rows in the dataset
titles['index'] = titles.index

In [3]:
# Showing the first 5 rows in the dataset
titles.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,index
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,48,['documentation'],['US'],1.0,,,,0.6,,0
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,27.612,8.2,1
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,18.216,7.8,2
3,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,17.505,7.8,3
4,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,95.337,7.7,4


In [4]:
# The 'description' and 'genres' column is selected to be used for the recommendation of movies
features = ['title', 'description', 'genres', 'index']

In [5]:
# Pre-processing the 'genres' column to make it useful
titles['genres'] = titles['genres'].apply(lambda x:x.strip("[]"))
titles['genres'] = titles['genres'].apply(lambda x:x.replace("'", ''))

In [6]:
new_title = titles[features]
new_title.head(3)

Unnamed: 0,title,description,genres,index
0,Five Came Back: The Reference Films,This collection includes 12 World War II-era p...,documentation,0
1,Taxi Driver,A mentally unstable Vietnam War veteran works ...,"crime, drama",1
2,Monty Python and the Holy Grail,"King Arthur, accompanied by his squire, recrui...","comedy, fantasy",2


In [7]:
# Filling the missing values in the features required ('description' and 'genres')
new_title['description'].fillna('', inplace=True);
new_title['genres'].fillna('', inplace=True);

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [8]:
# Function to combine the texts in the two features into a single text
def combine_features(feature):
    return feature['description'] + " " + feature['genres']

In [9]:
# Creating a column to store the combined text of the 'description and genres' column
new_title['combined features'] = new_title.apply(combine_features, axis = 1);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_title['combined features'] = new_title.apply(combine_features, axis = 1);


In [10]:
# Importing CountVectorizer to put a text into a matrix
from sklearn.feature_extraction.text import CountVectorizer

# Importing 'cosine_similarity' method to create a cosine similarity between several texts
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
cv = CountVectorizer(stop_words='english')

# Putting the combined text into a matrix
title_matrix = cv.fit_transform(new_title['combined features'])

# Creating a cosine similarity from the matrix text
similarity = cosine_similarity(title_matrix)

In [12]:
# Recommender System

# A movie from the dataset is selected

def recommend(movie):
    try:

        movie_index = new_title[new_title['title'] == movie].index[0]

        # The cosine similarity marix at the index of the movie is selected
        similar_movies = list(enumerate(similarity[movie_index]))

        # Sort the cosine similarities in descending order
        sorted_similar_movies = sorted(similar_movies, key=lambda x:x[1], reverse=True)[1:11]

        # Printing the similar movies to the movie selected
        print(f"The top 10 movies similar to '{movie}' are: \n")

        i = 1

        for element in sorted_similar_movies:
            # The title of the  movie is gotten from the index selected in the matrix
            print(f'{i}. {new_title.iloc[element[0]].title}')
            i += 1
    except:
        print('Movie Not Found')


In [13]:
recommend('Taxi Driver')

The top 10 movies similar to 'Taxi Driver' are: 

1. Barry
2. Opening Night
3. TIGER & BUNNY
4. White Girl
5. Singham
6. Bodyguard
7. Queen Sono
8. The Beast
9. Warrior
10. The Woman in the Window


In [14]:
import pickle

In [15]:
pickle.dump(new_title.to_dict(), open('Streamlit/recommender.pkl', 'wb'))

In [16]:
pickle.dump(similarity, open('Streamlit/similarity.pkl', 'wb'))