In [1]:
# required libraries
import pandas as pd
import pickle
import spacy
from gensim.corpora import Dictionary
from scipy.spatial import distance
from tqdm import tqdm

In [2]:
# Load the movie-plot table and the pre-trained topic model from disk.
movie_database = pd.read_csv("Movie-Plots.csv")

# Open the pickle inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load a Topic_model.sav that comes from a trusted source.
with open("Topic_model.sav", 'rb') as model_file:
    topic_model = pickle.load(model_file)

In [3]:
# builds an nlp pipeline
# Load the "en" spaCy model with the "parser" and "ner" components disabled;
# the custom components defined below only read token lemmas, POS tags and the
# stop-word / punctuation flags.
# NOTE(review): the "en" shortcut name and the '-PRON-' lemma checked below look
# like spaCy 2.x conventions -- confirm the pinned spaCy version.
nlp = spacy.load("en", disable=["parser", 'ner'])

def lemmatize_nlp(doc, pos = ['NOUN', 'ADJ', 'ADV', 'VERB']):
    """Pipeline component: reduce a document to the lemmas of selected word classes.

    Tokens whose lemma is the '-PRON-' placeholder, or whose part-of-speech
    tag is not listed in ``pos``, are dropped. The remaining lemmas are joined
    with single spaces and re-tokenized through ``nlp.make_doc``.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_`` and ``pos_``.
    pos : list of str, optional
        Part-of-speech tags to keep.

    Returns
    -------
    The object produced by ``nlp.make_doc`` for the joined lemma string.
    """
    kept_lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            continue
        if token.pos_ not in pos:
            continue
        kept_lemmas.append(token.lemma_)

    lemma_text = u' '.join(kept_lemmas)
    return nlp.make_doc(lemma_text)

def remove_stopwords(doc):
    """Pipeline component: keep the text of tokens that are neither stop words nor punctuation.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``text``, ``is_stop`` and ``is_punct``.

    Returns
    -------
    list of str
        The surviving token texts, in document order. Note that this is a
        plain list, not a document object.
    """
    kept_texts = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        kept_texts.append(token.text)
    return kept_texts

# Register the two custom components on the pipeline. The lemmatizer is placed
# right after the 'tagger' because it filters on token.pos_; the stop-word
# filter goes last because it returns a plain list of strings rather than a
# document, so nothing can run after it.
nlp.add_pipe(lemmatize_nlp, name='lemmatizer', after='tagger')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

# Run every lower-cased plot through the nlp pipeline (one token list per
# movie) and build the gensim dictionary from the result.
print("Loading texts and dictionary...")
tqdm.pandas()  # enables Series.progress_map, used on the next line
processed_texts = movie_database['Plot'].str.lower().progress_map(nlp)

dictionary = Dictionary(processed_texts)
# Prune the vocabulary: per gensim's filter_extremes, keep tokens that occur in
# at least 300 documents and in no more than 50% of all documents.
dictionary.filter_extremes(no_below = 300, no_above = 0.5)
print("DICTIONARY CREATED!")

# function to assign topic distribution vectors to texts
def assign_topic_dist(text, model = topic_model, dictionary = dictionary):
    """Return the dense topic-probability vector of ``text``.

    The text is run through the ``nlp`` pipeline, converted to a bag of words
    with ``dictionary`` and scored by ``model``. Topics the model does not
    report for this text get probability 0.0, so the result always has exactly
    one entry per topic, ordered by topic id.

    Parameters
    ----------
    text : str
        Raw text to score.
    model : topic model, optional
        Must support ``model[bow]`` yielding ``(topic_id, probability)`` pairs
        and expose either a ``num_topics`` attribute or ``print_topics``.
    dictionary : gensim Dictionary, optional
        Vocabulary used to turn the processed tokens into a bag of words.

    Returns
    -------
    list
        One probability per topic, each rounded to one decimal place.
    """
    # NOTE(review): the dictionary above is built from lower-cased plots, but the
    # text is not lower-cased here -- confirm which matches the model's training.
    bag_of_words = dictionary.doc2bow(nlp(text))

    # Topic count: prefer the model's own attribute; fall back to the id of the
    # last listed topic + 1 for models that do not expose ``num_topics``.
    n_topics = getattr(model, 'num_topics', None)
    if n_topics is None:
        n_topics = model.print_topics(-1)[-1][0] + 1

    # Start from an all-zero vector and fill in the reported topics by id. This
    # works for any number of topics (the previous padding loop assumed 12) and
    # does not depend on the order in which missing topic ids are visited.
    prob_list = [0.0] * n_topics
    for topic_id, score in model[bag_of_words]:
        prob_list[topic_id] = round(score, 1)

    return prob_list

# Pre-compute one topic-distribution vector per plot; find_similar uses this
# Series as its default 'vectors' argument when comparing movies.
print("Generating topic distributions...")
td_vectors = movie_database['Plot'].progress_map(assign_topic_dist)
print("Topic distributions ready!")

  from pandas import Panel
  0%|          | 0/24913 [00:00<?, ?it/s]

Loading texts and dictionary...


100%|██████████| 24913/24913 [05:09<00:00, 80.50it/s] 
  0%|          | 9/24913 [00:00<04:41, 88.57it/s]

DICTIONARY CREATED!
Generating topic distributions...


100%|██████████| 24913/24913 [05:47<00:00, 71.71it/s]

Topic distributions ready!





In [4]:
# function that finds similar movies for the recommendation system
# It takes 4 parameters: 
                        # 'movie' is the name of the movie
                        # 'year' is the year the movie was released
                        # 'num' number of similar movies to return
                        # 'vectors' is the topic distribution vector for the movie database
# Callers normally do not pass these by hand: recommend_movies() prompts the user for the first three, and 'vectors' defaults to the td_vectors computed above.
def find_similar(movie, year, num, vectors = td_vectors):
    """Rank the database movies by topic similarity to one movie.

    Parameters
    ----------
    movie : str
        Exact title as stored in ``movie_database.Title``.
    year : int
        Release year, used to tell apart movies that share a title.
    num : int
        Number of similar movies to return.
    vectors : iterable, optional
        Topic-distribution vector of every database movie, in database order.

    Returns
    -------
    list of [int, float]
        Up to ``num`` ``[position, distance]`` pairs sorted by ascending
        Jensen-Shannon distance; the queried movie itself is excluded.

    Raises
    ------
    IndexError
        If no row matches both ``movie`` and ``year``.
    """
    # Filter the database once and reuse the result for both the index and the plot.
    matches = movie_database[(movie_database.Title == movie) & (movie_database['Release Year'] == year)]
    if matches.empty:
        # Same exception type as the bare ``.index[0]`` lookup, with a usable message.
        raise IndexError("No movie titled " + repr(movie) + " released in " + str(year) + " in the database")

    movie_index = matches.index[0]       # first match if title/year is duplicated
    movie_plot = matches.Plot.values[0]

    assigned_movie_topic_dist = assign_topic_dist(movie_plot)  # topic vector of the query movie

    # enumerate() yields positions while movie_index is an index label; this
    # assumes movie_database still has the default 0..n-1 index from read_csv.
    similarity = [
        [i, distance.jensenshannon(assigned_movie_topic_dist, v)]
        for i, v in enumerate(vectors)
        if i != movie_index  # never recommend the query movie itself
    ]

    # Sort by distance; NaN distances (jensenshannon can produce them, e.g. for
    # an all-zero vector) compare unpredictably, so push them to the end.
    similarity.sort(key = lambda pair: (pair[1] != pair[1], pair[1]))
    return similarity[:num]


# Recommendation system
def recommend_movies():
    """Interactive movie recommender driven by console prompts.

    Asks for the number of recommendations and a movie title, lets the user
    confirm which database entry is meant, displays the most similar movies
    and optionally reduces them to the best match per decade.

    Returns
    -------
    pandas.DataFrame, str or None
        * the per-decade table if the user asks for decade recommendations,
        * the string "ENJOY!" if the user declines them,
        * the full table of similar movies if that answer is not recognised,
        * ``None`` if the title is unknown and the user exits, or if the
          selection is invalid twice.

    Raises
    ------
    ValueError
        If a numeric prompt receives something that is not an integer.
    """
    yes_answers = ("Yes", "yes", "y", "Y")
    no_answers = ("No", "no", "n", "N")

    # NOTE(review): the "upto 100" limit in the prompt is not enforced.
    num = int(input("Enter the number of movies to be recommended (upto 100): ")) # user inputs the number of recommendations

    movie_input = input("Enter the full name of the movie (initials capitalized): ") # user inputs the movie

    title_matches = movie_database.Title == movie_input

    if not title_matches.any(): # unknown title
        print("Sorry! Check the spelling and/or completeness of the movie.\n")
        print("Note: Some movies aren't available in the database\n")

        exit_option = input("Do you wish to exit? ")

        if exit_option in yes_answers:
            return
        if exit_option in no_answers:
            # Restart the dialogue. recommend_movies takes no parameters, so the
            # former call with ``num = 10`` raised a TypeError.
            return recommend_movies()

        print("Thank you for using the movie recommender! Try another movie")
        return

    # offer all movies with the entered name and ask the user to pick one
    movie_options = movie_database[title_matches].iloc[:,:8].reset_index(drop = True)
    display(movie_options)

    # The user gets two attempts to enter a valid row number.
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        year_input = int(input("Confirm your movie (enter number on the left): "))
        if 0 <= year_input <= len(movie_options) - 1:
            break
        if attempt < max_attempts:
            print("Wrong input, try again!")
    else:
        print("Sorry! Please start again")
        return

    year = movie_options.iloc[year_input,:]['Release Year'] # the year identifies the exact match

    x = find_similar(movie_input, year, num) # [position, distance] pairs, most similar first

    print("\n=== Top " + str(num) + " movies similar to " + "\"" + movie_input + "\"" + " ===\n")

    # collect the display columns of the similar movies
    positions = [pair[0] for pair in x]
    similar_movies_dict = {
        "Movie": [movie_database['Title'][p] for p in positions],
        "Genre": [movie_database['Genre'][p] for p in positions],
        "Year": [movie_database['Release Year'][p] for p in positions],
        "Decade": [movie_database['Decade'][p] for p in positions],
    }

    # convert dictionary to dataframe for display
    similar_movies_df = pd.DataFrame(similar_movies_dict)
    display(similar_movies_df)

    # ask for recommendations across different decades
    decade_rec = input("Would you like to get recommendations across different decades? (Yes/No): ")

    if decade_rec in yes_answers:
        # rows are ordered by similarity, so first() keeps the best match of each decade
        return similar_movies_df.groupby("Decade").first().reset_index()
    if decade_rec in no_answers:
        return "ENJOY!"

    print("Invalid ")
    return similar_movies_df

In [6]:
# Start the interactive recommender; it prompts for the number of results, the
# movie title and a row number confirming the exact movie.
recommend_movies()

Enter the number of movies to be recommended (upto 100):  10
Enter the full name of the movie (initials capitalized):  Interstellar


Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,2014,Interstellar,British,Director: Christopher Nolan,Director: Christopher Nolan\r\nCast: Matthew M...,unknown,https://en.wikipedia.org/wiki/Interstellar_(film),"In the mid-21st century, crop blights and dust..."


Confirm your movie (enter number on the left):  0



=== Top 10 movies similar to "Interstellar" ===



Unnamed: 0,Movie,Genre,Year,Decade
0,Contact,science fiction,1997,1990s
1,Apollo 18,"horror, science fiction",2011,2010s
2,Surrogates,"action, science fiction",2009,2000s
3,Resident Evil: Retribution,science fiction thriller,2012,2010s
4,Terminator Genisys,action adventure science fiction,2015,2010s
5,The Angry Red Planet,sci-fi,1959,1950s
6,The Puppet Masters,science fiction,1994,1990s
7,X-Men: Days of Future Past,unknown,2014,2010s
8,Icebreaker,action,2000,2000s
9,Infini,sci-fi,2015,2010s


Would you like to get recommendations across different decades? (Yes/No):  y


Unnamed: 0,Decade,Movie,Genre,Year
0,1950s,The Angry Red Planet,sci-fi,1959
1,1990s,Contact,science fiction,1997
2,2000s,Surrogates,"action, science fiction",2009
3,2010s,Apollo 18,"horror, science fiction",2011
