# Group Project - Recommender System 

### Install package

In [None]:
#pip install rake_nltk --upgrade

In [None]:
#pip install opencv-python

In [None]:
#pip install wordcloud

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from pprint import PrettyPrinter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
from wordcloud import WordCloud
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Abstract Syntax Trees (ast) - https://docs.python.org/3/library/ast.html
# The ast module helps Python applications to process trees of the Python abstract syntax grammar
import ast                     # To convert the string of list to list
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import collections             # Implements specialized container datatypes:
                               #  OrderedDict - dict subclass that remembers the order entries were added
import operator                # Implements standard operators as functions:
                               #  itemgetter - fetches item from its operand
from rake_nltk import Rake     # Rapid Automatic Keyword Extraction algorithm
                               #  Uses English stopwords from NLTK, and all punctuation characters by default
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

In [None]:
pp = PrettyPrinter(indent=4)

### Load Data

In [None]:
# Reading data files
movies_df = pd.read_csv('data/tmdb_5000_movies.csv')
credits_df = pd.read_csv('data/tmdb_5000_credits.csv')

### Examine movies dataset

In [None]:
movies_df.shape

In [None]:
movies_df.head()

In [None]:
movies_df.describe()

In [None]:
movies_df.columns

In [None]:
# Note: overview has 31 missing values
movies_df.isnull().sum()

### Overview

In [None]:
# Display rows where overview is null
movies_df[movies_df['overview'].isnull()]

In [None]:
movies_df['overview'] = movies_df['overview'].fillna('')

In [None]:
movies_df['overview'].isnull().sum()

In [None]:
movies_df['overview'][0]

In [None]:
movies_df['title'][0]

### Unpack Genres

In [None]:
movies_df['genres'][0]

In [None]:
pp.pprint(movies_df['genres'][0])

In [None]:
type(movies_df['genres'][0])

In [None]:
# Convert string of list to list
ast.literal_eval(movies_df['genres'][0])

In [None]:
# Create function to extract values from a dict
def get_names(lst):
    """Return the 'name' value of every dict in a stringified list of dicts.

    Parameters
    ----------
    lst : str
        Text of a Python-literal list of dicts (e.g. one raw `genres` or
        `keywords` cell); it is parsed with `ast.literal_eval`.

    Returns
    -------
    list
        The 'name' entries, in their original order.
    """
    parsed = ast.literal_eval(lst)
    return [entry['name'] for entry in parsed]

In [None]:
# Extract genres names
movies_df['genres_names'] = movies_df['genres'].apply(get_names)

In [None]:
type(movies_df['genres_names'][0])

In [None]:
movies_df['genres_names'][0]

In [None]:
pd.set_option('max_colwidth', None)

In [None]:
movies_df[['genres','genres_names']].head()

### Unpack Keywords

In [None]:
# Convert string of list to list
ast.literal_eval(movies_df['keywords'][0])

In [None]:
pp.pprint(movies_df['keywords'][0])

In [None]:
type(movies_df['keywords'][0])

In [None]:
# Extract keyword names
movies_df['keywords_unpacked'] = movies_df['keywords'].apply(get_names)

In [None]:
type(movies_df['keywords_unpacked'][0])

### Examine credits dataset

In [None]:
credits_df.shape

In [None]:
# Allow strings to show full length and not get cut off
pd.set_option('max_colwidth', 500)

In [None]:
credits_df.head()

In [None]:
credits_df.describe()

In [None]:
credits_df.columns

In [None]:
credits_df.isnull().sum()

In [None]:
credits_df['title'][0]

### Extract Movie Characters from "Cast" Field

In [None]:
pp.pprint(credits_df['cast'][0])

In [None]:
ast.literal_eval(credits_df['cast'][0])

In [None]:
# Convert string of list to list
ast.literal_eval(credits_df['cast'][0])

# Create function to extract values from a dict
def get_names(lst):
    """Return the 'character' value of every dict in a stringified list of dicts.

    This reuses the name `get_names` from earlier in the notebook, but reads
    the 'character' key instead of 'name'.

    Parameters
    ----------
    lst : str
        Text of a Python-literal list of dicts (one raw `cast` cell); it is
        parsed with `ast.literal_eval`.

    Returns
    -------
    list
        The 'character' entries, in their original order.
    """
    cast_members = ast.literal_eval(lst)
    return [member['character'] for member in cast_members]

# Extract character names from the cast field
credits_df['character'] = credits_df['cast'].apply(get_names)

type(credits_df['character'][0])

credits_df['character'][0]

credits_df[['cast','character']].head()

### Extract Actors from "Cast" Field

In [None]:
# Convert string of list to list
ast.literal_eval(credits_df['cast'][0])

# Create function to extract values from a dict
def get_names(lst):
    """Return the 'name' value of every dict in a stringified list of dicts.

    Parameters
    ----------
    lst : str
        Text of a Python-literal list of dicts (here one raw `cast` cell, so
        the names are the actors); it is parsed with `ast.literal_eval`.

    Returns
    -------
    list
        The 'name' entries, in their original order.
    """
    names = []
    names.extend(member['name'] for member in ast.literal_eval(lst))
    return names

# Extract actor names from the cast field
credits_df['actors'] = credits_df['cast'].apply(get_names)

type(credits_df['actors'][0])

credits_df['actors'][0]

credits_df[['cast','actors']].head()

### Unpack Crew

In [None]:
pp.pprint(credits_df['crew'][0])

In [None]:
ast.literal_eval(credits_df['crew'][0])

In [None]:
for i in ast.literal_eval(credits_df['crew'][0]):
    print(i['job'])

In [None]:
def get_director(lst):
    """Return the names of all crew members whose job is 'director'.

    Parameters
    ----------
    lst : str
        Text of a Python-literal list of dicts with 'job' and 'name' keys
        (one raw `crew` cell); it is parsed with `ast.literal_eval`.

    Returns
    -------
    list
        Names of the matching crew members, in their original order. The job
        comparison ignores case.
    """
    crew = ast.literal_eval(lst)
    return [member['name'] for member in crew if member['job'].lower() == 'director']

In [None]:
credits_df['director'] = credits_df['crew'].apply(get_director)
pd.set_option('max_colwidth', None)
credits_df[['director']].head()

In [None]:
# Allow strings to show full length and not get cut off
pd.set_option('max_colwidth', 500)

In [None]:
credits_df.head()

### Merge Datasets

In [None]:
# Merge movie and credits dataset
merged_df = pd.merge(movies_df, credits_df, left_on='id', right_on='movie_id', suffixes=('_movies.df', '_credits.df'))

In [None]:
# save the dataframe to a CSV file to use in new notebook
merged_df.to_csv('merged_df.csv', index=False)

In [None]:
# The raw `genres` column holds a stringified list of dicts (the same format
# that `ast.literal_eval` / `get_names` parse earlier in the notebook).
# Stripping brackets and splitting on commas would cut every dict into
# fragments, so parse the literal and keep a real list of dicts instead.
# The isinstance guard keeps this cell re-runnable after the column is parsed.
merged_df['genres'] = merged_df['genres'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

### Replace 31 NaN Overviews

In [None]:
nan_rows = movies_df[movies_df['overview'] == '']
nan_rows.shape

In [None]:
nan_rows.head()

In [None]:
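# NOTE(review): merged_df was built from movies_df in the "Merge Datasets" section above,
# so the overviews written to movies_df here do not appear in merged_df — confirm whether they should.
# Star Wars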
indices_to_fill = 2912
movies_df.loc[indices_to_fill, 'overview'] = "Luke Skywalker joins forces with a Jedi Knight, a cocky pilot, a Wookiee and two droids to save the galaxy from the Empire's world-destroying battle station, while also attempting to rescue Princess Leia from the mysterious Darth Vader."

# The Godfather: Part II
indices_to_fill = 2731
movies_df.loc[indices_to_fill, 'overview'] = "The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his grip on the family crime syndicate."

# Chiamatemi Francesco - Il Papa della gente
indices_to_fill = 2656
movies_df.loc[indices_to_fill, 'overview'] = "The story of Pope Francis' life."

# The Imitation Game
indices_to_fill = 2522
movies_df.loc[indices_to_fill, 'overview'] = "During World War II, the English mathematical genius Alan Turing tries to crack the German Enigma code with help from fellow mathematicians while attempting to come to terms with his troubled private life."

# spirited away 
indices_to_fill = 2294
movies_df.loc[indices_to_fill, 'overview'] = "During her family's move to the suburbs, a sullen 10-year-old girl wanders into a world ruled by gods, witches and spirits, a world where humans are changed into beasts."

# Back to the Future
indices_to_fill = 2285
movies_df.loc[indices_to_fill, 'overview'] = "Marty McFly, a 17-year-old high school student, is accidentally sent 30 years into the past in a time-traveling DeLorean invented by his close friend, the maverick scientist Doc Brown."

In [None]:
# Food Chains
indices_to_fill = 4431
movies_df.loc[indices_to_fill, 'overview'] = "There is so much interest in food these days yet there is almost no interest in the hands that pick that food. In the US, farm labor has always been one of the most difficult and poorly paid jobs and has relied on some of the nation's most vulnerable people. While the legal restrictions which kept people bound to farms, like slavery, have been abolished, exploitation still exists, ranging from wage theft to modern-day slavery. These days, this exploitation is perpetuated by the corporations at the top of the food chain: supermarkets. Their buying power has kept wages pitifully low and has created a scenario where desperately poor people are willing to put up with anything to keep their jobs."

# whiplash
indices_to_fill = 3865
movies_df.loc[indices_to_fill, 'overview'] = "A promising young drummer enrolls at a cut-throat music conservatory where his dreams of greatness are mentored by an instructor who will stop at nothing to realize a student's potential."

# The godfather
indices_to_fill = 3337
movies_df.loc[indices_to_fill, 'overview'] = "The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son."

# pulp fiction
indices_to_fill = 3232
movies_df.loc[indices_to_fill, 'overview'] = "The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption."

In [None]:
# forrest gump
indices_to_fill = 809
movies_df.loc[indices_to_fill, 'overview'] = "The presidencies of Kennedy and Johnson, the Vietnam War, the Watergate scandal and other historical events unfold from the perspective of an Alabama man with an IQ of 75, whose only desire is to be reunited with his childhood sweetheart."

# se7en
indices_to_fill = 1553
movies_df.loc[indices_to_fill, 'overview'] = "Two detectives, a rookie and a veteran, hunt a serial killer who uses the seven deadly sins as his motives."

# schindler's list
indices_to_fill = 1818
movies_df.loc[indices_to_fill, 'overview'] = "In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis."

# shawshank redemption
indices_to_fill = 1881
movies_df.loc[indices_to_fill, 'overview'] = "Over the course of several years, two convicts form a friendship, seeking consolation and, eventually, redemption through basic compassion."

# The Empire Strikes Back
indices_to_fill = 1990
movies_df.loc[indices_to_fill, 'overview'] = "After the Rebels are overpowered by the Empire, Luke Skywalker begins his Jedi training with Yoda, while his friends are pursued across the galaxy by Darth Vader and bounty hunter Boba Fett."

# The Silence of the Lambs	
indices_to_fill = 2091
movies_df.loc[indices_to_fill, 'overview'] = "A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims."

In [None]:
# The Wolf of Wall Street
indices_to_fill = 298
movies_df.loc[indices_to_fill, 'overview'] = "Based on the true story of Jordan Belfort, from his rise to a wealthy stock-broker living the high life to his fall involving crime, corruption and the federal government."

# The Lord of the Rings: The Return of the King
indices_to_fill = 329
movies_df.loc[indices_to_fill, 'overview'] = "Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam as they approach Mount Doom with the One Ring."

# The Lord of the Rings: The Two Towers
indices_to_fill = 330
movies_df.loc[indices_to_fill, 'overview'] = "While Frodo and Sam edge closer to Mordor with the help of the shifty Gollum, the divided fellowship makes a stand against Sauron's new ally, Saruman, and his hordes of Isengard."

# The Lion King
indices_to_fill = 494
movies_df.loc[indices_to_fill, 'overview'] = "Lion prince Simba and his father are targeted by his bitter uncle, who wants to ascend the throne himself."

# The Matrix (1999)
indices_to_fill = 634
movies_df.loc[indices_to_fill, 'overview'] = "When a beautiful stranger leads computer hacker Neo to a forbidding underworld, he discovers the shocking truth--the life he knows is the elaborate deception of an evil cyber-intelligence."

# Fight Club
indices_to_fill = 662
movies_df.loc[indices_to_fill, 'overview'] = "An insomniac office worker and a devil-may-care soap maker form an underground fight club that evolves into much more."

# the green mile
indices_to_fill = 690
movies_df.loc[indices_to_fill, 'overview'] = "The lives of guards on Death Row are affected by one of their charges: a black man accused of child murder and rape, yet who has a mysterious gift."


In [None]:
# the Dark Knight
indices_to_fill = 65
movies_df.loc[indices_to_fill, 'overview'] = "When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice."

# inside out
indices_to_fill = 77
movies_df.loc[indices_to_fill, 'overview'] = "After young Riley is uprooted from her Midwest life and moved to San Francisco, her emotions - Joy, Fear, Anger, Disgust and Sadness - conflict on how best to navigate a new city, house, and school."

# Guardians of the galaxy
indices_to_fill = 94
movies_df.loc[indices_to_fill, 'overview'] = "A group of intergalactic criminals must pull together to stop a fanatical warrior with plans to purge the universe."

# Interstellar
indices_to_fill = 95
movies_df.loc[indices_to_fill, 'overview'] = "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival."

# Inception
indices_to_fill = 96
movies_df.loc[indices_to_fill, 'overview'] = "A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O., but his tragic past may doom the project and his team to disaster."

# The Lord of the Rings: The Fellowship of the Ring
indices_to_fill = 262
movies_df.loc[indices_to_fill, 'overview'] = "A meek Hobbit from the Shire and eight companions set out on a journey to destroy the powerful One Ring and save Middle-earth from the Dark Lord Sauron."

# Django Unchained
indices_to_fill = 287
movies_df.loc[indices_to_fill, 'overview'] = "With the help of a German bounty-hunter, a freed slave sets out to rescue his wife from a brutal plantation owner in Mississippi."

In [None]:
nan_rows = movies_df[movies_df['overview'] == '']
print(nan_rows.shape)
nan_rows

In [None]:
movies_df.iloc[4140]

In [None]:
# Drop the movie with a low budget, no ratings, no overview, and insufficient data
movies_df = movies_df.drop(index=4140)

In [None]:
nan_rows = movies_df[movies_df['overview'] == '']
nan_rows

### EDA Visualizations

In [None]:
C= movies_df['vote_average'].mean()
m= movies_df['vote_count'].quantile(0.9)

In [None]:
count_movies = movies_df.copy().loc[movies_df['vote_count'] >= m]
count_movies.shape

In [None]:
def weighted_rating(x, m=m, C=C):
    """Compute the IMDB-style weighted rating for one movie row.

    Parameters
    ----------
    x : row-like
        Must provide 'vote_count' and 'vote_average'.
    m : number
        Vote-count weight. Defaults to the notebook-level `m` (the 0.9
        quantile of `vote_count`) as it was when this function was defined.
    C : number
        Prior rating. Defaults to the notebook-level `C` (the mean of
        `vote_average`) as it was when this function was defined.

    Returns
    -------
    number
        `v/(v+m) * R + m/(v+m) * C`, where `v` is the vote count and `R` the
        vote average of the row.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    denominator = votes + m
    # Blend the movie's own rating with the prior, weighted by how many votes it has.
    own_part = votes / denominator * rating
    prior_part = m / denominator * C
    return own_part + prior_part

In [None]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
count_movies['score'] = count_movies.apply(weighted_rating, axis=1)

In [None]:
#Sort movies based on score calculated above
count_movies = count_movies.sort_values('score', ascending=False)

#Print the top 10 movies
count_movies[['title', 'vote_count', 'vote_average', 'score']].head(10)

In [None]:
pop= movies_df.sort_values('popularity', ascending=False)
import matplotlib.pyplot as plt
plt.figure(figsize=(12,4))

plt.barh(pop['title'].head(6),pop['popularity'].head(6), align='center',
        color='skyblue')
plt.gca().invert_yaxis()
plt.xlabel("Popularity")
plt.title("Popular Movies")

In [None]:
# WordCloud
# Collect the raw keywords column as a list of strings
keywords = merged_df['keywords'].tolist()

In [None]:
all_keywords = ' '.join(keywords)

In [None]:
wordcloud = WordCloud(width=700, height=700, background_color='grey', max_words=200, colormap='plasma').generate(all_keywords)

In [None]:
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
type(merged_df[['runtime']])

In [None]:
plt.figure(figsize=(10, 6))  # Set the size of the figure

plt.hist(merged_df['runtime'], bins=50, color='green')

plt.title("Distribution of Average Runtime", fontsize=18)
plt.xlabel("Average Runtime", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.show()

In [None]:
plt.figure(figsize=(10, 6))  # Set the size of the figure

plt.hist(merged_df['vote_average'], bins=50, color='orange')
plt.tick_params(axis='both', which='major', labelsize=14)  # Set the font size of the tick labels
plt.title("Distribution of Vote Average", fontsize=18)
plt.xlabel("Average Votes", fontsize=14)
plt.ylabel("Frequency", fontsize=14)

plt.show()

In [None]:
plt.figure(figsize=(10, 6))  # Set the size of the figure

plt.hist(merged_df['vote_count'], bins=50, color='darkblue')  # Set the color of the histogram

plt.title("Distribution of Vote Count", fontsize=18)  # Set the font size of the title
plt.xlabel("Vote Counts", fontsize=14)  # Set the font size of the x-axis label
plt.ylabel("Frequency", fontsize=14)  # Set the font size of the y-axis label

plt.tick_params(axis='both', which='major', labelsize=12)  # Set the font size of the tick labels

plt.show()

### Review token (vocabulary) frequency distribution before removing stop words

In [None]:
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent tokens in `corpus` as (token, count) pairs.

    Tokens are counted with a default `CountVectorizer` (no stop-word
    removal). Pairs are sorted by descending count; `n=None` returns all.
    """
    vectorizer = CountVectorizer().fit(corpus)
    # Summing over axis 0 gives one total per vocabulary column.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((token, totals[0, column]) for token, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


common_words = get_top_n_words(merged_df['overview'], 20)
print('Top 20 words in movies overview before removing stop words')
common_words

### Review token (vocabulary) frequency distribution after removing stop words

In [None]:
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent non-stop-word tokens in `corpus`.

    Tokens are counted with `CountVectorizer(stop_words='english')`. The
    result is a list of (token, count) pairs sorted by descending count;
    `n=None` returns all of them.
    """
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    pairs = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]


common_words = get_top_n_words(merged_df['overview'], 20)
print('Top 20 words in movies overview after removing stop words')
common_words

### Bigrams frequency distribution before removing stop words

In [None]:
def get_top_n_bigram(corpus, n=None):
    """Return the `n` most frequent bigrams in `corpus` as (bigram, count) pairs.

    Bigrams are counted with `CountVectorizer(ngram_range=(2, 2))` (no
    stop-word removal). Pairs are sorted by descending count; `n=None`
    returns all of them.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    counts_matrix = vectorizer.transform(corpus)
    totals = counts_matrix.sum(axis=0)
    bigram_counts = []
    for bigram, column in vectorizer.vocabulary_.items():
        bigram_counts.append((bigram, totals[0, column]))
    return sorted(bigram_counts, key=lambda pair: pair[1], reverse=True)[:n]


common_words = get_top_n_bigram(merged_df['overview'], 20)
print('Top 20 bigrams in movies overview before removing stop words')
common_words

### Bigrams frequency distribution after removing stop words

In [None]:
def get_top_n_bigram(corpus, n=None):
    """Return the `n` most frequent bigrams in `corpus`, English stop words removed.

    Bigrams are counted with
    `CountVectorizer(ngram_range=(2, 2), stop_words='english')`. The result is
    a list of (bigram, count) pairs sorted by descending count; `n=None`
    returns all of them.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


common_words = get_top_n_bigram(merged_df['overview'], 20)
print('Top 20 bigrams in movies overview after removing stop words')
common_words

In [None]:
# Allow strings to show full length and not get cut off
pd.set_option('max_colwidth', 100)

In [None]:
def get_top_n_trigram(corpus, n=None):
    """Return the `n` most frequent trigrams in `corpus`, English stop words removed.

    Trigrams are counted with
    `CountVectorizer(ngram_range=(3, 3), stop_words='english')`. The result is
    a list of (trigram, count) pairs sorted by descending count; `n=None`
    returns all of them.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    trigram_counts = [
        (trigram, totals[0, column])
        for trigram, column in vectorizer.vocabulary_.items()
    ]
    trigram_counts.sort(key=lambda pair: pair[1], reverse=True)
    return trigram_counts[:n]


common_words = get_top_n_trigram(merged_df['overview'], 20)
print('Top 20 trigrams in movies overview after removing stop words')
common_words

## Preprocess Data

In [None]:
# Allow strings to show full length and not get cut off
pd.set_option('max_colwidth', 100)

In [None]:
# Define stop words list
stopwords = nltk.corpus.stopwords.words('english')     # All English Stopwords

# Instantiate Porter stemmer
ps = nltk.PorterStemmer()

In [None]:
merged_df.columns

In [None]:
merged_df[['movie_id','original_title','overview','title_movies.df','genres_names','keywords_unpacked','title_credits.df','actors','director']]

In [None]:
# Create function to clean_text
def clean_text(text):
    """Lower-case, strip punctuation, drop stop words and stem a piece of text.

    Parameters
    ----------
    text : str or list/tuple of str
        Free text (e.g. an overview) or a list of phrases (e.g. unpacked
        keywords). A list is joined with single spaces before cleaning.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ('' when nothing is left).

    Relies on the notebook-level `stopwords` list and `ps` Porter stemmer.
    """
    # Iterating a list directly would test whole phrases against
    # string.punctuation and concatenate them with no separator, fusing
    # neighbouring keywords into one token.
    if isinstance(text, (list, tuple)):
        text = ' '.join(str(item) for item in text)
    no_punct = ''.join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # Skip the empty tokens re.split yields at the edges so they do not turn
    # into stray spaces in the result.
    stems = [ps.stem(token) for token in tokens if token and token not in stopwords]
    return ' '.join(stems)

In [None]:
# Apply clean_text function to clean movie description field
merged_df['overview_clean'] = merged_df['overview'].apply(clean_text)
merged_df.head()

In [None]:
merged_df['keywords_clean'] = merged_df['keywords_unpacked'].apply(clean_text)
merged_df.head()

In [None]:
# there are 412 movies without keywords
merged_df[merged_df['keywords_clean'] == '']

In [None]:
# Create function to look up and print a movie's cleaned keywords by index
def print_keywords(index):
    """Print the title and cleaned keywords of the `merged_df` row labelled `index`.

    Prints nothing when no row carries that index label.
    """
    matches = merged_df.loc[merged_df.index == index, ['keywords_clean', 'original_title']]
    # Test for a match before indexing: `.values[0]` on an empty selection
    # raises IndexError, so a length check placed after it can never help.
    if matches.empty:
        return
    keywords, name = matches.values[0]
    print('Name:', name)
    print('Keywords:', keywords)

In [None]:
print_keywords(4800)

In [None]:
# Create function to look up and print a movie's cleaned overview by index
def print_description(index):
    """Print the title and cleaned overview of the `merged_df` row labelled `index`.

    Prints nothing when no row carries that index label.
    """
    matches = merged_df.loc[merged_df.index == index, ['overview_clean', 'original_title']]
    # Test for a match before indexing: `.values[0]` on an empty selection
    # raises IndexError, so a length check placed after it can never help.
    if matches.empty:
        return
    description, name = matches.values[0]
    print('Name:', name)
    print('Description:', description)

In [None]:
print_description(4800)

In [None]:
print_description(100)

### Vectorization & Measuring Techniques

In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
merged_df['overview_clean'] = merged_df['overview_clean'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(merged_df['overview_clean'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(merged_df.index, index=merged_df['title_movies.df'])
# `Series.drop_duplicates()` compares the values (row positions, which are
# already unique), so it never removes a repeated title. De-duplicate on the
# title index instead, keeping the first row for each title, so that
# `indices[title]` is always a single position.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_movie_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        Key looked up in the notebook-level `indices` Series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix indexed by row position. Defaults to the
        notebook-level `cosine_sim` as it was when this function was defined.

    Returns
    -------
    pandas.Series
        `merged_df['title_movies.df']` for the 10 best-scoring other movies,
        highest similarity first.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once in `indices` yields a Series of
    # positions; use the first so that `cosine_sim[idx]` is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its similarity score. The query movie is
    # skipped explicitly instead of assuming it always sorts into first place
    # (a tie or an all-zero vector would break that assumption).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort by similarity (stable, descending) and keep the 10 best
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:10]

    # Get the movie indices
    movie_indices = [position for position, _ in sim_scores]

    # Return the top 10 most similar movies
    return merged_df['title_movies.df'].iloc[movie_indices]

In [None]:
get_movie_recommendations('The Dark Knight Rises')

In [None]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

# These are the raw columns the next cells read as Python objects:
# `crew` feeds get_director, and `cast` / `keywords` / `genres` feed get_list.
# The derived columns (character, director, genres_names) are already lists
# and keywords_clean is plain text, so passing them to literal_eval raises.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    # Only parse strings, so the cell can be re-run and tolerates a column
    # that an earlier cell has already converted.
    merged_df[feature] = merged_df[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [None]:
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    """Return the name of the first crew member whose job is 'director'.

    Parameters
    ----------
    x : list of dict, or str
        Crew entries with 'job' and 'name' keys. A string is treated as the
        stringified form of such a list and parsed with `ast.literal_eval`.

    Returns
    -------
    str or float
        The director's name, or `np.nan` when no entry matches.
    """
    # Accept the raw stringified crew column as well as an already-parsed list.
    if isinstance(x, str):
        x = ast.literal_eval(x)
    for member in x:
        # Compare case-insensitively, matching the earlier list-returning
        # get_director; an exact match on 'director' misses 'Director'.
        if member['job'].lower() == 'director':
            return member['name']
    return np.nan

In [None]:
# Returns at most the first 3 names of the list (the entire list when it has 3 or fewer).
def get_list(x):
    """Return up to the first three 'name' values from a list of dicts.

    Parameters
    ----------
    x : list of dict, or anything else
        Entries with a 'name' key.

    Returns
    -------
    list
        The first three names (or fewer when the list is shorter). An empty
        list is returned when `x` is not a list (missing/malformed data).
    """
    if not isinstance(x, list):
        # Missing or malformed data
        return []
    every_name = [entry['name'] for entry in x]
    return every_name[:3]

In [None]:
# Define new director, cast, genres and keywords features that are in a suitable form.
merged_df['director'] = merged_df['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    merged_df[feature] = merged_df[feature].apply(get_list)

In [None]:
# Print the new features of the first 5 films
merged_df[['original_title', 'character', 'director', 'keywords_unpacked', 'genres_names']].head(5)

In [None]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Lower-case a string (or each string in a list) and remove its spaces.

    Parameters
    ----------
    x : list of str, str, or anything else

    Returns
    -------
    list or str
        A list input yields a list of cleaned strings, a string input yields
        one cleaned string, and any other input (e.g. a missing director)
        yields ''.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: treat as missing
    return ''

In [None]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    merged_df[feature] = merged_df[feature].apply(clean_data)

In [None]:
def create_group(x):
    """Build one space-separated text blob from a row's metadata.

    Parameters
    ----------
    x : row-like
        Must provide 'keywords', 'cast' and 'genres' (iterables of str) and
        'director' (str).

    Returns
    -------
    str
        Keywords, cast, director and genres, in that order, separated by
        single spaces.
    """
    sections = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(sections)
merged_df['group'] = merged_df.apply(create_group, axis=1)

In [None]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(merged_df['group'])

In [None]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Reset index of our main DataFrame and construct reverse mapping as before.
# drop=True discards the old index instead of inserting it as a new column;
# without it each re-run of this cell adds another column ('index', then
# 'level_0') and eventually raises because the column already exists.
merged_df = merged_df.reset_index(drop=True)
indices = pd.Series(merged_df.index, index=merged_df['original_title'])

In [None]:
get_movie_recommendations('The Dark Knight', cosine_sim2)

In [None]:
get_movie_recommendations('The Shawshank Redemption', cosine_sim2)

In [None]:
get_movie_recommendations('Frozen', cosine_sim2)

### Movie Posters

In [None]:
from PIL import Image

# (source poster, output GIF) pairs. Each poster becomes its own GIF.
poster_to_gif = [
    ('dark.jpg', 'my_gif.gif'),
    ('Begins.jpg', 'my_gif1.gif'),
    ('Returns.jpg', 'my_gif2.gif'),
    ('JFK.jpg', 'my_gif3.gif'),
    ('Superman.jpg', 'my_gif4.gif'),
]

# Set the size of the images
width, height = 500, 500

# Set the duration of each frame in the GIF (in milliseconds)
duration = 6000

for poster_path, gif_path in poster_to_gif:
    # The context manager closes the source file once the converted,
    # resized RGBA copy has been made.
    with Image.open(poster_path) as source:
        frame = source.convert('RGBA').resize((width, height))

    # Append this poster's own second frame. The earlier version passed
    # `frames[1:]` (the first poster's frame) to every save call, so every
    # GIF after the first ended with the wrong poster.
    frame.save(
        gif_path,
        format='GIF',
        append_images=[frame.copy()],
        save_all=True,
        duration=duration,
        loop=0,
    )

In [None]:
from IPython.display import Image  # NOTE: rebinds `Image`, which the previous cell imported from PIL

# Read the bytes of each generated GIF file into an IPython Image object.
# NOTE(review): the files are GIFs but are passed with format='png' — confirm this is intentional.
with open('my_gif.gif','rb') as f1, open('my_gif1.gif','rb') as f2, open('my_gif2.gif','rb') as f3, open('my_gif3.gif','rb') as f4, open('my_gif4.gif','rb') as f5:
    image1 = Image(data=f1.read(), format='png')
    image2 = Image(data=f2.read(), format='png')
    image3 = Image(data=f3.read(), format='png')
    image4 = Image(data=f4.read(), format='png')
    image5 = Image(data=f5.read(), format='png')

# `display` is not imported anywhere in this notebook — presumably the IPython builtin; verify.
display(image1, image2, image3, image4, image5)

In [None]:
from PIL import Image

# (source poster, output GIF) pairs. Each poster becomes its own GIF.
poster_to_gif = [
    ('jungle.jpg', 'my_gif5.gif'),
    ('mulan.jpg', 'my_gif6.gif'),
    ('aladdin.jpg', 'my_gif7.gif'),
    ('land.jpg', 'my_gif8.gif'),
    ('cave.jpg', 'my_gif9.gif'),
]

# Set the size of the images
width, height = 500, 500

# Set the duration of each frame in the GIF (in milliseconds)
duration = 6000

for poster_path, gif_path in poster_to_gif:
    # The context manager closes the source file once the converted,
    # resized RGBA copy has been made.
    with Image.open(poster_path) as source:
        frame = source.convert('RGBA').resize((width, height))

    # Append this poster's own second frame. The earlier version passed
    # `frames[1:]` (the first poster's frame) to every save call, so every
    # GIF after the first ended with the wrong poster.
    frame.save(
        gif_path,
        format='GIF',
        append_images=[frame.copy()],
        save_all=True,
        duration=duration,
        loop=0,
    )

In [None]:
from IPython.display import Image  # NOTE: rebinds `Image`, which the previous cell imported from PIL

# Read the bytes of each generated GIF file into an IPython Image object.
# NOTE(review): the files are GIFs but are passed with format='png' — confirm this is intentional.
with open('my_gif5.gif','rb') as f1, open('my_gif6.gif','rb') as f2, open('my_gif7.gif','rb') as f3, open('my_gif8.gif','rb') as f4, open('my_gif9.gif','rb') as f5:
    image1 = Image(data=f1.read(), format='png')
    image2 = Image(data=f2.read(), format='png')
    image3 = Image(data=f3.read(), format='png')
    image4 = Image(data=f4.read(), format='png')
    image5 = Image(data=f5.read(), format='png')

# `display` is not imported anywhere in this notebook — presumably the IPython builtin; verify.
display(image1, image2, image3, image4, image5)