
# Import Resources

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Embed visualisation within the notebook
%matplotlib inline

# Import Data Set

In [2]:
# Display all columns
pd.set_option('display.max_columns', None)

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/My project/Dataset CSV/all_movie.csv',delimiter=',')

# Data Cleansing
False & Missing Values

In [3]:
# Replacing missing values with ''
df = df.replace(to_replace=[
    'Cast Not Available', 
    'Director Not Available', 
    'Writer Not Available', 
    'Runtime Not Available',
    'Studio Not Available'
], value='')

# Replacing NaN for the text processing libraries that follow
df.fillna('', inplace=True)

# Aggregation
The dataset contains a duplicate row for each film genre. We need to aggregate these into a single space delimited genre value and in the process remove the duplicate films.

In [4]:
# Create a dict of axis labels -> functions for DataFrame.agg()
fields = dict(zip(list(filter(lambda a: a not in ['Title', 'Year'], df.keys())), ['first'] * len(df.keys())))
fields['Genre'] = ' '.join

# Aggregate Genre, removing duplicate films
df = df.groupby(['Title', 'Year'], as_index=False).agg(fields)

df = df.reset_index(drop=True)

# Text Feature Extraction
Removes names of people involved with the film from the description field. Names are only removed if a match is found in the remove_associated_names variable comprised of cast and crew associated with the film.

This is to preserve character names appearing as proper nouns.

In [5]:
remove_associated_names = ['Cast 1', 'Cast 2', 'Cast 3', 'Cast 4', 'Cast 5', 'Cast 6',
                         'Director 1', 'Director 2', 'Director 3', 
                         'Writer 1', 'Writer 2', 'Writer 3', 'Writer 4']

for idx, item in enumerate(df['Description']):
    item = str(item)

    for rem in remove_associated_names:
        word = df.loc[idx,F'{rem}']
        words = word.split()
        for word in words:
            item = item.replace(word,'')

        items = item.split('~',1)
        df.loc[idx,'New Description'] = items[0]

Removing stopwords, stemming and lemmatizing film descripions

In [6]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer



stopwords = stopwords.words('english')

# Add custom stopwords for the movie database
movie_db_stopwords = ['(c)', 
                    'academy', 'adapted', 'award', 
                    'big-screen', 'box-office', 'bros', 
                    'c', 'cameo', 'cameos', 'cinema', 'cinematic', 'classic', 'co-star', 'co-stars', 'co-wrote', 
                    'debut', 'director', 'directorial', 'disney', 
                    'feature', 'feature-length', 'film', 'films', 'fox', 'franchise', 
                    'genre', 
                    'leading-lady', 'leading-man', 
                    'mgm', 'movie', 'movies', 
                    'oscar', 
                    'paramount', 'pictures', 'premiere', 'produced', 'producer', 'producers', 
                    'radius', 'reliance', 'rovi', 
                    'screen', 'script', 'sequel', 'sony', 'star', 'staring', 'stars', 
                    'twc', 
                    'universal', 
                    'walt', 'warner', 'writer', 'writers']

stopwords.extend(movie_db_stopwords)


# Receives a document, performs Tokenization, stop word removal, punctuation removal and Lemmatisation
# Returns a bag of words in list format
def nlp_cleaner(document):
    # Remove punctuation
    document = re.sub(r'[^\w\s]','',document) 

    # Make all chars lowercase
    document = document.lower()

    # Tokenize
    tokenized_document = nltk.word_tokenize(document)

    # Remove stopwords
    removable_words = set(stopwords)
    stop_word_removed = []
    for words in tokenized_document:
        if words not in removable_words:
            stop_word_removed.append(words)
            
    # The lemmatization code herein is adapted from https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/ 

    # Create instance of lemmatizer
    wnl = WordNetLemmatizer()

    # Function to apply appropriate tagging to the document to identify word types as adjective, verb, noun or adverb
    # this works in conjuntion with the nltk.pos_tag function by mapping the tags to to the wordnet POS tags 
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:          
            return None

    # Tag the document using nltk.pos_tag 
    pos_tagged = nltk.pos_tag(stop_word_removed)  

    # Use the Pos_Tag mapping function to change the tags to wordnet format
    wnt = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tagged))

    lemmatized_document = []

    # Lemmatise the document using the tags to identify word types so that the base words are accurately derived
    for word, tag in wnt:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_document.append(word)
        else:        
            # else use the tag to lemmatize the token
            lemmatized_document.append(wnl.lemmatize(word, tag))

    # The document is now fully transformed into a collection of tokens that can now be utalised by NLP algorithms
    return ' '.join(lemmatized_document)


# Create a new column containing the normalised descriptions
df['NLP Optimised'] = df['New Description'].apply(nlp_cleaner)

df[['Description', 'NLP Optimised']].sample(10, random_state=1)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0,Description,NLP Optimised
6181,"K-11 follows Raymond Saxx Jr. (Goran Visnjic),...",k11 follow raymond saxx jr powerful record wak...
10776,This film's plot is set in motion by avariciou...,plot set motion avaricious ivory hunter cavana...
2666,Justin Long and Emmy Rossum are star-crossed l...,starcrossed lover whose relationship blooms un...
14393,A woman visits her rich uncle before taking he...,woman visit rich uncle take vow nun die leave ...
13887,"This bawdy, funny adaptation of Henry Fielding...",bawdy funny adaptation henry fielding novel fo...
7989,A contemporary tale of friendship set against ...,contemporary tale friendship set backdrop big ...
3455,Comic Chris Rock co-scripted and stars in this...,comic coscripted remake 1941s come mr jordan a...
9913,American diamond merchant Lucas goes to Russia...,american diamond merchant lucas go russia deal...
11649,"On behalf of Cinedigm and Tribeca Film, we are...",behalf cinedigm tribeca please share official ...
14161,The devastating reverberations of a profound t...,devastate reverberation profound tragedy echo ...


# Content-based Recommendations
TF-IDF on Description

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# instantiate TfidfVectorizer object with stopwords
tfidf = TfidfVectorizer(stop_words='english')

# Create a vector representation of the plot descriptions
tfidf_matrix = tfidf.fit_transform(df['NLP Optimised']) 

print('TF-IDF matrix:')
print(tfidf_matrix.shape[1], 'words against', tfidf_matrix.shape[0], 'films')

TF-IDF matrix:
61274 words against 15074 films



# Cosine Similarity
We're using linear_kernel because it's faster than cosine_similarity but returns the same result.

In [8]:
from sklearn.metrics.pairwise import linear_kernel

# Create cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim.shape

(15074, 15074)

In [9]:
import requests
from IPython.display import HTML, display

# Create a lookup table to find row index by film title
indices = pd.Series(df.index, index=df['Title'])

def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    film_indices = cosine_sim[idx].argsort()[-2:-12:-1] 

    df2 = df.copy()
    df2['Score'] = cosine_sim[idx][df2.index]
    
    return df2[['Title', 'Description', 'Genre', 'Score']].iloc[film_indices]

def get_posters(results):
    html = '<div>'
    for t in results['Title']:
        url = 'https://api.themoviedb.org/3/search/movie?api_key=15d2ea6d0dc1d476efbca3eba2b9bbfb&query='
        json = requests.get(url + t).json()
        if len(json['results']):
            poster = 'http://image.tmdb.org/t/p/w500/' + json['results'][0]['poster_path']
            html += '<img src="' + poster + '" width="90px" style="float: left; margin: 2px" />'
        
    html += '</div>'
    
    return html

# Output

In [11]:
results = get_recommendations("Men in Black")
display(results)
HTML(get_posters(results))

Unnamed: 0,Title,Description,Genre,Score
7402,Men in Black III,"In Men in Black 3, Agents J (Will Smith) and K...",Action Comedy SciFi,0.189196
7401,Men in Black II,"Otherworldly villains are on the loose again, ...",Comedy SciFi,0.151514
5963,James White,James White (Christopher Abbott) is a troubled...,Drama,0.127315
5517,I Married a Monster from Outer Space,"Despite its title, this is a well-regarded sci...",Classics SciFi,0.123238
4403,From Paris with Love,"This action film, directed by Pierre Morel (Ta...",Action,0.123143
10245,Species,An alien comes to Earth to breed so that her k...,SciFi,0.113325
7208,Maniac Cop 2,A killer dressed in a police uniform begins mu...,Action Horror Mystery,0.112313
1659,Black Cloud,Actor Rick Schroder makes his directorial debu...,Action Drama,0.11173
6590,Lazer Team,LAZER TEAM begins decades ago when the Search ...,SciFi,0.1081
9884,Shorts: The Adventures of the Wishing Rock,A young boy living in a cookie-cutter suburb g...,Action Comedy Kids&Family SciFi,0.107876
