In [1]:
import numpy as np
import pandas as pd
import math
import random
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

% matplotlib inline

Like in the other notebook, we're making a matrix of movie similarity to other movies.

In [2]:
dfs = []
for year in range(1940, 2018):
    dfs.append(pd.read_csv('scraped_movies/top_movies_of_%d.csv' % year, encoding = 'cp1252'))
movie_data = pd.concat(dfs)

In [3]:
dfs = []
for year in range(1940, 2018):
    dfs.append(pd.read_csv('scraped_movies/actors_for_top_movies_of_%d.csv' % year, encoding = 'utf-8'))
actors = pd.concat(dfs)

In [4]:
dfs = []
for year in range(1940, 2018):
    dfs.append(pd.read_csv('scraped_movies/keywords_for_top_movies_of_%d.csv' % year, encoding = 'utf-8'))
keywords = pd.concat(dfs)

In [5]:
title_lookup = pd.Series(movie_data.title)
title_lookup.index = movie_data.IMDbId
title_lookup = title_lookup.to_dict()



In [6]:
movie_data.index = range(len(movie_data))
actors.index = range(len(actors))
keywords.index = range(len(keywords))

But we can make two matrixes now, one encoding the information about keywords, the other the information about what actors the movies have in common.

In [7]:
def make_matrix(df, column_name, countvectoriser, tfidf): 
    sparse = countvectoriser.fit_transform(pd.Series(df[column_name].fillna('').values))
    weighted = tfidf.fit_transform(sparse)    
    matrix = weighted.dot(weighted.T)
    movies = pd.Series(countvectoriser.get_feature_names())
    return matrix

In [8]:

vlad_ = CountVectorizer(tokenizer = lambda x: x.split('|'), min_df = 5)
megatron_ = TfidfTransformer()
actor_matrix = make_matrix(actors, 'actors', vlad_, megatron_)

In [9]:

vlad = CountVectorizer(tokenizer = lambda x: x.split('|'), min_df = 10)
megatron = TfidfTransformer()
keyword_matrix = make_matrix(keywords, 'keywords', vlad_, megatron_)

This is where stuff gets kinda wild: Because the two matrixes share dimensionality and encode similar information (which movies are similar to each other), we can sum them together, rather than concatenating them. Then we can apply dimensionality reduction to this matrix that combines the two sets of information.

In [10]:
shrinky = NMF(n_components = 100)

both = actor_matrix + keyword_matrix

shrinky.fit(both.toarray())

NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=100, nls_max_iter=2000, random_state=None, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [11]:
both_shrunk_100 = shrinky.transform(both.toarray())

In [12]:

actors_shrunk_100 = shrinky.transform(actor_matrix.toarray())

In [13]:

keywords_shrunk_100 = shrinky.transform(keyword_matrix.toarray())

Because the shrunk matrixes contain the same columns, with the same meaning, we can find similar movies based on actors, or based on keywords, or, weirdest of all, we can get the embedding of a movie's keywords, and then find the most similar movies in the actors matrix. In other words, if we know one movie's actors, and another movie's keywords, we can say how similar they are, even though we have no common points of information.

In [20]:
movie = list(keywords.IMDbId.map(title_lookup)).index("Alien (1979)")
target = keywords_shrunk_100[movie].reshape(1, -1)

In [21]:
[keywords.IMDbId.map(title_lookup)[i] for i in actors_best_list]

['Red Planet (2000)',
 'Alien³ (1992)',
 'Alien Resurrection (1997)',
 'Alien (1979)',
 'Dark Star (1974)',
 'Lost in Space (1998)',
 'Battlefield Earth (2000)',
 'Mission to Mars (2000)',
 'Pitch Black (2000)',
 'Hollow Man (2000)',
 'Alien: Covenant (2017)',
 'Event Horizon (1997)',
 'SpaceCamp (1986)',
 'Life (I) (2017)',
 'AVP: Alien vs. Predator (2004)',
 'Elysium (I) (2013)',
 'Riddick (2013)',
 'Cosmos (1980– )',
 'Aliens (1986)',
 'Mama (I) (2013)']

But we can make it even stranger! We can make a matrix which has a row for each keyword or actor, and a column for whether that keyword/actor is in a particular movie. Then we can apply the same dimensionality reduction as we learned from before. We end up with an embedding of the single keyword or actor that is compatible with the embedding of movies, actors, or keywords - it's a common language to describe all of them!

In [22]:
vlad = CountVectorizer(tokenizer = lambda x: x.split('|'), min_df = 5)
megatron = TfidfTransformer()

sparse = vlad.fit_transform(pd.Series(keywords['keywords'].fillna('').values))
weighted = megatron.fit_transform(sparse)
shrunk = shrinky.transform(weighted.transpose().toarray())
keywords_df = pd.DataFrame(shrunk, index=vlad.get_feature_names())

In [23]:
vlad = CountVectorizer(tokenizer = lambda x: x.split('|'), min_df = 10)
megatron = TfidfTransformer()

sparse = vlad.fit_transform(pd.Series(actors['actors'].fillna('').values))
weighted = megatron.fit_transform(sparse)
shrunk = shrinky.transform(weighted.transpose().toarray())
actors_df = pd.DataFrame(shrunk, index=vlad.get_feature_names())


Finding keywords that go with an actor

In [51]:
target = actors_df.loc['john wayne'].values.reshape(1, -1)

In [52]:
best_list = [i for i in np.argsort(cosine_similarity(target, keywords_df))[0][::-1]][:30]
keywords_df.iloc[best_list].index

Index(['cowboys-and-indians', 'western-frontier', 'carbine', 'horse-thief',
       'yaqui-indian', 'chief', 'buried-to-the-neck', 'stars-and-stripes',
       'edited-from-tv-series', 'hung-by-wrists', 'wagon', 'bronco',
       'cherokee', 'confederate', 'land-baron', 'monument-valley',
       'kiowa-indian', 'reference-to-robert-e.-lee', 'native-american-tribe',
       'horse-riding', 'navajo-indian', 'long-range-rifle', 'horse',
       'hit-with-a-gun', 'cantina', 'falling-off-a-horse', 'renegade',
       'peace-pipe', 'native-american-attack', 'american-civil-war-veteran'],
      dtype='object')

Finding actors that go with a keyword

In [37]:
target = keywords_df.loc['blaxploitation'].values.reshape(1, -1)

In [38]:
best_list = [i for i in np.argsort(cosine_similarity(target, actors_df))[0][::-1]][:30]
actors_df.iloc[best_list].index

Index(['clifton powell', 'terrence howard', 'roger guenveur smith',
       'tyra ferrell', 'ice-t', 'leonard l. thomas', 'spike lee',
       'tamala jones', 'martin lawrence', 'theresa randle', 'bernie mac',
       'steve white', 'khandi alexander', 'tommy 'tiny' lister',
       'clarence williams iii', 'meagan good', 'ossie davis', 'lela rochon',
       'ice cube', 'robert townsend', 'giancarlo esposito', 'antonio fargas',
       'sidney poitier', 'ruby dee', 'bill nunn', 'vivica a. fox',
       'blair underwood', 'chris tucker', 'lawanda page', 'regina hall'],
      dtype='object')

Finding movies from a combination of actor and keyword

In [65]:
target = keywords_df.loc['blaxploitation'].values.reshape(1, -1) + actors_df.loc['john wayne'].values.reshape(1, -1)

best_list = [i for i in np.argsort(cosine_similarity(target, both_shrunk_100))[0][::-1]][:30]
[keywords.IMDbId.map(title_lookup)[i] for i in best_list]

['Men of Honor (2000)',
 'Posse (1993)',
 'Mandingo (1975)',
 'Car Wash (1976)',
 'Billy Two Hats (1974)',
 'Band of Angels (1957)',
 'Little Big Man (1970)',
 'Hostiles (2017)',
 'The Searchers (1956)',
 "Let's Do It Again (1975)",
 'The Brothers (2001)',
 'Higher Learning (1995)',
 'The Way West (1967)',
 'Do the Right Thing (1989)',
 'Trooper Hook (1957)',
 'Roots (1977– )',
 'The Mack (1973)',
 'School Daze (1988)',
 'McLintock! (1963)',
 'The Players Club (1998)',
 'The Unforgiven (1960)',
 'A Patch of Blue (1965)',
 'Hondo (1953)',
 'No Way Out (1950)',
 'Life (I) (1999)',
 'Think Like a Man Too (2014)',
 'Barbershop (2002)',
 'Geronimo: An American Legend (1993)',
 "A Soldier's Story (1984)",
 'The Villain (1979)']