In [58]:
import pandas as pd
import numpy as np
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.random_projection import SparseRandomProjection
from sklearn.compose import ColumnTransformer

# Apply seaborn's "notebook" context and "ticks" style as the plotting theme.
sns.set_theme(context="notebook", style="ticks")


### Basic idea
|   |   |   |
|---|---|---|
|   |   |   |
|   |   |   |

Load file

In [59]:
# Read netflix_titles.csv (path relative to the current working directory)
# into a DataFrame; every later cell derives its data from this frame.
netflix_total = pd.read_csv("netflix_titles.csv")


Remove the space after each comma in the multi-valued columns (cast, director, listed_in, country)

In [60]:
# Collapse every ", " to "," in the comma-separated text columns so that the
# entries of each cell are delimited by a bare comma.
for column in ("cast", "director", "listed_in", "country"):
    netflix_total[column] = netflix_total[column].str.replace(", ", ",")


Split into TV shows and movies

In [61]:
# Select each subset from the not-yet-reset frame, then give every frame a
# fresh 0..n-1 index. reset_index() keeps the previous index as an "index"
# column, exactly as the in-place variant does.
is_tv_show = netflix_total["type"] == "TV Show"
is_movie = netflix_total["type"] == "Movie"

tv = netflix_total[is_tv_show].reset_index()
movies = netflix_total[is_movie].reset_index()
netflix_total = netflix_total.reset_index()


In [98]:
def preprocess_dataset(data):
    """Run the full feature pipeline and fit the neighbour model.

    The frame is one-hot encoded, the resulting features are IDF-weighted,
    and a nearest-neighbour model is fitted on the weighted matrix.

    Returns
    -------
    tuple
        ``(features, knn)``: the weighted feature matrix and the model fitted
        on it.
    """
    features = idf_vectorize(binarize_categories(data))
    return features, fit_knn(features)


def binarize_categories(data):
    """One-hot encode the categorical metadata columns of ``data``.

    The columns ``director``, ``cast``, ``listed_in``, ``country`` and
    ``rating`` are each encoded as a whole: every distinct cell value of a
    column becomes one binary feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the five columns listed above.

    Returns
    -------
    numpy.ndarray
        Dense matrix of 0/1 values with one row per row of ``data``.
    """
    columns = ["director", "cast", "listed_in", "country", "rating"]

    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword; fall back to the old spelling on older releases.
    try:
        oh_enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        oh_enc = OneHotEncoder(sparse=False)

    # Encode the frame that was passed in, not the notebook-level global, so
    # the function also works for subsets such as the movie-only frame.
    data_binarized = oh_enc.fit_transform(data[columns])

    return data_binarized


def idf_vectorize(data):
    """Scale every feature column by its smoothed inverse document frequency.

    For a column with document frequency ``df`` (its column sum) in a matrix
    of ``n`` rows, the weight is ``log((1 + n) / (1 + df)) + 1``. The input is
    not modified; a new weighted matrix is returned.
    """
    n_documents = len(data)
    document_frequency = data.sum(axis=0)
    weights = 1 + np.log((n_documents + 1) / (document_frequency + 1))
    return data * weights.T


def fit_knn(data, n_neighbors=11):
    """Fit a nearest-neighbour model on the feature matrix.

    Parameters
    ----------
    data : array-like
        Feature matrix with one row per item.
    n_neighbors : int, default 11
        Number of neighbours the model returns per query by default.

    Returns
    -------
    sklearn.neighbors.NearestNeighbors
        Model fitted on ``data`` with the correlation metric, using all
        available processors (``n_jobs=-1``).
    """
    knn = NearestNeighbors(
        n_neighbors=n_neighbors,
        metric="correlation",
        n_jobs=-1
    ).fit(data)

    return knn


In [99]:
def recommend_show(data_reduced, data, query, knn):
    """Look up the nearest neighbours of a title and return their metadata.

    Parameters
    ----------
    data_reduced : array-like
        Feature matrix that is passed to ``knn``; row ``i`` is taken to
        describe row ``i`` of ``data``.
    data : pandas.DataFrame
        Original dataset. Must contain the columns ``title``, ``director``,
        ``cast``, ``country``, ``listed_in`` and ``type``.
    query : str
        Title to look up; it is compared for exact equality with ``title``.
    knn : object
        Fitted neighbour model exposing ``kneighbors``.

    Returns
    -------
    pandas.DataFrame
        At most ten rows: a ``Score`` column holding the distance reported by
        ``knn`` followed by the metadata columns. Rows are ordered from the
        farthest neighbour to the nearest, so when more than ten neighbours
        are returned the closest ones are the rows that get cut off.

    Raises
    ------
    ValueError
        If no row of ``data`` has ``title == query``.
    """
    matches = (data.title == query).to_numpy()
    if not matches.any():
        # Fail early with a readable message instead of passing an empty
        # query matrix to the neighbour model.
        raise ValueError(f"Title {query!r} not found in the dataset.")

    distances, indices = knn.kneighbors(data_reduced[matches])

    # When several rows share the title, the last match is used. Both arrays
    # are reversed so the farthest neighbour comes first.
    scores = np.flip(distances[-1])
    positions = np.flip(indices[-1])

    # ``positions`` are row positions, so select with iloc and attach the
    # scores positionally; this stays correct when ``data`` does not have a
    # default 0..n-1 index.
    result = data.iloc[positions][
        ["title", "director", "cast", "country", "listed_in", "type"]
    ].copy()
    result.insert(0, "Score", scores)

    return result.iloc[:10]


In [100]:
# Build the feature matrix and neighbour model on the complete frame
# (movies and TV shows together).
# NOTE: despite its name, data_reduced is the IDF-weighted matrix returned by
# preprocess_dataset; the random-projection step further down is commented out.
data = netflix_total
data_reduced, knn = preprocess_dataset(data)


In [101]:
# Example query; the string must match an entry of the title column exactly.
title = "Stargate"
recommend_show(data_reduced, data, title, knn)


Unnamed: 0,Score,title,director,cast,country,listed_in,type
7581,0.78981,Next,Lee Tamahori,"Nicolas Cage,Julianne Moore,Jessica Biel,Thoma...",United States,"Action & Adventure,Sci-Fi & Fantasy",Movie
143,0.788298,Green Lantern,Martin Campbell,"Ryan Reynolds,Blake Lively,Peter Sarsgaard,Mar...",United States,"Action & Adventure,Sci-Fi & Fantasy",Movie
4484,0.783281,Scorpion King 5: Book of Souls,Don Michael Paul,"Zach McGowan,Pearl Thusi,Nathan Jones,Peter Me...",United States,"Action & Adventure,Sci-Fi & Fantasy",Movie
8034,0.777524,Skydancers,Fredric Lean,,"United States,France","Documentaries,Sports Movies",Movie
5829,0.764599,Popples,,"Erin Fitzgerald,Wendee Lee,Cassandra Morris,Ci...","United States,France",Kids' TV,TV Show
1048,0.764599,Power Players,,"Kieran Walton,Carlos Salazar,Greg Chun,Jamieso...","United States,France",Kids' TV,TV Show
356,0.748713,The Losers,Sylvain White,"Jeffrey Dean Morgan,Zoe Saldana,Chris Evans,Id...","United States,France",Action & Adventure,Movie
1387,0.747477,The Next Three Days,Paul Haggis,"Russell Crowe,Elizabeth Banks,Brian Dennehy,Le...","United States,France","Dramas,Thrillers",Movie
5981,0.669284,"10,000 B.C.",Roland Emmerich,"Steven Strait,Camilla Belle,Cliff Curtis,Joel ...","United States,South Africa",Action & Adventure,Movie
1113,0.478474,2012,Roland Emmerich,"John Cusack,Amanda Peet,Chiwetel Ejiofor,Thand...",United States,"Action & Adventure,Sci-Fi & Fantasy",Movie


### Conclusion
- Works fine for Action/Thriller Films
- Also Comedy?
- Recommendations for TV shows/anime seem a bit random (?)
- Probably because the style of a Hollywood blockbuster is defined more by its director/cast than that of a TV show or anime
- Problem: Missing Data (Directors for TV Shows)

In [102]:
# spr = SparseRandomProjection(
#     n_components=50
# )

# data_reduced = spr.fit_transform(data_reduced)

# mapper = UMAP(
#     n_neighbors=100,
#     metric="correlation",
#     densmap=True
# )

# embedding = mapper.fit_transform(data_reduced)


In [103]:
# %matplotlib widget
# plt.figure(figsize=(6,6))
# sns.scatterplot(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     s=3,
#     # hue=movies.rating

# )


In [104]:
# import plotly.express as px

# px.scatter(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     hover_name=netflix_total.title,
#     # size=[0.01]*len(embedding)
# )
