In [39]:
import pandas as pd
import numpy as np
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.random_projection import SparseRandomProjection

# Set seaborn's global plotting theme: "notebook" scaling context and the "ticks" axes style.
sns.set_theme(context="notebook", style="ticks")


### Basic idea
|   |   |   |
|---|---|---|
|   |   |   |
|   |   |   |

Load file

In [40]:
# Read the full catalogue; the relative path means the CSV must sit in the
# notebook's working directory.
netflix_total = pd.read_csv("netflix_titles.csv")


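Remove the space after each comma in the multi-valued columns, so that splitting on "," later yields clean labels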

In [41]:
# Collapse ", " to "," in every comma-separated column so that a later split
# on "," does not leave labels with a leading space.
netflix_total = netflix_total.assign(**{
    column: netflix_total[column].str.replace(", ", ",")
    for column in ("cast", "director", "listed_in", "country")
})


Split into TV shows and movies

In [42]:
# Partition the catalogue by title type. Every frame gets a fresh 0..n-1 index;
# the previous (unnamed) index is kept as an extra "index" column.
tv = netflix_total.loc[netflix_total["type"] == "TV Show"].reset_index()
movies = netflix_total.loc[netflix_total["type"] == "Movie"].reset_index()
netflix_total = netflix_total.reset_index()


In [60]:
def preprocess_dataset(data):
    """Run the whole feature pipeline and fit the neighbour model.

    The categorical columns are binarized, the binary features are
    IDF-weighted and row-normalised, and a k-NN model is fitted on the
    outcome.

    Returns a tuple ``(features, knn)``: the weighted feature matrix and the
    model fitted on it.
    """
    features = idf_vectorize(binarize_categories(data))
    neighbour_model = fit_knn(features)
    return features, neighbour_model


def binarize_categories(data):
    """Convert the multi-valued categorical columns into one binary feature matrix.

    The columns ``director``, ``cast``, ``listed_in``, ``country`` and
    ``rating`` are each expected to hold comma-separated labels (no space
    after the comma). Every column is multi-hot encoded on its own and the
    resulting blocks are concatenated side by side, in that order.

    Missing values are treated as the empty string, which splits into the
    single label ``""``. Each column can therefore contribute a feature that
    is shared by all rows lacking a value for it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the five columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row (fresh 0..n-1 index) with consecutively
        numbered columns holding the 0/1 indicators.
    """
    categorical_columns = ("director", "cast", "listed_in", "country", "rating")

    encoded_blocks = []
    for column in categorical_columns:
        # Fill a local copy of the column. An in-place fillna through
        # attribute access mutates the caller's frame on older pandas and
        # does nothing under copy-on-write, leaving NaN that cannot be split.
        labels = data[column].fillna("").str.split(",")
        # A fresh binarizer per column keeps the label vocabularies separate.
        encoded = MultiLabelBinarizer().fit_transform(labels)
        encoded_blocks.append(pd.DataFrame(encoded))

    return pd.concat(encoded_blocks, axis=1, ignore_index=True)


def idf_vectorize(data):
    """Weight each feature by its inverse document frequency and L2-normalise rows.

    Feature ``j`` is multiplied by ``log((1 + n) / (1 + df_j)) + 1``, where
    ``n`` is the number of rows and ``df_j`` is the column sum (for 0/1 input,
    the number of rows in which the feature is set). Afterwards every row is
    divided by its Euclidean norm.

    Parameters
    ----------
    data : pandas.DataFrame
        Binary feature matrix, one row per item.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``data``. Rows have unit length, except rows
        without any active feature, which stay all-zero.
    """
    document_frequency = data.sum(axis=0)
    idf = np.log((1 + len(data)) / (1 + document_frequency)) + 1
    weighted = data * idf

    norms = np.linalg.norm(weighted, axis=1)
    # A row with no active feature has norm 0; dividing by it would yield NaN
    # and make the downstream neighbour search fail. Divide by 1 instead so
    # such rows remain zero vectors.
    safe_norms = np.where(norms == 0, 1.0, norms)

    # Transpose so the per-row norms broadcast along the row axis.
    return (weighted.T / safe_norms).T


def fit_knn(data, n_neighbors=11, metric="cosine"):
    """Fit a nearest-neighbour model on the feature matrix.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature vectors to index, one row per sample.
    n_neighbors : int, default=11
        Number of neighbours the model returns per query by default.
    metric : str, default="cosine"
        Distance metric handed to ``NearestNeighbors``.

    Returns
    -------
    sklearn.neighbors.NearestNeighbors
        The fitted estimator (``n_jobs=-1`` is always passed).
    """
    model = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, n_jobs=-1)
    return model.fit(data)


In [62]:
def recommend_show(data_reduced, data, query, knn):
    """Look up the neighbours of the title ``query`` and describe them.

    Parameters
    ----------
    data_reduced : pandas.DataFrame or numpy.ndarray
        Feature matrix that ``knn`` is queried with; row ``i`` must describe
        row ``i`` of ``data`` (positional correspondence).
    data : pandas.DataFrame
        Original catalogue with at least the columns ``title``, ``director``,
        ``cast``, ``country``, ``listed_in`` and ``type``.
    query : str
        Exact title to search for. If several rows carry this title, the
        neighbours of the last one are reported.
    knn : fitted neighbours model
        Object whose ``kneighbors(X)`` returns ``(distances, indices)``;
        expected to be ordered by increasing distance.

    Returns
    -------
    pandas.DataFrame
        At most 10 rows: a ``Score`` column followed by the descriptive
        columns, indexed by the labels of ``data``. ``Score`` is the distance
        reported by ``knn`` (smaller means more similar), and rows are in the
        reverse of the model's order, i.e. most distant first. If the model
        returns more than 10 neighbours, the closest ones are cut off; with 11
        neighbours that drops the single nearest hit, presumably the queried
        title itself.

    Raises
    ------
    ValueError
        If no row of ``data`` has the requested title.
    """
    display_columns = ["title", "director", "cast", "country", "listed_in", "type"]

    # Use a plain boolean array so the row lookup is positional and does not
    # depend on `data` and `data_reduced` sharing index labels.
    is_query = (data["title"] == query).to_numpy()
    if not is_query.any():
        raise ValueError(f"Title {query!r} not found in the dataset.")

    distances, indices = knn.kneighbors(data_reduced[is_query])
    # Keep the last matching row only and reverse the model's ordering.
    positions = np.flip(indices[-1])
    scores = np.flip(distances[-1])

    # Attach the scores positionally instead of via an index-aligned concat,
    # which would misalign when `data` has a non-default index.
    result = data.iloc[positions][display_columns].copy()
    result.insert(0, "Score", scores)
    return result.iloc[:10]


In [61]:
# Build the recommender on the complete catalogue (movies and TV shows together).
data = netflix_total
# NOTE: despite its name, `data_reduced` is the IDF-weighted, row-normalised
# feature matrix returned by preprocess_dataset; no dimensionality reduction
# is applied at this point.
data_reduced, knn = preprocess_dataset(data)


In [63]:
# Example query. "Score" in the result is the cosine distance to the queried
# title (lower = more similar); rows are listed from most to least distant.
title = "BoJack Horseman"
recommend_show(data_reduced, data, title, knn)


Unnamed: 0,Score,title,director,cast,country,listed_in,type
4846,0.833795,The Break with Michelle Wolf,,,United States,"Stand-Up Comedy & Talk Shows,TV Comedies",TV Show
4639,0.833795,Norm Macdonald Has a Show,,,United States,"Stand-Up Comedy & Talk Shows,TV Comedies",TV Show
242,0.830623,Comedy Premium League,,,,"International TV Shows,TV Comedies",TV Show
2718,0.824595,Community,,"Ken Jeong,Jim Rash,Donald Glover,Joel McHale,G...",United States,TV Comedies,TV Show
4845,0.821575,Arrested Development,,"Jason Bateman,Portia de Rossi,Will Arnett,Mich...",United States,TV Comedies,TV Show
1716,0.817524,Aunty Donna's Big Ol' House of Fun,,"Mark Samual Bonanno,Broden Kelly,Zachary Ruane...",Australia,TV Comedies,TV Show
5885,0.798847,W/ Bob & David,,"David Cross,Bob Odenkirk,John Ennis,Jay Johnst...",United States,TV Comedies,TV Show
8282,0.77643,The Drunk and on Drugs Happy Funtime Hour,,"John Paul Tremblay,Robb Wells,Maury Chaykin,Lu...",Canada,"International TV Shows,TV Comedies",TV Show
3428,0.714152,El Camino: A Breaking Bad Movie,Vince Gilligan,Aaron Paul,United States,"Dramas,Thrillers",Movie
5920,0.503678,BoJack Horseman Christmas Special: Sabrina's C...,,"Will Arnett,Aaron Paul,Alison Brie,Adam Conove...",United States,Movies,Movie


### Conclusion
- Works fine for Action/Thriller Films
- Also Comedy?
- TV shows/anime seem a bit random (?)
- Probably because the style of a Hollywood blockbuster is defined more by its director/cast than the style of a TV show or anime is
- Problem: Missing Data (Directors for TV Shows)

In [47]:
# spr = SparseRandomProjection(
#     n_components=50
# )

# data_reduced = spr.fit_transform(data_reduced)

# mapper = UMAP(
#     n_neighbors=100,
#     metric="correlation",
#     densmap=True
# )

# embedding = mapper.fit_transform(data_reduced)


In [48]:
# %matplotlib widget
# plt.figure(figsize=(6,6))
# sns.scatterplot(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     s=3,
#     # hue=movies.rating

# )


In [49]:
# import plotly.express as px

# px.scatter(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     hover_name=netflix_total.title,
#     # size=[0.01]*len(embedding)
# )
