In [1]:
import pandas as pd
import numpy as np
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.random_projection import SparseRandomProjection

sns.set_theme(context="notebook", style="ticks")


### Basic idea
|   |   |   |
|---|---|---|
|   |   |   |
|   |   |   |

Load file

In [2]:
netflix_total = pd.read_csv("netflix_titles.csv")


Remove spaces

In [3]:
netflix_total.cast = netflix_total.cast.str.replace(", ", ",")
netflix_total.director = netflix_total.director.str.replace(", ", ",")
netflix_total.listed_in = netflix_total.listed_in.str.replace(", ", ",")
netflix_total.country = netflix_total.country.str.replace(", ", ",")


Split in TV Shows and Movies

In [4]:
tv = netflix_total[netflix_total.type == "TV Show"]
movies = netflix_total[netflix_total.type == "Movie"]

netflix_total.reset_index(inplace=True)
movies.reset_index(inplace=True)
tv.reset_index(inplace=True)


In [21]:
def preprocess_dataset(data):
    """Main preprocessing steps."""
    data = binarize_categories(data)
    data = idf_vectorize(data)
    knn = fit_knn(data)
    return data, knn


def binarize_categories(data):
    """Convert categorical values in binary feature vectors."""
    data.director[data.director.isna()] = [""]
    data.cast[data.cast.isna()] = [""]
    data.listed_in[data.listed_in.isna()] = [""]
    data.country[data.country.isna()] = [""]
    data.rating[data.rating.isna()] = [""]

    mlb = MultiLabelBinarizer()
    director_binarized = mlb.fit_transform(data.director.str.split(","))
    cast_binarized = mlb.fit_transform(data.cast.str.split(","))
    listed_in_binarized = mlb.fit_transform(data.listed_in.str.split(","))
    countries_binarized = mlb.fit_transform(data.country.str.split(","))
    rating_binarized = mlb.fit_transform(data.rating.str.split(","))

    data_binarized = pd.concat([
        pd.DataFrame(director_binarized),
        pd.DataFrame(cast_binarized),
        pd.DataFrame(listed_in_binarized),
        pd.DataFrame(countries_binarized),
        pd.DataFrame(rating_binarized)],
        axis=1,
        ignore_index=True
    )

    return data_binarized


def idf_vectorize(data):
    """Normalize each feature according to the inverse document frequency."""
    df = data.sum(axis=0)
    idf = np.log(1+len(data) / (1+df)) + 1
    data_vectorized = data * idf.T
    norms = np.linalg.norm(data_vectorized, axis=1)
    data_vectorized = (data_vectorized.T / norms).T

    return data_vectorized


def fit_knn(data):
    knn = NearestNeighbors(
        n_neighbors=11,
        metric="cosine",
        n_jobs=-1
    ).fit(data)

    return knn


In [36]:
def recommend_show(data_reduced, data, query, knn):
    """Compute nearest neighbors and return show/film names from the original dataset."""

    neighbors = knn.kneighbors(data_reduced[data.title == query])
    result = pd.concat([
        pd.DataFrame(
            np.flip(neighbors[0][-1]),
            index=np.flip(neighbors[1][-1]),
            columns=["Score"]),
        data.iloc[np.flip(neighbors[1][-1])][["title", "director", "cast", "country", "listed_in", "type"]]],
        axis=1
    )
    return result.iloc[:10]


In [23]:
data = netflix_total
data_reduced, knn = preprocess_dataset(data)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.director[data.director.isna()] = [""]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.cast[data.cast.isna()] = [""]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.listed_in[data.listed_in.isna()] = [""]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.country[data.country.isna()] = [""]
A va

In [37]:
title = "Event Horizon"
recommend_show(data_reduced, data, title, knn)


Unnamed: 0,Score,title,director,cast,country,listed_in,type
1764,0.880995,Snowden,Oliver Stone,"Joseph Gordon-Levitt,Shailene Woodley,Zachary ...","United Kingdom,France,Germany,United States","Dramas,Thrillers",Movie
6641,0.878894,Dragonheart,Rob Cohen,"Sean Connery,Dennis Quaid,David Thewlis,Pete P...",United States,"Action & Adventure,Sci-Fi & Fantasy",Movie
581,0.876344,Mortal Kombat,Paul W.S. Anderson,"Christophe Lambert,Robin Shou,Linden Ashby,Car...",United States,"Action & Adventure,Sci-Fi & Fantasy",Movie
3517,0.874062,The Crystal Calls Making the Dark Crystal: Age...,Randall Lobb,"Taron Egerton,Natalie Dormer,Simon Pegg,Jason ...",United States,"Documentaries,International Movies",Movie
4626,0.87043,In Darkness,Anthony Byrne,"Natalie Dormer,Ed Skrein,Emily Ratajkowski,Nei...","United Kingdom,United States","Independent Movies,International Movies,Thrillers",Movie
8535,0.868972,The Trigger Effect,David Koepp,"Kyle MacLachlan,Elisabeth Shue,Dermot Mulroney...",United States,Thrillers,Movie
3629,0.85206,Otherhood,,,"United Kingdom,United States",Comedies,Movie
6703,0.83767,Equilibrium,Kurt Wimmer,"Christian Bale,Emily Watson,Taye Diggs,Angus M...",United States,"Action & Adventure,Dramas,Sci-Fi & Fantasy",Movie
6997,0.817919,Horns,Alexandre Aja,"Daniel Radcliffe,Juno Temple,Max Minghella,Joe...","United States,Canada","Dramas,Horror Movies,Sci-Fi & Fantasy",Movie
7857,0.810968,Resident Evil: Afterlife,Paul W.S. Anderson,"Milla Jovovich,Ali Larter,Kim Coates,Shawn Rob...","Germany,France,United States,Canada,United Kin...","Action & Adventure,Horror Movies,Sci-Fi & Fantasy",Movie


### Conclusion
- Works fine for Action/Thriller Films
- Also Comedy?
- TV Shows/Anime seems a bit random (?)
- probably because the style of a Hollywood Blockbuster is more defined by Director/Cast than TV SHows or Anime
- Problem: Missing Data (Directors for TV Shows)

In [25]:
# spr = SparseRandomProjection(
#     n_components=50
# )

# data_reduced = spr.fit_transform(data_reduced)

# mapper = UMAP(
#     n_neighbors=100,
#     metric="correlation",
#     densmap=True
# )

# embedding = mapper.fit_transform(data_reduced)


In [26]:
# %matplotlib widget
# plt.figure(figsize=(6,6))
# sns.scatterplot(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     s=3,
#     # hue=movies.rating

# )


In [27]:
# import plotly.express as px

# px.scatter(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     hover_name=netflix_total.title,
#     # size=[0.01]*len(embedding)
# )
