In [39]:
import pandas as pd
import numpy as np
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.random_projection import SparseRandomProjection

# Global plot styling: seaborn "ticks" style with the "notebook" scaling context.
sns.set_theme(context="notebook", style="ticks")


### Basic idea
|   |   |   |
|---|---|---|
|   |   |   |
|   |   |   |

Load file

In [40]:
# Read the raw catalogue; the relative path means the CSV must sit in the
# notebook's working directory.
netflix_total = pd.read_csv("netflix_titles.csv")


Remove the space after each comma in the multi-valued columns, so the values can later be split on `,`

In [41]:
# Turn ", " separators into "," so every multi-valued column splits cleanly on ",".
for column_name in ("cast", "director", "listed_in", "country"):
    netflix_total[column_name] = netflix_total[column_name].str.replace(", ", ",")


Split into TV shows and movies

In [42]:
# Split the catalogue by title type. Chaining a non-inplace reset_index() onto each
# selection gives both subsets a fresh 0..n-1 index (their original row label is
# kept in an "index" column) without mutating a slice of `netflix_total` in place.
tv = netflix_total[netflix_total["type"] == "TV Show"].reset_index()
movies = netflix_total[netflix_total["type"] == "Movie"].reset_index()

# Renumber the full frame without inserting its old index as a column, so that
# re-running this cell neither fails nor accumulates "index"/"level_0" columns.
netflix_total = netflix_total.reset_index(drop=True)


In [60]:
def preprocess_dataset(data):
    """Build the feature matrix for `data` and fit the neighbour index on it.

    The stages run in order: binary encoding of the categorical columns,
    IDF weighting with row normalisation, then nearest-neighbour fitting.

    Returns
    -------
    tuple
        ``(features, knn)`` -- the weighted feature matrix and the model
        fitted on it.
    """
    features = idf_vectorize(binarize_categories(data))
    return features, fit_knn(features)


def binarize_categories(data):
    """Convert the categorical columns into one binary feature matrix.

    The columns ``director``, ``cast``, ``listed_in``, ``country`` and
    ``rating`` are each split on "," and encoded with a
    ``MultiLabelBinarizer``; the per-column blocks are concatenated side by
    side in that order.

    Missing values are replaced by an empty string before splitting, so they
    are encoded as their own (empty) label rather than being dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the five columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per row of `data` (with a fresh 0..n-1 index) and one
        integer-numbered column per distinct label.
    """
    multi_valued_columns = ["director", "cast", "listed_in", "country", "rating"]

    encoded_blocks = []
    for column in multi_valued_columns:
        # fillna() without inplace works on a copy of the column: the caller's frame
        # is left untouched, and the result does not depend on chained in-place
        # assignment (which pandas does not propagate under Copy-on-Write).
        labels = data[column].fillna("").str.split(",")
        # A fresh binarizer per column keeps each label vocabulary independent.
        encoded_blocks.append(pd.DataFrame(MultiLabelBinarizer().fit_transform(labels)))

    # ignore_index renumbers the columns 0..N-1 across all blocks.
    return pd.concat(encoded_blocks, axis=1, ignore_index=True)


def idf_vectorize(data):
    """Weight each feature by its inverse document frequency and L2-normalise rows.

    For a feature present in ``df`` of the ``n`` rows the weight is
    ``log((1 + n) / (1 + df)) + 1``. After weighting, every row is divided by
    its Euclidean norm.

    Parameters
    ----------
    data : pandas.DataFrame
        Binary feature matrix, one row per item and one column per feature.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as `data`. Rows that have at
        least one active feature have unit length; rows with no active
        feature stay all-zero.
    """
    document_frequency = data.sum(axis=0)
    idf = np.log((1 + len(data)) / (1 + document_frequency)) + 1
    weighted = data * idf

    row_norms = np.linalg.norm(weighted, axis=1)
    # A row without any active feature has norm 0; dividing by it would turn the
    # whole row into NaN. Divide those rows by 1 instead so they remain zero vectors.
    safe_norms = np.where(row_norms == 0, 1.0, row_norms)

    return weighted.div(safe_norms, axis=0)


def fit_knn(data, n_neighbors=11, metric="cosine"):
    """Fit a nearest-neighbour model on the feature matrix.

    Parameters
    ----------
    data : array-like
        Feature matrix, one row per item.
    n_neighbors : int, default 11
        Number of neighbours returned per query by default.
    metric : str, default "cosine"
        Distance metric handed to ``NearestNeighbors``.

    Returns
    -------
    sklearn.neighbors.NearestNeighbors
        The fitted model.
    """
    knn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, n_jobs=-1)
    return knn.fit(data)


In [62]:
def recommend_show(data_reduced, data, query, knn, n_recommendations=10):
    """Return the titles closest to `query`, with their metadata from `data`.

    Parameters
    ----------
    data_reduced : pandas.DataFrame or numpy.ndarray
        Feature matrix queried against `knn`; row ``i`` must describe
        ``data.iloc[i]``.
    data : pandas.DataFrame
        Original dataset with at least the columns ``title``, ``director``,
        ``cast``, ``country``, ``listed_in`` and ``type``.
    query : str
        Exact title to look up. If several rows carry this title, the last
        one is used.
    knn : object
        Fitted neighbour model exposing ``kneighbors(X)`` that returns a
        ``(distances, indices)`` pair with the nearest neighbour first.
    n_recommendations : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Column ``Score`` followed by the metadata columns. ``Score`` is the
        distance reported by `knn` (smaller means more similar); rows are
        ordered from least to most similar.

    Raises
    ------
    ValueError
        If no row of `data` has a title equal to `query`.
    """
    display_columns = ["title", "director", "cast", "country", "listed_in", "type"]

    is_query = (data["title"] == query).to_numpy()
    if not is_query.any():
        raise ValueError(f"No title equal to {query!r} found in the dataset.")
    query_position = np.flatnonzero(is_query)[-1]

    distances, positions = knn.kneighbors(data_reduced[is_query])
    distances, positions = distances[-1], positions[-1]

    # Remove the query itself by position rather than assuming it is the nearest
    # hit: an item with identical features is also at distance 0 and may be
    # reported ahead of the query.
    is_other_title = positions != query_position
    distances = distances[is_other_title][:n_recommendations]
    positions = positions[is_other_title][:n_recommendations]

    # Present the hits from least to most similar.
    distances, positions = np.flip(distances), np.flip(positions)

    # Rows are selected positionally, so this also works when `data` does not
    # carry a 0..n-1 index.
    result = data.iloc[positions][display_columns].copy()
    result.insert(0, "Score", distances)
    return result


In [61]:
# Use the full catalogue (TV shows and movies together) as the recommendation corpus.
data = netflix_total
# `data_reduced` receives the weighted feature matrix and `knn` the model fitted on it.
data_reduced, knn = preprocess_dataset(data)


In [66]:
# Example query: show the recommendations for one title.
title = "Casino Royale"
recommend_show(data_reduced, data, title, knn)


Unnamed: 0,Score,title,director,cast,country,listed_in,type
8413,0.84752,The Mask of Zorro,Martin Campbell,"Antonio Banderas,Anthony Hopkins,Catherine Zet...","United States,Germany,Mexico","Action & Adventure,Romantic Movies",Movie
4530,0.845992,Empire Games,,Jim O'Brien,"Czech Republic,United States",Docuseries,TV Show
880,0.843424,Haunted,,,"United States,Czech Republic","Reality TV,TV Horror,TV Thrillers",TV Show
4170,0.821551,Polar,,,"United States,Germany","Action & Adventure,International Movies",Movie
8496,0.814111,The Salvation,Kristian Levring,"Mads Mikkelsen,Eva Green,Jeffrey Dean Morgan,E...","Denmark,United Kingdom,South Africa,Sweden,Bel...","Dramas,Independent Movies,International Movies",Movie
4476,0.812149,Terrorism Close Calls,,,"United States,Czech Republic","Crime TV Shows,Docuseries,International TV Shows",TV Show
4673,0.812149,Inside the Criminal Mind,,,"United States,Czech Republic","Crime TV Shows,Docuseries,International TV Shows",TV Show
6866,0.809614,GoldenEye,Martin Campbell,"Pierce Brosnan,Sean Bean,Izabella Scorupco,Fam...","United Kingdom,United States",Action & Adventure,Movie
8604,0.803486,Top 10 Secrets and Mysteries,,,"United Kingdom,United States,Czech Republic","British TV Shows,Docuseries,Science & Nature TV",TV Show
7809,0.588926,Quantum of Solace,Marc Forster,"Daniel Craig,Olga Kurylenko,Mathieu Amalric,Ju...","United Kingdom,United States",Action & Adventure,Movie


### Conclusion
- Works fine for Action/Thriller Films
- Also Comedy?
- TV shows/anime seem a bit random (?)
- probably because the style of a Hollywood blockbuster is defined more by its director/cast than that of a TV show or anime
- Problem: Missing Data (Directors for TV Shows)

In [47]:
# spr = SparseRandomProjection(
#     n_components=50
# )

# data_reduced = spr.fit_transform(data_reduced)

# mapper = UMAP(
#     n_neighbors=100,
#     metric="correlation",
#     densmap=True
# )

# embedding = mapper.fit_transform(data_reduced)


In [48]:
# %matplotlib widget
# plt.figure(figsize=(6,6))
# sns.scatterplot(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     s=3,
#     # hue=movies.rating

# )


In [49]:
# import plotly.express as px

# px.scatter(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     hover_name=netflix_total.title,
#     # size=[0.01]*len(embedding)
# )
