In [2]:
import pandas as pd
import numpy as np
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.random_projection import SparseRandomProjection

sns.set_theme(context="notebook", style="ticks")


### Basic idea
|   |   |   |
|---|---|---|
|   |   |   |
|   |   |   |

Load file

In [3]:
netflix_total = pd.read_csv("netflix_titles.csv")


Remove spaces

In [4]:
netflix_total.cast = netflix_total.cast.str.replace(", ", ",")
netflix_total.director = netflix_total.director.str.replace(", ", ",")
netflix_total.listed_in = netflix_total.listed_in.str.replace(", ", ",")
netflix_total.country = netflix_total.country.str.replace(", ", ",")


Split in TV Shows and Movies

In [5]:
tv = netflix_total[netflix_total.type == "TV Show"]
movies = netflix_total[netflix_total.type == "Movie"]

netflix_total.reset_index(inplace=True)
movies.reset_index(inplace=True)
tv.reset_index(inplace=True)

In [90]:
def preprocess_dataset(data):
    """Main preprocessing steps."""
    data = binarize_categories(data)
    data = idf_vectorize(data)
    knn = fit_knn(data)
    return data, knn


def binarize_categories(data):
    """Convert categorical values in binary feature vectors."""
    data.director[data.director.isna()] = [""]
    data.cast[data.cast.isna()] = [""]
    data.listed_in[data.listed_in.isna()] = [""]
    data.country[data.country.isna()] = [""]
    data.rating[data.rating.isna()] = [""]

    mlb = MultiLabelBinarizer()
    director_binarized = mlb.fit_transform(data.director.str.split(","))
    cast_binarized = mlb.fit_transform(data.cast.str.split(","))
    listed_in_binarized = mlb.fit_transform(data.listed_in.str.split(","))
    countries_binarized = mlb.fit_transform(data.country.str.split(","))
    rating_binarized = mlb.fit_transform(data.rating.str.split(","))

    data_binarized = pd.concat([
        pd.DataFrame(director_binarized),
        pd.DataFrame(cast_binarized),
        pd.DataFrame(listed_in_binarized),
        pd.DataFrame(countries_binarized),
        pd.DataFrame(rating_binarized)],
        axis=1,
        ignore_index=True
    )

    return data_binarized


def idf_vectorize(data):
    """Normalize each feature according to the inverse document frequency."""
    df = data.sum(axis=0)
    idf = np.log(1+len(data) / 1+df) + 1
    data_vectorized = data * idf.T
    norms = np.linalg.norm(data_vectorized, axis=1)
    data_vectorized = (data_vectorized.T / norms).T

    return data_vectorized

def fit_knn(data):
    knn = NearestNeighbors(
        n_neighbors=11,
        metric="correlation"
    ).fit(data)

    return knn


In [103]:
def recommend_show(data_reduced, data, query, knn):
    """Compute nearest neighbors in low dimensional space and return show/film names from the original dataset."""

    neighbors = knn.kneighbors(data_reduced[data.title == query])
    result = pd.concat([
        pd.DataFrame(
            np.flip(neighbors[0][-1]), 
            index=np.flip(neighbors[1][-1]), 
            columns=["Score"]),
        data.iloc[np.flip(neighbors[1][-1])][["title", "director", "cast", "country", "listed_in", "type"]]],
        axis=1
    )
    # ).iloc[1:10]
    # print(result)
    return result


In [92]:
data = netflix_total
data_reduced, knn = preprocess_dataset(data)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.director[data.director.isna()] = [""]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.cast[data.cast.isna()] = [""]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.listed_in[data.listed_in.isna()] = [""]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.country[data.country.isna()] = [""]
A va

(18,)


In [109]:
title = "Rocky"
recommend_show(data_reduced, data, title, knn)


Unnamed: 0,Score,title,director,cast,country,listed_in,type
7530,0.706968,Mutiny on the Bounty,"Lewis Milestone,Carol Reed","Marlon Brando,Trevor Howard,Richard Harris,Hug...",United States,"Action & Adventure,Classic Movies,Dramas",Movie
602,0.681166,The Karate Kid Part II,John G. Avildsen,"Ralph Macchio,Pat Morita,Nobu McCarthy,Tamlyn ...",United States,"Action & Adventure,Children & Family Movies,Dr...",Movie
1669,0.676125,If Anything Happens I Love You,"Will McCormack,Michael Govier",,United States,Dramas,Movie
41,0.671155,Jaws,Steven Spielberg,"Roy Scheider,Robert Shaw,Richard Dreyfuss,Lorr...",United States,"Action & Adventure,Classic Movies,Dramas",Movie
6496,0.668847,Cleopatra Jones,Jack Starrett,"Tamara Dobson,Bernie Casey,BrendaSykes,Esther ...",United States,"Action & Adventure,Classic Movies",Movie
601,0.651539,The Karate Kid,John G. Avildsen,"Ralph Macchio,Pat Morita,Elisabeth Shue,Martin...",United States,"Action & Adventure,Children & Family Movies,Cl...",Movie
7883,0.495408,Rocky V,John G. Avildsen,"Sylvester Stallone,Talia Shire,Burt Young,Rich...",United States,"Dramas,Sports Movies",Movie
7882,0.49488,Rocky IV,Sylvester Stallone,"Sylvester Stallone,Talia Shire,Burt Young,Carl...",United States,"Dramas,Sports Movies",Movie
7881,0.44472,Rocky III,Sylvester Stallone,"Sylvester Stallone,Talia Shire,Burt Young,Carl...",United States,"Dramas,Sports Movies",Movie
7880,0.397322,Rocky II,Sylvester Stallone,"Sylvester Stallone,Talia Shire,Burt Young,Carl...",United States,"Dramas,Sports Movies",Movie


### Conclusion
- Works fine for Action/Thriller Films
- Also Comedy?
- TV SHows/Anime seems a bit random (?)
- probably because the style of a Hollywood Blockbuster is more defined by Director/Cast than TV SHows or Anime
- Problem: Missing Data (Directors for TV Shows)

In [11]:
# spr = SparseRandomProjection(
#     n_components=50
# )

# data_reduced = spr.fit_transform(data_reduced)

# mapper = UMAP(
#     n_neighbors=100,
#     metric="correlation",
#     densmap=True
# )

# embedding = mapper.fit_transform(data_reduced)

In [12]:
# %matplotlib widget
# plt.figure(figsize=(6,6))
# sns.scatterplot(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     s=3,
#     # hue=movies.rating

# )


In [13]:
# import plotly.express as px

# px.scatter(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     hover_name=netflix_total.title,
#     # size=[0.01]*len(embedding)
# )