In [128]:
import pandas as pd
import numpy as np
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.random_projection import SparseRandomProjection
from sklearn.compose import ColumnTransformer

sns.set_theme(context="notebook", style="ticks")


### Basic idea
|   |   |   |
|---|---|---|
|   |   |   |
|   |   |   |

Load file

In [129]:
netflix_total = pd.read_csv("netflix_titles.csv")
netflix_total.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


Remove spaces

In [130]:
netflix_total.cast = netflix_total.cast.str.replace(", ", ",")
netflix_total.director = netflix_total.director.str.replace(", ", ",")
netflix_total.listed_in = netflix_total.listed_in.str.replace(", ", ",")
netflix_total.country = netflix_total.country.str.replace(", ", ",")


Split in TV Shows and Movies

In [131]:
tv = netflix_total[netflix_total.type == "TV Show"]
movies = netflix_total[netflix_total.type == "Movie"]

netflix_total.reset_index(inplace=True)
movies.reset_index(inplace=True)
tv.reset_index(inplace=True)


In [132]:
def preprocess_dataset(data):
    """Main preprocessing steps."""
    data = binarize_categories(data)
    data = idf_vectorize(data)
    knn = fit_knn(data)
    return data, knn


def binarize_categories(data):
    """Convert categorical values in binary feature vectors."""
    oh_enc = OneHotEncoder(sparse=False, dtype="int")
    data_binarized = oh_enc.fit_transform(
        data[["director", "cast", "listed_in", "country", "rating"]])

    return data_binarized


def idf_vectorize(data):
    """Normalize each feature according to the inverse document frequency."""
    df = data.sum(axis=0)
    idf = np.log((1+len(data)) / (1+df)) + 1
    data_vectorized = data * idf.T
    return data_vectorized


def fit_knn(data):
    """Create knn instance and fit on films."""
    knn = NearestNeighbors(
        n_neighbors=11,
        metric="cosine",
        n_jobs=-1
    )

    return knn.fit(data)


In [133]:
def recommend_show(data_reduced, data, query, knn):
    """Compute nearest neighbors and return show/film names from the original dataset."""

    neighbors = knn.kneighbors(data_reduced[data.title == query])
    result = pd.concat([
        pd.DataFrame(
            np.flip(neighbors[0][-1]),
            index=np.flip(neighbors[1][-1]),
            columns=["Score"]),
        data.iloc[np.flip(neighbors[1][-1])][["title", "director", "cast", "country", "listed_in", "type"]]],
        axis=1
    )
    return result.iloc[:10]


In [134]:
data = netflix_total
data_reduced, knn = preprocess_dataset(data)


In [139]:
title = "Inception"
recommend_show(data_reduced, data, title, knn)


Unnamed: 0,Score,title,director,cast,country,listed_in,type
724,0.825706,The Reason I Jump,Jerry Rothwell,,"United States,United Kingdom",Documentaries,Movie
7571,0.824135,Nature's Great Events (2009),,David Attenborough,"United States,United Kingdom","British TV Shows,Docuseries,Science & Nature TV",TV Show
7572,0.824135,Nature's Great Events: Diaries,,David Attenborough,"United States,United Kingdom","British TV Shows,Docuseries,Science & Nature TV",TV Show
1251,0.823413,Dances with Wolves,Kevin Costner,"Kevin Costner,Mary McDonnell,Rodney Grant,Grah...","United States,United Kingdom","Action & Adventure,Classic Movies,Dramas",Movie
3938,0.820929,Our Planet,,David Attenborough,"United States,United Kingdom","Docuseries,Science & Nature TV",TV Show
752,0.820009,Vampire Academy,Mark Waters,"Zoey Deutch,Lucy Fry,Danila Kozlovsky,Gabriel ...","United States,United Kingdom","Action & Adventure,Comedies,Sci-Fi & Fantasy",Movie
1916,0.814076,Rize,David LaChapelle,"Tommy the Clown,Tight Eyez","United States,United Kingdom","Documentaries,Music & Musicals",Movie
327,0.810673,Beowulf,Robert Zemeckis,"Ray Winstone,Anthony Hopkins,John Malkovich,An...","United States,United Kingdom","Action & Adventure,Sci-Fi & Fantasy",Movie
1144,0.809569,Yes Man,Peyton Reed,"Jim Carrey,Zooey Deschanel,Bradley Cooper,John...","United States,United Kingdom","Comedies,Romantic Movies",Movie
8101,0.809569,Stranger than Fiction,Marc Forster,"Will Ferrell,Maggie Gyllenhaal,Dustin Hoffman,...","United States,United Kingdom","Comedies,Romantic Movies",Movie


### Conclusion
- Works fine for Action/Thriller Films
- Also Comedy?
- TV Shows/Anime seems a bit random (?)
- probably because the style of a Hollywood Blockbuster is more defined by Director/Cast than TV SHows or Anime
- Problem: Missing Data (Directors for TV Shows)

In [136]:
# spr = SparseRandomProjection(
#     n_components=50
# )

# data_reduced = spr.fit_transform(data_reduced)

# mapper = UMAP(
#     n_neighbors=100,
#     metric="correlation",
#     densmap=True
# )

# embedding = mapper.fit_transform(data_reduced)


In [137]:
# %matplotlib widget
# plt.figure(figsize=(6,6))
# sns.scatterplot(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     s=3,
#     # hue=movies.rating

# )


In [138]:
# import plotly.express as px

# px.scatter(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     hover_name=netflix_total.title,
#     # size=[0.01]*len(embedding)
# )
