In [281]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from umap import UMAP

sns.set_theme(context="notebook", style="ticks")

### Basic idea

|     |     |     |
| --- | --- | --- |
|     |     |     |
|     |     |     |


Load file


In [282]:
netflix_total = pd.read_csv("netflix_titles.csv")
netflix_total.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


Remove spaces


In [283]:
netflix_total.cast = netflix_total.cast.str.replace(", ", ",")
netflix_total.director = netflix_total.director.str.replace(", ", ",")
netflix_total.listed_in = netflix_total.listed_in.str.replace(", ", ",")
netflix_total.country = netflix_total.country.str.replace(", ", ",")

Split in TV Shows and Movies


In [284]:
tv = netflix_total[netflix_total.type == "TV Show"]
movies = netflix_total[netflix_total.type == "Movie"]

netflix_total.reset_index(inplace=True)
movies.reset_index(inplace=True)
tv.reset_index(inplace=True)

In [285]:
def preprocess_dataset(data):
    """Main preprocessing steps."""
    data, feature_names = binarize_categories(data)
    data = idf_vectorize(data)
    knn = fit_knn(data)
    return data, knn, feature_names


def binarize_categories(data):
    """Convert categorical values in binary feature vectors."""
    mlb = MultiLabelBinarizer()
    for col in ["director", "cast", "listed_in", "country", "rating"]:
        data[col] = data[col].str.split(",")

    col_transformer = ColumnTransformer(
        [
            ("director", mlb, ["director"]),
            ("cast", mlb, ["cast"]),
            ("listed_in", mlb, ["listed_in"]),
            ("country", mlb, ["country"]),
            ("rating", mlb, ["rating"]),
        ]
    )

    data_binarized = col_transformer.fit_transform(X=data)
    # feature_names = mlb.get_feature_names_out()

    return data_binarized, feature_names


def idf_vectorize(data):
    """Normalize each feature according to the inverse document frequency."""
    df = data.sum(axis=0)
    idf = np.log((1 + len(data)) / (1 + df)) + 1
    data_vectorized = data * idf.T
    return data_vectorized


def fit_knn(data):
    """Create knn instance and fit on films."""
    knn = NearestNeighbors(n_neighbors=11, metric="cosine", n_jobs=-1)

    return knn.fit(data)

In [286]:
def recommend_show(data_reduced, data, query, knn):
    """Compute nearest neighbors and return show/film names from the original dataset."""

    neighbors = knn.kneighbors(data_reduced[data.title == query])
    result = pd.concat(
        [
            pd.DataFrame(
                np.flip(neighbors[0][-1]),
                index=np.flip(neighbors[1][-1]),
                columns=["Score"],
            ),
            data.iloc[np.flip(neighbors[1][-1])][
                ["title", "director", "cast", "country", "listed_in", "type"]
            ],
        ],
        axis=1,
    )
    return result.iloc[:10]

In [287]:
mlb = MultiLabelBinarizer()
for col in ["director", "cast", "listed_in", "country", "rating"]:
    netflix_total[col] = netflix_total[col].str.split(",")

netflix_total.fillna("")

col_transformer = ColumnTransformer(
    [
        ("cast", mlb, ["cast"]),
        # ("cast", mlb, ["cast"]),
        # ("listed_in", mlb, ["listed_in"]),
        # ("country", mlb, ["country"]),
        # ("rating", mlb, ["rating"]),
    ]
)

In [288]:
col_transformer

ColumnTransformer(transformers=[('cast', MultiLabelBinarizer(), ['cast'])])

In [289]:
col_transformer.fit_transform(netflix_total)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [None]:
data = netflix_total
data_reduced, knn, feature_names = preprocess_dataset(data)

TypeError: fit_transform() takes 2 positional arguments but 3 were given

In [None]:
data_reduced

array([[0.        , 0.        , 1.40546511, 1.69314718, 1.69314718,
        0.        , 1.40546511, 0.        , 0.        , 1.69314718,
        1.40546511, 0.        , 1.        , 0.        , 0.        ],
       [0.        , 1.69314718, 1.40546511, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.69314718, 1.        , 0.        , 0.        ],
       [2.09861229, 0.        , 0.        , 1.69314718, 1.69314718,
        0.        , 1.40546511, 2.09861229, 1.40546511, 0.        ,
        0.        , 1.69314718, 1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.40546511, 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.40546511, 1.69314718,
        1.40546511, 0.        , 1.        , 2.09861229, 2.09861229],
       [0.        , 1.69314718, 0.        , 0.        , 0.        ,
        2.09861229, 1.40546511, 0.        , 1.40546511, 0.        ,
        1.40546511, 0.        , 1.        , 

In [None]:
title = "Inception"
recommend_show(data_reduced, data, title, knn)

IndexError: boolean index did not match indexed array along dimension 0; dimension is 5 but corresponding boolean dimension is 8807

### Conclusion

- Works fine for Action/Thriller Films
- Also Comedy?
- TV Shows/Anime seems a bit random (?)
- probably because the style of a Hollywood Blockbuster is more defined by Director/Cast than TV SHows or Anime
- Problem: Missing Data (Directors for TV Shows)


In [None]:
# spr = SparseRandomProjection(
#     n_components=50
# )

# data_reduced = spr.fit_transform(data_reduced)

# mapper = UMAP(
#     n_neighbors=100,
#     metric="correlation",
#     densmap=True
# )

# embedding = mapper.fit_transform(data_reduced)

In [None]:
# %matplotlib widget
# plt.figure(figsize=(6,6))
# sns.scatterplot(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     s=3,
#     # hue=movies.rating

# )

In [None]:
# import plotly.express as px

# px.scatter(
#     x=embedding[:,0],
#     y=embedding[:,1],
#     hover_name=netflix_total.title,
#     # size=[0.01]*len(embedding)
# )