In [None]:
from matplotlib import pyplot as plt
import os
import json
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import umap.umap_ as umap

from sentence_transformers import SentenceTransformer

%matplotlib ipympl

In [None]:
# Load the per-channel metadata and keep the names of channels whose
# "Country" field is one of the selected countries.
channels_info_path = os.path.join("..", "data", "channels_info.json")
with open(channels_info_path, "r") as info_file:
    channels_info = json.load(info_file)

selected_countries = ("united-states", "united-kingdom", "australia", "netherlands")
channels = [
    channel_name
    for channel_name, channel_meta in channels_info.items()
    if channel_meta["Country"] in selected_countries
]
print(len(channels))

In [None]:
# Load the per-channel video lists and attach normalised view metrics to each
# video dict (in place):
#   views_per_sub            views / channel subscriber count
#   relative_views_30        views / mean views over the channel's listed videos
#   relative_views_to_max_30 views / max views over the channel's listed videos
#   relative_views_to_min_30 views / min views over the channel's listed videos
with open(os.path.join("..", "data", "videos_info.json"), "r") as f:
    videos_info_per_channel = json.load(f)

for channel_name, channel_videos in videos_info_per_channel.items():
    if not channel_videos:
        # Nothing to annotate; also avoids np.max/np.min failing on an empty list.
        continue

    # The channel-level statistics are the same for every video of the channel,
    # so compute them once instead of once per video.
    view_counts = [video["views"] for video in channel_videos]
    mean_views = np.mean(view_counts)
    max_views = np.max(view_counts)
    min_views = np.min(view_counts)
    subscribers = channels_info[channel_name]["Subscribers"]

    for video in channel_videos:
        views = video["views"]
        video["views_per_sub"] = views / subscribers
        video["relative_views_30"] = views / mean_views
        video["relative_views_to_max_30"] = views / max_views
        video["relative_views_to_min_30"] = views / min_views

# Keep only the selected channels that actually have video data. Filtering the
# existing list (rather than going through a set) preserves a deterministic
# order, so positional lookups such as channels[15] are stable across kernels.
channels = [channel_name for channel_name in channels if channel_name in videos_info_per_channel]
videos_info = [video_info for c in channels for video_info in videos_info_per_channel[c]]
titles = [video_info["title"] for video_info in videos_info]
print(len(videos_info))

Encoding video titles

In [None]:
# Load the 'bert-base-nli-mean-tokens' SentenceTransformer model and encode
# every video title with it.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
title_encodings = sbert_model.encode(titles)

# Lookup from title text to its encoding. If the same title occurs more than
# once, the entry from its last occurrence is the one kept.
title_to_vec = dict(zip(titles, title_encodings))

In [None]:
# Sanity check: report the dimensions of the title-encoding array.
encodings_shape = title_encodings.shape
print(encodings_shape)

Dimension reduction

In [None]:
# Project the title encodings down to 2 and 3 dimensions for plotting.
# Each encoding dimension is standardised first (StandardScaler).
unit_title_encodings = StandardScaler().fit_transform(title_encodings)

# "PCA" or "UMAP". This name is also read by the channel-level reduction cell.
reducer_type = "UMAP"

# Fixed seed so that the UMAP layout is repeatable between runs.
umap_seed = 42

if reducer_type == "PCA":
    # XY
    pca = PCA(n_components=2)
    title_XY = pca.fit_transform(unit_title_encodings)
    print(pca.explained_variance_ratio_)
    print(pca.singular_values_)
    # XYZ
    pca = PCA(n_components=3)
    title_XYZ = pca.fit_transform(unit_title_encodings)
    print(pca.explained_variance_ratio_)
    print(pca.singular_values_)
elif reducer_type == "UMAP":
    min_dist, n_neighbors = .0001, 15
    # fit_transform returns the embedding learned during fitting; a separate
    # transform() call on the training data is unnecessary.
    # XY
    reducer = umap.UMAP(min_dist=min_dist, n_components=2, n_neighbors=n_neighbors,
                        random_state=umap_seed, verbose=True)
    title_XY = reducer.fit_transform(unit_title_encodings)
    # XYZ
    reducer = umap.UMAP(min_dist=min_dist, n_components=3, n_neighbors=n_neighbors,
                        random_state=umap_seed, verbose=True)
    title_XYZ = reducer.fit_transform(unit_title_encodings)
else:
    # Fail loudly instead of leaving title_XY / title_XYZ undefined or stale.
    raise ValueError(f"Unknown reducer_type {reducer_type!r}; expected 'PCA' or 'UMAP'")

# Lookups from title text to its 2D / 3D coordinates. Duplicate titles keep
# the coordinates of their last occurrence.
title_to_xy = dict(zip(titles, title_XY))
title_to_xyz = dict(zip(titles, title_XYZ))

Plotting the title embeddings

In [None]:
# 2D scatter of the reduced title encodings.
# Colour is log(views); marker size is sqrt(views) / 100.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

xy_per_title = [title_to_xy[title] for title in titles]
X = [point[0] for point in xy_per_title]
Y = [point[1] for point in xy_per_title]

view_counts = [video["views"] for video in videos_info]
c = [np.log(views) for views in view_counts]
s = [np.sqrt(views) / 100 for views in view_counts]

# Alternative metrics stored on each video dict that can be plotted instead of
# "views" (with the size multiplier previously used for each):
#   "relative_views_30"        -> sqrt(value) * 50
#   "relative_views_to_max_30" -> sqrt(value) * 100
#   "relative_views_to_min_30" -> sqrt(value) * 10
#   "views_per_sub"            -> sqrt(value) * 100

# Indices of the points to annotate with their title. A sample size of 0
# means no labels are drawn; raise it (or use np.arange(30)) to label points.
indices = np.random.randint(0, len(X), 0)
for label_idx in indices:
    ax.text(X[label_idx], Y[label_idx], titles[label_idx], fontsize=7)

# Optional colour limits tried previously: vmin/vmax of (-1.5, 1.5), (-2, 0), (0, 3).
mappable = ax.scatter(X, Y, c=c, s=s, alpha=.5)
ax.axis("Off")

# plt.colorbar(mappable)

In [None]:
# 3D scatter of the reduced title encodings.
# Colour is log(views); marker size is sqrt(views) / 100.
fig, ax = plt.subplots(1, 1, figsize=(5, 5), subplot_kw={"projection": "3d"})

xyz_per_title = [title_to_xyz[title] for title in titles]
X = np.array([point[0] for point in xyz_per_title])
Y = np.array([point[1] for point in xyz_per_title])
Z = np.array([point[2] for point in xyz_per_title])

view_counts = np.array([video["views"] for video in videos_info])
c = np.log(view_counts)
s = np.sqrt(view_counts) / 100

# Subset of points to draw. All points are selected here; swap in e.g.
# np.random.randint(0, len(X), 20) to plot a random sample instead.
indices = np.arange(len(X))

ax.scatter(X[indices], Y[indices], Z[indices], c=c[indices], s=s[indices], alpha=.5)

Channel representation

In [None]:
# Represent each channel by the element-wise mean of the encodings of its
# video titles.
channel_encodings = []
for channel_name in channels:
    title_vectors = [title_to_vec[video["title"]] for video in videos_info_per_channel[channel_name]]
    channel_encodings.append(np.mean(title_vectors, axis=0))

# Lookup from channel name to its mean title encoding.
channel_to_vec = dict(zip(channels, channel_encodings))

In [None]:
# Project the channel encodings down to 2 and 3 dimensions for plotting.
# Each encoding dimension is standardised first (StandardScaler).
# `reducer_type` is read from the title-level reduction cell.
unit_channel_encodings = StandardScaler().fit_transform(channel_encodings)

if reducer_type == "PCA":
    # XY
    pca = PCA(n_components=2)
    channel_XY = pca.fit_transform(unit_channel_encodings)
    print(pca.explained_variance_ratio_)
    print(pca.singular_values_)
    # XYZ
    pca = PCA(n_components=3)
    channel_XYZ = pca.fit_transform(unit_channel_encodings)
    print(pca.explained_variance_ratio_)
    print(pca.singular_values_)
elif reducer_type == "UMAP":
    min_dist, n_neighbors = .01, 7
    # random_state=42 fixes the seed so the layout is repeatable between runs.
    # fit_transform returns the embedding learned during fitting; a separate
    # transform() call on the training data is unnecessary.
    # XY
    reducer = umap.UMAP(min_dist=min_dist, n_components=2, n_neighbors=n_neighbors,
                        random_state=42, verbose=True)
    channel_XY = reducer.fit_transform(unit_channel_encodings)
    # XYZ
    reducer = umap.UMAP(min_dist=min_dist, n_components=3, n_neighbors=n_neighbors,
                        random_state=42, verbose=True)
    channel_XYZ = reducer.fit_transform(unit_channel_encodings)
else:
    # Fail loudly instead of leaving channel_XY / channel_XYZ undefined or stale.
    raise ValueError(f"Unknown reducer_type {reducer_type!r}; expected 'PCA' or 'UMAP'")

# Lookups from channel name to its 2D / 3D coordinates.
channel_to_xy = dict(zip(channel_to_vec.keys(), channel_XY))
channel_to_xyz = dict(zip(channel_to_vec.keys(), channel_XYZ))

In [None]:
# 2D scatter of the reduced channel encodings, one labelled point per channel.
# Colour is log(average views); marker size is sqrt(average views) / 5.
# NOTE: `c` and `s` are left in the namespace on purpose; the 3D channel plot
# cell that follows reuses them.
plt.close("all")
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# One name list drives coordinates, labels, colours and sizes so they cannot
# drift out of alignment.
channel_names = list(channel_to_vec)

X = [channel_to_xy[name][0] for name in channel_names]
Y = [channel_to_xy[name][1] for name in channel_names]

for label, label_x, label_y in zip(channel_names, X, Y):
    ax.text(label_x, label_y, label, fontsize=7)

# Average views over each channel's listed videos.
avg_views = {
    channel_name: np.mean([vid["views"] for vid in videos_info_per_channel[channel_name]])
    for channel_name in channels
}
c = [np.log(avg_views[name]) for name in channel_names]
s = [np.sqrt(avg_views[name]) / 5 for name in channel_names]

# Alternatives previously tried for colour / size:
#   subscribers:       log(channels_info[name]["Subscribers"]), Subscribers / 50000
#   avg views per sub: log(avg_views / Subscribers), sqrt(avg_views / Subscribers) * 1000

ax.scatter(X, Y, c=c, s=s, alpha=.5)

In [None]:
# 3D scatter of the reduced channel encodings, one labelled point per channel.
# The colours `c` and sizes `s` are NOT computed here: they are whatever the
# preceding 2D channel plot cell left in the namespace.
plt.close("all")
fig, ax = plt.subplots(1, 1, figsize=(6, 6), subplot_kw={"projection": "3d"})

channel_names = list(channel_to_vec)
X = [channel_to_xyz[channel_name][0] for channel_name in channel_names]
Y = [channel_to_xyz[channel_name][1] for channel_name in channel_names]
Z = [channel_to_xyz[channel_name][2] for channel_name in channel_names]

for label, label_x, label_y, label_z in zip(channel_names, X, Y, Z):
    ax.text(label_x, label_y, label_z, label, fontsize=7)

ax.scatter(X, Y, Z, c=c, s=s, alpha=.5)

In [None]:
# Rank every video title (across all selected channels) by the squared
# Euclidean distance between its encoding and the chosen channel's mean
# encoding, closest first.
name = channels[15]
print("Channel name: ", name)

offsets = title_encodings - channel_to_vec[name]
difference = np.square(offsets).sum(-1)

print("TITLES SORTED BY REPRESENTATIVENESS:")
ranking = np.argsort(difference)
for title_idx in ranking:
    print(titles[title_idx])