In [None]:
from matplotlib import pyplot as plt
import os
import json
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sentence_transformers import SentenceTransformer

%matplotlib ipympl

In [None]:
# Load the video metadata: a JSON mapping whose values are lists of video
# records (each record has at least a "title" key).
# The encoding is passed explicitly because JSON text is UTF-8 by specification,
# while open() otherwise falls back to the platform's locale encoding and can
# fail on non-ASCII titles.
with open(os.path.join("..", "data", "videos_info.json"), "r", encoding="utf-8") as f:
    videos_info_per_channel = json.load(f)

# Flatten the per-channel lists into one list of video records.
videos_info = [video_info for vids_info in videos_info_per_channel.values() for video_info in vids_info]
# Titles, in the same order as `videos_info`.
titles = [video_info["title"] for video_info in videos_info]

In [None]:
# Sanity check: total number of video records after flattening.
print(f"{len(videos_info)}")

Encoding video titles

In [None]:
# Embed every video title with the "bert-base-nli-mean-tokens" SentenceTransformer model.
sbert_model = SentenceTransformer("bert-base-nli-mean-tokens")
title_encodings = sbert_model.encode(titles)

# Title -> embedding lookup. Assumes `encode` returns one row per input title,
# in input order. If a title occurs more than once, the last occurrence wins.
title_to_vec = dict(zip(titles, title_encodings))

In [None]:
# Sanity check: shape of the title embedding array.
print(f"{title_encodings.shape}")

Dimension reduction

In [None]:
# Standardise the title embeddings feature-wise before running PCA.
# Alternative input (not enabled): append the view count as an extra feature column:
# np.concatenate((title_encodings, np.array([video_info["views"] for video_info in videos_info]).reshape(-1,1)), axis=-1)
unit_title_encodings = StandardScaler().fit(title_encodings).transform(title_encodings)

# XY: 2-component projection, with its explained-variance ratios and singular values.
pca = PCA(n_components=2)
title_XY = pca.fit_transform(unit_title_encodings)
print(pca.explained_variance_ratio_, pca.singular_values_, sep="\n")
title_to_xy = dict(zip(titles, title_XY))

# XYZ: 3-component projection. `pca` stays bound to this 3-component model afterwards.
pca = PCA(n_components=3)
title_XYZ = pca.fit_transform(unit_title_encodings)
print(pca.explained_variance_ratio_, pca.singular_values_, sep="\n")
title_to_xyz = dict(zip(titles, title_XYZ))

Plotting

In [None]:
# 2-D scatter of the title projections: colour = log(views), marker size = sqrt(views)/100.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4))

# Coordinates are looked up by title so they follow the order of `titles`.
X, Y = [], []
for title in titles:
    point = title_to_xy[title]
    X.append(point[0])
    Y.append(point[1])

# Per-video colour and size values, in the order of `videos_info`.
c, s = [], []
for dic in videos_info:
    c.append(np.log(dic["views"]))
    s.append(np.sqrt(dic["views"]) / 100)
# s = 50
ax.scatter(X, Y, c=c, s=s, alpha=0.5)
ax.axis("Off")

In [None]:
# 3-D scatter of a labelled subset of the title projections:
# colour = log(views), marker size = sqrt(views)/100.
fig, ax = plt.subplots(1, 1, figsize=(5, 5), subplot_kw=dict(projection='3d'))

# Coordinates are looked up by title so they follow the order of `titles`.
X = np.array([title_to_xyz[title][0] for title in titles])
Y = np.array([title_to_xyz[title][1] for title in titles])
Z = np.array([title_to_xyz[title][2] for title in titles])
# Z = np.array([np.log(dic["views"]) for dic in videos_info]
c = np.array([np.log(dic["views"]) for dic in videos_info])
s = np.array([np.sqrt(dic["views"])/100 for dic in videos_info])
# s = 10
# c = [dic["views"] for dic in videos_info]

# Show only the first 30 videos so the text labels stay readable. The count is
# clamped to the number of titles so a smaller dataset does not raise IndexError.
# For a random subset instead, use e.g. np.random.randint(0, len(X), 20).
indices = np.arange(min(30, len(titles)))

for i in indices:
    # `titles` is already a list, so index it directly instead of copying it per label.
    ax.text(X[i], Y[i], Z[i], titles[i], fontsize=7)

print(indices)

ax.scatter(X[indices], Y[indices], Z[indices], c=c[indices], s=s[indices], alpha=.5)
# ax.axis("Off")

Channel representation

In [None]:
# Load the channel metadata (JSON). The encoding is passed explicitly because
# JSON text is UTF-8 by specification, while open() otherwise falls back to the
# platform's locale encoding and can fail on non-ASCII channel names.
with open(os.path.join("..", "data", "channels_info.json"), "r", encoding="utf-8") as f:
    channels_info = json.load(f)

In [None]:
# Represent each channel by the element-wise mean of its videos' title embeddings.
channel_to_vec = {
    channel_name: np.mean(
        [title_to_vec[video["title"]] for video in channel_videos],
        axis=0,
    )
    for channel_name, channel_videos in videos_info_per_channel.items()
}
# The same vectors as a plain list, in the dictionary's channel order.
channel_encodings = list(channel_to_vec.values())

In [None]:
# Standardise the channel vectors feature-wise before running PCA.
unit_channel_encodings = StandardScaler().fit(channel_encodings).transform(channel_encodings)

# XY: 2-component projection, with its explained-variance ratios and singular values.
pca = PCA(n_components=2)
channel_XY = pca.fit_transform(unit_channel_encodings)
print(pca.explained_variance_ratio_, pca.singular_values_, sep="\n")
# Iterating the dict yields channel names in the same order as `channel_encodings`.
channel_to_xy = dict(zip(channel_to_vec, channel_XY))

# XYZ: 3-component projection. `pca` stays bound to this 3-component model afterwards.
pca = PCA(n_components=3)
channel_XYZ = pca.fit_transform(unit_channel_encodings)
print(pca.explained_variance_ratio_, pca.singular_values_, sep="\n")
channel_to_xyz = dict(zip(channel_to_vec, channel_XYZ))

In [None]:
# 3-D scatter of the channel projections: each point is labelled with its channel
# name, coloured by log(subscribers) and sized by sqrt(subscribers)/50.
plt.close("all")  # close every figure that is still open before drawing this one
fig, ax = plt.subplots(1,1, figsize=(10,10), subplot_kw=dict(projection='3d'))

# Build the list of channel names once, rather than rebuilding the dict's key
# list for every label inside the loop.
channel_names = list(channel_to_vec)

X = [channel_to_xyz[name][0] for name in channel_names]
Y = [channel_to_xyz[name][1] for name in channel_names]
Z = [channel_to_xyz[name][2] for name in channel_names]

for label, x, y, z in zip(channel_names, X, Y, Z):
    ax.text(x, y, z, label, fontsize=7)
c = [np.log(channels_info[name]["Subscribers"]) for name in channel_names]
s = [np.sqrt(channels_info[name]["Subscribers"])/50 for name in channel_names]
ax.scatter(X, Y, Z, c=c, s=s, alpha=.5)

In [None]:
# Rank every title (from all channels) by how close its raw embedding is to one
# channel's mean embedding, using the L1 (sum of absolute differences) distance.
name = list(channel_to_vec)[1]  # second channel in the dict's order
print("Channel name: ", name)
difference = np.sum(np.abs(title_encodings - channel_to_vec[name]), axis=-1)

print("TITLES SORTED BY REPRESENTATIVENESS:")
# Smallest distance first.
for title_idx in difference.argsort():
    print(titles[title_idx])