In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from gensim.models import LdaMulticore
from gensim.test.utils import datapath
from matplotlib.colors import rgb2hex

from helper_data import *
from helper_preprocess import *

%load_ext autoreload
%autoreload 2

## Chargement des données

In [2]:
# Load the per-video tag data for every walk stored under `walks_folder`.
# NOTE(review): the meaning of the second positional argument (True) is defined
# by the project helper and is not visible here — confirm against helper_data.
walks_folder = 'data/P5'
# Alternative loader kept for reference (comments instead of tags):
#data = load_all_walks_comments(walks_folder, True)
data = load_all_walks_tags(walks_folder, True)

In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(2591, 15)

In [4]:
# Turn the literal string 'None' into a real missing value so pandas' NaN tools see it.
data = data.replace({'None': np.nan})

In [5]:
# Number the distinct walk labels in order of first appearance and store the
# resulting integer id back in the 'walk' column.
walk_labels = data['walk'].unique().tolist()
mapping = dict(zip(walk_labels, range(len(walk_labels))))
data['walk'] = [mapping[label] for label in data['walk']]

In [6]:
# Keep only the videos that have keywords, then extract two parallel lists:
# the keywords of each remaining video and the id of the walk it belongs to.
data = data[data['keywords'].notna()]
tags, walks_id = data['keywords'].tolist(), data['walk'].tolist()

In [7]:
# Number of videos left once those without keywords are removed.
len(tags)

2374

## Preprocessing

In [8]:
# Run each video's keywords through the project's `preprocess` helper with tag=True.
tags_prep = list(map(lambda raw_tags: preprocess(raw_tags, tag=True), tags))

In [9]:
# Build the dictionary and the bag-of-words corpus from the preprocessed tags.
# Intended filtering: drop words that appear in more than 30% of the videos and
# words that appear less than 3 times.
# NOTE(review): how `create_corpus` applies these two thresholds is defined in
# the project helper and is not visible here — confirm.
min_words = 3
max_freq = 0.3

dictionary, corpus = create_corpus(tags_prep, min_words, max_freq)

In [10]:
# Number of documents in the corpus (one per video).
len(corpus)

2374

## Topic Modeling : LDA

In [24]:
# LDA hyper-parameters
workers = 3
passes = 1000
random_state = 123
num_topics = 20
alpha = 0.05  # < 1 gives sparse topics distributions
eta = 'auto'  # a fixed 0.05 was used before; < 1 gives sparse words distribution inside each topic

# Train the topic model on the bag-of-words corpus.
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    alpha=alpha,
    eta=eta,
    passes=passes,
    workers=workers,
    random_state=random_state,
)

# Persist the trained model so it can be reloaded without retraining.
# NOTE(review): gensim's `datapath` helper resolves names inside gensim's own
# test-data folder — confirm this is the intended save location.
tmp_file = datapath('lda_model')
lda_model.save(tmp_file)

# To reuse a previously saved model instead of training:
#lda_model = LdaMulticore.load(tmp_file)

In [25]:
# Display, for every topic, its 10 highest-weighted words as (word, weight) pairs.
lda_model.show_topics(formatted=False, num_topics=num_topics, num_words=10)

[(0,
  [('free', 0.10918209),
   ('camera', 0.10340812),
   ('phone', 0.096383885),
   ('new', 0.05572895),
   ('york', 0.016945742),
   ('factory', 0.014098272),
   ('university', 0.01299128),
   ('train', 0.012091737),
   ('city', 0.012088214),
   ('spacex', 0.011088282)]),
 (1,
  [('music', 0.22699352),
   ('relaxing', 0.03162622),
   ('work', 0.030936139),
   ('classical', 0.023912508),
   ('best', 0.021995904),
   ('holt', 0.017290913),
   ('study', 0.017203778),
   ('speech', 0.015676802),
   ('studying', 0.015143431),
   ('motivational', 0.014618548)]),
 (2,
  [('crash', 0.07404378),
   ('air', 0.038574163),
   ('space', 0.037751295),
   ('investigation', 0.034324165),
   ('flight', 0.03087712),
   ('plane', 0.02632396),
   ('history', 0.0251378),
   ('harrier', 0.021186786),
   ('fear', 0.019347854),
   ('flying', 0.018459195)]),
 (3,
  [('comedy', 0.05913045),
   ('greatest', 0.03674382),
   ('hit', 0.034114596),
   ('short', 0.027668571),
   ('film', 0.023725387),
   ('securi

In [340]:
# Interpret the topics: human-readable labels, indexed by topic id.
topics_name = ['Phones/Cameras', 'Music', 'Airplane incident', 'Comedy', 'News', 'Food', 'War', 'Johnny Depp / Amber Heard', 'International news', 'Human rights', 'International/Foreign Policy', '']
# Labels written for an earlier model run, kept for reference:
# ['Education & Health', 'Daily TV', 'Countries', 'x Got Talent', 'Top News', 'Airplane accident', 'Motivation', 'Royal Family', 'Business & Industry', 'Johnny Depp / Amber Heard', '???', 'New York', 'Ted Talks', 'Fox News', 'Documentaries & History', 'Music', 'Ukraine - Russia War', 'International affairs on war', 'Investing', 'Music']

# The label list may be shorter than `num_topics` or contain blank entries.
# Any topic without a usable label gets a generic 'Topic <id>' name, so every
# topic id has a non-empty label and the lookup never runs past the list.
topic_id2name = {
    i: (topics_name[i] if i < len(topics_name) and topics_name[i] else f'Topic {i}')
    for i in range(num_topics)
}
# Reverse lookup: label -> topic id. Two topics sharing a label would collapse here.
topic_name2id = {v: k for k, v in topic_id2name.items()}

## Visualisations des vidéos : t-SNE sur les distributions des topics

Nous utilisons le vecteur de la distribution des topics de chaque vidéo ainsi que l'algorithme t-SNE afin de les représenter en 2D.

In [341]:
# Build the "embeddings" for each document using its topics distribution:
# row i holds the probability of every topic for document i.
embeddings = np.zeros((len(corpus), num_topics))
for doc_idx, doc_bow in enumerate(corpus):
    # minimum_probability=0.0 asks gensim for the whole distribution; with the
    # default threshold low-probability topics are discarded, which leaves
    # truncated rows that no longer sum to 1.
    topics_distr = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
    for topic_id, prob in topics_distr:
        embeddings[doc_idx, topic_id] = prob

In [342]:
from sklearn.manifold import TSNE

# Project the topic distributions to 2D. PCA initialisation is chosen because it
# is more stable.
tsne = TSNE(
    n_components=2,
    init='pca',
    learning_rate='auto',
    random_state=random_state,
)
corpus_emb = tsne.fit_transform(embeddings)


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



Le topic le plus pertinent est attribué à chaque vidéo.

In [343]:
# Colormap used to turn scalar values into hex colours. It is defined here so
# this cell also runs on a fresh kernel.
cmap = plt.get_cmap('magma')


def val2rgb(val):
    """Return the hex colour string that `cmap` assigns to `val`.

    `val` is forwarded unchanged to the colormap: a float is read as a position
    in [0, 1], an integer as a direct index into the colormap's lookup table.
    The alpha channel is dropped before the conversion to hex.
    """
    return rgb2hex(cmap(val)[:3])


xs, ys = corpus_emb[:, 0], corpus_emb[:, 1]
# Index of the most probable topic for each video (one row of `embeddings` per video).
best_topic_ids = embeddings.argmax(axis=1).tolist()

# One colour per video, spread over the whole colormap by position in the corpus.
# Positions are scaled to [0, 1]; raw integer indices would run past the
# colormap's lookup table and give every later video the same colour.
# NOTE(review): runs are not separated here — all runs of the loaded folder are mixed.
n_points = len(xs)
colors = [val2rgb(i / max(n_points - 1, 1)) for i in range(n_points)]

In [344]:
# Gather the 2D coordinates and the per-video attributes in one frame for plotly.
df = pd.DataFrame({
    'x': corpus_emb[:, 0],
    'y': corpus_emb[:, 1],
    'best_topic_id': best_topic_ids,
    'walk_id': list(map(str, walks_id)),
    'color': colors,
})

In [345]:
import plotly.express as px

# Scatter of the 2D video embeddings, one colour per run; the best topic id is
# shown on hover.
fig = px.scatter(
    df,
    x='x',
    y='y',
    color='walk_id',
    hover_data=['best_topic_id'],
    labels={'walk_id': 'Run'},
    width=800,
    height=800,
)
fig.update_layout(title='Videos embeddings by run')
fig.show()

Ajout de ces nouvelles données au dataframe original.

In [346]:
# One column per topic probability, plus the id and label of the dominant topic.
topic_columns = [f'topic_id_{i}' for i in range(embeddings.shape[1])]
df_embeddings = pd.DataFrame(embeddings, columns=topic_columns)
df_embeddings['video_best_topic_id'] = embeddings.argmax(axis=1)
df_embeddings['video_best_topic'] = df_embeddings['video_best_topic_id'].map(topic_id2name.__getitem__)

# Align on position: `data` gets a fresh 0..n-1 index before the column-wise concat.
df_output = pd.concat([data.reset_index(drop=True), df_embeddings], axis=1)

In [347]:
def _first_mode(values):
    """Return the first value reported by `Series.mode` for `values`."""
    return values.mode().iloc[0]


# Most frequent dominant topic per walk; when several topics tie, only the
# first mode is kept.
df_walk_best_topic = (
    df_output[['walk', 'video_best_topic_id']]
    .groupby('walk')
    .agg(_first_mode)
    .rename(columns={'video_best_topic_id': 'walk_best_topic_id'})
)
df_walk_best_topic['walk_best_topic'] = df_walk_best_topic['walk_best_topic_id'].map(topic_id2name.__getitem__)

# Attach the walk-level topic to every video of that walk.
df_output = df_output.join(df_walk_best_topic, on='walk', how='left')

# Visualizations + Basic statistics for website

In [348]:
# Store the two t-SNE coordinates of every video next to its other attributes.
df_output['x'], df_output['y'] = corpus_emb[:, 0], corpus_emb[:, 1]

In [349]:
# Distinct dominant-topic labels, in order of first appearance.
all_topics = list(df_output['video_best_topic'].unique())

In [350]:
from matplotlib.colors import rgb2hex
import matplotlib.pyplot as plt

# Colormap shared by the colour helpers below.
cmap = plt.get_cmap('magma')


def val2rgb(val):
    """Return the hex colour string that `cmap` assigns to `val`.

    `val` is forwarded unchanged to the colormap: a float is read as a position
    in [0, 1], an integer as a direct index into the colormap's lookup table.
    The alpha channel is dropped before the conversion to hex.
    """
    return rgb2hex(cmap(val)[:3])


def str2color(txt):
    """Return the hex colour of the topic whose label is `txt`.

    The topic id is scaled to [0, 1] by the largest known id so that topics are
    spread over the whole colormap. Passing the raw integer id would index the
    first few lookup-table entries only, giving near-identical colours.

    Raises:
        KeyError: if `str(txt)` is not a key of `topic_name2id`.
    """
    topic_id = topic_name2id[str(txt)]
    max_id = max(topic_name2id.values(), default=0)
    position = topic_id / max_id if max_id else 0.0
    return val2rgb(position)

## Statistiques de base

### Durée des vidéos au fur et à mesure du run

In [351]:
# Keep only the rows that have both a run id and a video duration.
df_plot = df_output.dropna(subset=['walk', 'video_duration'])

# One line per run: video duration (log scale) against the position in the run.
fig = px.line(
    df_plot,
    x='video_id_in_run',
    y='video_duration',
    color=df_plot['walk'],
    log_y=True,
    markers=True,
    hover_data=['video_duration', 'theme'],
    labels={
        'video_duration': 'Durée [s]',
        'video_id_in_run': 'Position dans le run',
        'theme': 'Thème',
        'walk': 'Run',
    },
    title='Durée des vidéos durant le run',
)
fig.update_layout(legend_title='Run')

# Export a standalone HTML copy, then render inline.
fig.write_html('plots/p5_video_duration.html')
fig.show()

### Nombre de vues durant le run

In [352]:
# Keep only the rows that have both a run id and a view count.
df_plot = df_output.dropna(subset=['walk', 'nb_views'])

# Position of each remaining video inside its own run (0, 1, 2, ...).
position_in_run = df_plot.groupby('walk').cumcount()

# One line per run: view count (log scale) against the position in the run.
fig = px.line(
    df_plot,
    x=position_in_run,
    y='nb_views',
    color=df_plot['walk'],
    log_y=True,
    markers=True,
    hover_data=['nb_views', 'theme'],
    labels={
        'nb_views': 'Nombre de vues',
        'x': 'Position dans le run',
        'theme': 'Thème',
    },
    title='Nombre de vues durant le run',
)
fig.update_layout(legend_title='Run')

# Export a standalone HTML copy, then render inline.
fig.write_html('plots/p5_nb_views.html')
fig.show()

### Nombre de followers des chaînes durant le run

In [394]:
def map_sub(x):
    """Convert a subscriber-count label to a number.

    Accepted text forms are a plain non-negative number ('950', '12.5') or a
    number followed by a magnitude suffix, case-insensitive: 'k' (thousand),
    'm' (million) or 'b' (billion), e.g. '12K' or '1.5M'. Surrounding
    whitespace is ignored. Values that are already numeric are returned as
    floats.

    Args:
        x: the raw label (usually a string).

    Returns:
        The count as a float, or NaN when `x` cannot be interpreted.
    """
    import re  # local import keeps this cell runnable on its own

    tens = {'k': 1000, 'm': 1_000_000, 'b': 1_000_000_000}

    # Real numbers pass straight through (booleans are not counts).
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        return float(x)
    if not isinstance(x, str):
        return np.nan

    # Digits with an optional decimal part, then an optional magnitude suffix.
    # Comma-separated forms are left unparsed on purpose: whether the comma is
    # a thousands or a decimal separator cannot be known from the label alone.
    match = re.fullmatch(r'(\d+(?:\.\d+)?)\s*([kmb]?)', x.strip(), flags=re.IGNORECASE)
    if match is None:
        return np.nan

    number, suffix = match.groups()
    return float(number) * tens.get(suffix.lower(), 1)

# Build the plotting frame in one chain so that no column is assigned on a
# filtered slice of `df_output` (which triggers pandas' SettingWithCopyWarning):
#   1. keep rows that have a run id and a subscriber label,
#   2. convert the label to a number with `map_sub`,
#   3. drop the labels that could not be converted,
#   4. store the count as an integer.
df_plot = (
    df_output
    .dropna(subset=['walk', 'nb_sub'])
    .assign(nb_sub=lambda frame: frame['nb_sub'].apply(map_sub))
    .dropna(subset=['nb_sub'])
    .astype({'nb_sub': int})
)

# One line per run: subscriber count against the position in the run.
fig = px.line(df_plot, x=df_plot.groupby('walk').cumcount(), y='nb_sub',
              color=df_plot['walk'], title='Nombre de followers de la chaîne durant le run',
              # Linear axis; a log axis was tried and disabled:
              #log_y=True,
              labels={'nb_sub': 'Nombre de followers', 'x': 'Position dans le run', 'theme': 'Thème'},
              markers=True,
              hover_data=['nb_sub', 'theme'])

fig.update_layout(
    legend_title='Run'
)
# Export a standalone HTML copy, then render inline.
fig.write_html('plots/p5_nb_sub.html')
fig.show()

### Attribution des topics durant le run

In [395]:
# Restrict the plot to the videos of a single run (walk id 21).
has_walk = df_output['walk'].notna()
df_plot = df_output[has_walk & (df_output['walk'] == 21)]

# Stacked bars: one bar per video position, one segment per topic probability.
topic_columns = [f'topic_id_{i}' for i in range(num_topics)]
fig = px.bar(
    data_frame=df_plot,
    x='video_id_in_run',
    y=topic_columns,
    labels={
        'variable': 'Topic',
        'video_id_in_run': 'Position dans le run',
        'value': 'Probabilité du topic',
    },
)

# Column name -> human-readable topic label.
newnames = {column: topic_id2name[i] for i, column in enumerate(topic_columns)}


def _rename_trace(trace):
    """Replace the column name by the topic label in the trace's name, legend group and hover text."""
    label = newnames[trace.name]
    trace.update(
        name=label,
        legendgroup=label,
        hovertemplate=trace.hovertemplate.replace(trace.name, label),
    )


fig.for_each_trace(_rename_trace)
fig.update_layout(title='Distribution des topics durant le run')

# Export a standalone HTML copy, then render inline.
fig.write_html('plots/p5_topics_distr.html')
fig.show()

## Color runs based on the most frequent topic in this run

In [128]:
# Per-point values exposed to the hover template: customdata[0] is the video's
# topic, customdata[1] the run's topic and customdata[2] the run id.
customdata = df_output[['video_best_topic', 'walk_best_topic', 'walk']]

# Scatter of the 2D video embeddings, coloured by the id of the most frequent
# topic of the run each video belongs to. The figure is built directly with
# graph_objects; an earlier plotly-express figure assigned to `fig` was
# overwritten before being shown and has been removed.
# NOTE(review): the hover template contains closing </b> tags without matching
# opening tags — confirm the intended markup.
fig = go.Figure(go.Scatter(
    x=df_output['x'], y=df_output['y'],
    customdata=customdata,
    marker=dict(color=df_output['walk_best_topic_id'], colorbar=dict(title='Run topics')),
    mode='markers', name='',
    hovertemplate=
    '<br>Video topic : %{customdata[0]}</b><br>Run topic : %{customdata[1]}</b><br>Run id : %{customdata[2]}</br>'
))

fig.update_layout(
    width=1200,
    height=900,
    title='Runs colored by most frequent topic'
)
fig.show()