In [4]:
import os

import pickle

import nltk
import emoji
import numpy as np
import re

import pandas as pd
import scipy.sparse
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd

en_stopwords = set(stopwords.words('english'))
en_stopwords.update([s.capitalize() for s in stopwords.words('english')])

#fr_stopwords = set(stopwords.words('french'))
#fr_stopwords.update([s.capitalize() for s in stopwords.words('french')])

import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language

@Language.factory('language_detector')
def language_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x7f07429f00d0>

# Comments preprocessing pipeline

## Loading data

In [61]:
def load_walk(path, mode):
    """
    Load the data of a walk
    :param path: path of the file
    :param mode: what data to load
    :return: dataframe containing the data of the walk
    """
    use_cols = None
    if mode == 'comments':
        use_cols = range(1, 6)
    elif mode == 'infos':
        use_cols = range(1, 10)
    else:
        print('Mode not supported')
        return
    return pd.read_csv(path, compression='bz2', usecols=use_cols)


def load_from_folder(folder, processing, mode):
    """
    Load the comments of all the walks
    present in the given folder
    :param folder: folder containing the walks
    :param processing: function that process each file of
    the folder
    :param mode: what data to load
    :return: dataframe containing all the comments
    """
    dfs = []
    walks = 0

    with os.scandir(folder) as it:
        for entry in it:
            if entry.name.endswith(f'{mode}.csv.bz2') and entry.is_file():
                df = processing(os.path.join(folder, entry.name), walks)
                dfs.append(df)
                walks += 1

    return pd.concat(dfs)


def load_all_walks_comments(folder, keep_en):
    """
    Load the comments of all the walks
    present in the given folder
    :param folder: folder containing the walks
    :param keep_en: only keeps english comments
    :return: dataframe containing all the comments
    """
    def processing(path, walk):
        df = load_walk(path, 'comments')
        if keep_en:
            df = df.loc[df['text'].apply(lambda x: nlp(str(x))._.language['language'] == 'en'), :]
        df['walk'] = walk
        return df

    return load_from_folder(folder, processing, 'comments')

def load_all_walks_tags(folder, keep_en):
    """
    Load the tags of all the walks
    present in the given folder
    :param folder: folder containing the walks
    :param keep_en: only keeps english tags
    :return: dataframe containing all the tags
    """
    def processing(path, walk):
        df = load_walk(path, 'infos')
        def process(tags):
            res = []
            for t in tags[1:-1].replace("'", '').split(', '):
                if keep_en:
                    if nlp(str(t))._.language['language'] == 'en':
                        res.append(t)
                else:
                    res.append(t)

            return ' '.join(res) if len(res) > 0 else np.nan

        df['keywords'] = df['keywords'].apply(process)
        df['walk'] = walk
        return df

    return load_from_folder(folder, processing, 'infos')

## Comment preprocessing

In [7]:
# Complete preprocessing
def preprocess(comment):
    """
    Preprocess a comment :
        - remove numbers
        - lower first letter of each sentence
        - remove stopwords and words of 1 letter
        - lower the words entirely in capital
        - lemmatize
        - remove again stopwords and words of 1 letter
    :param comment: string containing the comment
    :return: list of the tokens
    """
    # Remove numbers
    comment = re.sub(r'\d+', '', comment)

    # Lower first letter of each sentence
    lower_first_word = lambda tab: ' '.join(tab[0].lower() + tab[1:])
    comment = ' '.join([lower_first_word(sentence.split(' ')) for sentence in comment.split('.')])

    # Tokenize by word
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    words_tokens = tokenizer.tokenize(comment)

    # Remove stopwords and words of length 1
    remove_stopwords = lambda wts: [w for w in wts if (not w in en_stopwords) and len(w) > 1]
    words_tokens = remove_stopwords(words_tokens)

    # Lower capital words
    for i in range(len(words_tokens)):
        if words_tokens[i].isupper():
            words_tokens[i] = words_tokens[i].lower()

    # Lemmatization with WordNet
    lemmatizer = WordNetLemmatizer()
    words_tokens = [lemmatizer.lemmatize(wt) for wt in words_tokens]

    # Remove stopwords and words of length 1
    words_tokens = remove_stopwords(words_tokens)

    return words_tokens

In [8]:
# Preprocessor and tokenizer to be used directly in TfidfVectorizer
def preprocessor(comment):
    """
    Preprocess a comment:
        - remove numbers
        - demojize the comment
        - lower first letter of each sentence
    :param comment: string containing the comment
    :return: string containing the preprocessed comment
    """
    # Remove numbers
    comment = re.sub(r'\d+', '', comment)

    # Demojize the comment
    comment = emoji.demojize(comment)

    # Lower first letter of each sentence
    lower_first_word = lambda tab: ' '.join([tab[0].lower()] + tab[1:])
    comment = ' '.join([lower_first_word(sentence.split(' ')) for sentence in comment.split('.')])

    return comment

def tokenizer(comment):
    """
    Tokenize and process the tokens of a comment :
        - tokenize the comment by word
        - remove stopwords and words of 1 letter
        - lower the words entirely in capital
        - lemmatize
        - remove stopwords and words of 1 letter
    :param comment: string containing the comment
    :return: list of tokens for this comment
    """
    # Tokenize by word
    tokenizer = nltk.RegexpTokenizer(r'\w+|:\w+:')
    words_tokens = tokenizer.tokenize(comment)

    # Remove stopwords and words of lenght 1
    remove_stopwords = lambda wts: [w for w in wts if (not w in en_stopwords) and len(w) > 1]
    words_tokens = remove_stopwords(words_tokens)

    # Lower capital words
    for i in range(len(words_tokens)):
        if words_tokens[i].isupper():
            words_tokens[i] = words_tokens[i].lower()

    # Lemmatization with WordNet
    lemmatizer = WordNetLemmatizer()
    words_tokens = [lemmatizer.lemmatize(wt) for wt in words_tokens]

    # Remove stopwords and words of lenght 1
    words_tokens = remove_stopwords(words_tokens)

    return words_tokens

## TF-IDF Matrix

In [62]:
# Load the data
#walk_path = '2022_03_20.20_28_27.0'
#data = load_walk(f'data/P3/{walk_path}.comments.csv.bz2')

walks_folder = 'data/P3'
walk_path = 'all_p3'
#data = load_all_walks_comments(walks_folder, True)
data = load_all_walks_tags(walks_folder, True)

In [63]:
data.head()

Unnamed: 0,video_link,title,description,channel_link,channel_title,keywords,nb_like,nb_views,nb_sub,walk
0,https://www.youtube.com/watch?v=Vaz_kpmTi0M,"Hey Mama - David Guetta ft. Nicki Minaj, Bebe ...",Minny Park teaches choreography to Hey Mama by...,1MILLION Dance Studio,https://www.youtube.com/channel/UCw8ZhLPdQ0u_Y...,choreography hiphop,1542688,118020394.0,25M,0
1,https://www.youtube.com/watch?v=Kl5B6MBAntI,,ED SHEERAN - Shape Of You | Dance Choreography...,KYLE HANAGAMI,https://www.youtube.com/channel/UCGzGbfhdFsjP1...,ed sheeran Shape of you hip hop choreograph...,2022904,,4.54M,0
2,https://www.youtube.com/watch?v=0HKfjsM2hSw,#JusticeForMOMOLAND #MerriesStandWithDAISY Mom...,Daisy says ‘Finding Momoland’ survival show wa...,kurokuroku,https://www.youtube.com/channel/UCRUE1E1_DOnXN...,momoland comeback momoland new song momolan...,588633,227955367.0,,0
3,https://www.youtube.com/watch?v=dhQtq9YxrGI,TAKI TAKI | @placedancers Choreography Julie B,,PLACE,https://www.youtube.com/channel/UCo3BbgAZgkT7T...,choreography,53433,15609749.0,375K,0
4,https://www.youtube.com/watch?v=r6q3SkBIdTI,"MOMOLAND(모모랜드) - ""BAAM"" Moving Dance Practice","MOMOLAND(모모랜드) - ""BAAM"" Moving Dance Practice\...",MLD ENTERTAINMENT,https://www.youtube.com/channel/UC62GnXAfqgm8F...,,845199,224876346.0,1.91M,0


In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 525 entries, 0 to 45
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   video_link     525 non-null    object
 1   title          509 non-null    object
 2   description    453 non-null    object
 3   channel_link   525 non-null    object
 4   channel_title  525 non-null    object
 5   keywords       492 non-null    object
 6   nb_like        525 non-null    object
 7   nb_views       525 non-null    object
 8   nb_sub         525 non-null    object
 9   walk           525 non-null    int64 
dtypes: int64(1), object(9)
memory usage: 45.1+ KB


In [65]:
data.isna().sum()

video_link        0
title            16
description      72
channel_link      0
channel_title     0
keywords         33
nb_like           0
nb_views          0
nb_sub            0
walk              0
dtype: int64

In [66]:
# Drop the comments where nan occurs
data = data.dropna(subset='keywords')

In [67]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 492 entries, 0 to 45
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   video_link     492 non-null    object
 1   title          476 non-null    object
 2   description    426 non-null    object
 3   channel_link   492 non-null    object
 4   channel_title  492 non-null    object
 5   keywords       492 non-null    object
 6   nb_like        492 non-null    object
 7   nb_views       492 non-null    object
 8   nb_sub         492 non-null    object
 9   walk           492 non-null    int64 
dtypes: int64(1), object(9)
memory usage: 42.3+ KB


We had 5675 non nan entries before filtering on the language.
-> filter during crawl ?

In [68]:
#comments = data['text'].tolist()
comments = data['keywords'].tolist()

In [69]:
comments

['choreography  hiphop',
 'ed sheeran  Shape of you  hip hop  choreography  castle on a hill  shape of you ed  shape of you ed sheeran  cover  shape of you lyrics  shape of you dance  perfect ed sheeran " hearts dont break around here"  ed sheeran new album',
 ' momoland comeback  momoland new song  momoland bboom bboom  momoland boom boom  momoland boom  momoland meme  momoland rap  momoland new release  momoland album',
 ' choreography',
 'New',
 ' world cup song  This Time for Africa  FIFA World Cup Song  Addicted to You  whenever wherever  super bowl  super bowl halftime  halftime show  JLO halftime',
 ' sexiest women  try not to sing  oh baby when you talk like that shakira  power hour " hips dont lie shakira lyrics"  4th of July songs " 2000s hits"  Wyclef Jean',
 ' the ketchup song',
 'a little bit of  acoustic  official  lyrics  official video',
 ' acoustic  fighter  xtina official  official  Xtina candyman live  official video  cover  what a girl wants  back to basics  Christi

In [71]:
# Vectorizer for TF-IDF
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer)

# Transform the comments corpus
# into a sparse TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(comments)

# Save the matrix
scipy.sparse.save_npz(f'data/TF_IDF/{walk_path}_tfidf_comments.npz', tfidf_matrix)

# Save the vocabulary : dict(term: feature index)
with open(f'data/vocabulary/{walk_path}_vocabulary.pickle', 'wb') as f:
    pickle.dump(tfidf_vectorizer.vocabulary_, f)

# Latent Semantic Analyis : SVD of the TF-IDF matrix

In [21]:
def project_SVD(X, dim=2, seed=0):
    """
    Compute the truncated SVD of the matrix X, keeping
    an approximation of rank dim (i.e. dim features
    in the embedding)
    :param X: TF-IDF sparse matrix
    :param dim: rank of the truncated SVD
    :param seed: seed for the random SVD
    :return: Documents embedding, tokens embedding
    """
    # Compute the truncated SVD
    U, sigmas, Vt = randomized_svd(X, n_components=dim, random_state=seed)

    # Tokens embeddings
    X_emb = U @ np.diag(sigmas)

    return X_emb, Vt

In [26]:
def print_features_description(Vt, index_map, top_num=10):
    """
    Print the most important tokens for each
    features of the tokens embeddings.
    :param Vt: tokens embedding
    :param index_map: dict(feature index: terme)
    :param top_num: number of tokens to print
    :return: -
    """
    for i in range(Vt.shape[0]):
        sord_idx = np.argsort(Vt[i])
        top_min_idx = sord_idx[:top_num]
        top_max_idx = sord_idx[::-1][:top_num]
        print(f'\nThe top {top_num} max values for feature {i} are:')
        for index in top_max_idx:
            print(f'{index_map[index]:<30} {Vt[i,index]:.4f}')

        print(f'\nThe top {top_num} min values for feature {i} are:')
        for index in top_min_idx:
            print(f'{index_map[index]:<30} {Vt[i,index]:.4f}')

        print("\n")

In [72]:
X_emb, Vt = project_SVD(tfidf_matrix, dim=10)

In [73]:
index_map = {v: k for k, v in tfidf_vectorizer.vocabulary_.items()}

In [74]:
print_features_description(Vt, index_map)


The top 10 max values for feature 0 are:
free                           0.6208
camera                         0.5560
phone                          0.5525
gas                            0.0139
use                            0.0039
garbage                        0.0037
waste                          0.0035
water                          0.0034
pump                           0.0034
make                           0.0032

The top 10 min values for feature 0 are:
country                        -0.0000
Hilson                         -0.0000
yo                             -0.0000
ne                             -0.0000
Lipstick                       -0.0000
Secret                         -0.0000
Little                         -0.0000
Traces                         -0.0000
lithium                        -0.0000
repacking                      -0.0000



The top 10 max values for feature 1 are:
bottle                         0.4404
flip                           0.4378
dude                      