In [15]:
import pickle

import nltk
import emoji
import numpy as np
import re

import pandas as pd
import scipy.sparse
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd

en_stopwords = set(stopwords.words('english'))
en_stopwords.update([s.capitalize() for s in stopwords.words('english')])

fr_stopwords = set(stopwords.words('french'))
fr_stopwords.update([s.capitalize() for s in stopwords.words('french')])

# Comments preprocessing pipeline

## Comment preprocessing

In [2]:
# Complete preprocessing
def preprocess(comment):
    """
    Preprocess a comment :
        - remove numbers
        - lower first letter of each sentence
        - remove stopwords and words of 1 letter
        - lower the words entirely in capital
        - lemmatize
        - remove again stopwords and words of 1 letter
    :param comment: string containing the comment
    :return: list of the tokens
    """
    # Remove numbers
    comment = re.sub(r'\d+', '', comment)

    # Lower first letter of each sentence
    lower_first_word = lambda tab: ' '.join(tab[0].lower() + tab[1:])
    comment = ' '.join([lower_first_word(sentence.split(' ')) for sentence in comment.split('.')])

    # Tokenize by word
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    words_tokens = tokenizer.tokenize(comment)

    # Remove stopwords and words of length 1
    remove_stopwords = lambda wts: [w for w in wts if (not w in en_stopwords) and len(w) > 1]
    words_tokens = remove_stopwords(words_tokens)

    # Lower capital words
    for i in range(len(words_tokens)):
        if words_tokens[i].isupper():
            words_tokens[i] = words_tokens[i].lower()

    # Lemmatization with WordNet
    lemmatizer = WordNetLemmatizer()
    words_tokens = [lemmatizer.lemmatize(wt) for wt in words_tokens]

    # Remove stopwords and words of length 1
    words_tokens = remove_stopwords(words_tokens)

    return words_tokens

In [76]:
# Preprocessor and tokenizer to be used directly in TfidfVectorizer
def preprocessor(comment):
    """
    Preprocess a comment:
        - remove numbers
        - demojize the comment
        - lower first letter of each sentence
    :param comment: string containing the comment
    :return: string containing the preprocessed comment
    """
    # Remove numbers
    comment = re.sub(r'\d+', '', comment)

    # Demojize the comment
    comment = emoji.demojize(comment)

    # Lower first letter of each sentence
    lower_first_word = lambda tab: ' '.join([tab[0].lower()] + tab[1:])
    comment = ' '.join([lower_first_word(sentence.split(' ')) for sentence in comment.split('.')])

    return comment

def tokenizer(comment):
    """
    Tokenize and process the tokens of a comment :
        - tokenize the comment by word
        - remove stopwords and words of 1 letter
        - lower the words entirely in capital
        - lemmatize
        - remove stopwords and words of 1 letter
    :param comment: string containing the comment
    :return: list of tokens for this comment
    """
    # Tokenize by mot
    tokenizer = nltk.RegexpTokenizer(r'\w+|:\w+:')
    words_tokens = tokenizer.tokenize(comment)

    # Remove stopwords and words of lenght 1
    remove_stopwords = lambda wts: [w for w in wts if (not w in fr_stopwords) and len(w) > 1]
    words_tokens = remove_stopwords(words_tokens)

    # Lower capital words
    for i in range(len(words_tokens)):
        if words_tokens[i].isupper():
            words_tokens[i] = words_tokens[i].lower()

    # Lemmatization with WordNet
    lemmatizer = WordNetLemmatizer()
    words_tokens = [lemmatizer.lemmatize(wt) for wt in words_tokens]

    # Remove stopwords and words of lenght 1
    words_tokens = remove_stopwords(words_tokens)

    return words_tokens

## TF-IDF Matrix

In [77]:
# Load the data
data = pd.read_csv('data/comments.csv', index_col=0)

In [78]:
data.head()

Unnamed: 0,channel_link,channel_name,text,nb_like
0,https://www.youtube.com/channel/UCGRE-vFMetjgn...,No Name,La RTS svp mettez le match entier sur Youtube ...,2300
1,https://www.youtube.com/channel/UCh259KSVuFzcq...,léa,"Les meilleurs commentateurs, tout simplement! ...",2100
2,https://www.youtube.com/channel/UC4pupP9eDgmtB...,Aurélie Manon FR,A 3-1 je suis partie défaitiste mais ce match ...,1400
3,https://www.youtube.com/channel/UCyzr5c0CAxaGa...,Ethan Stähli,"Même des mois plus tard, chaques fois que je r...",9
4,https://www.youtube.com/channel/UCGOLHrn4shwFq...,Emil Petrov,"Même si la suisse avait perdu, ce match rester...",777


In [79]:
comments = data['text'].tolist()

In [80]:
comments

['La RTS svp mettez le match entier sur Youtube c’est historique ! Liker en masse',
 "Les meilleurs commentateurs, tout simplement! Leur émotion, leur sympathie et leur joie à toute épreuve font d'eux des trésors nationaux.",
 "A 3-1 je suis partie défaitiste mais ce match m'a donné la leçon de ma vie : ne jamais désespérer peu importe qui tu as en face de toi, continues à te battre. Merci mon pays de me rappeler les bases ce soir 🇨🇭",
 'Même des mois plus tard, chaques fois que je regarde ce match, je suis tellement contennntt, les sensations sont incroyable, la chaire de poule, le sourire\nMerci la Nati️️',
 "Même si la suisse avait perdu, ce match resterait magnifique. Intense du début à la fin, on a su revenir dans le dernier quart d'heure !",
 'J’ai vécu la soirée de sport la plus intense de toute ma vie. Lors du 3-1 j’étais sûr à 100% qu’on allait perdre. Merci pour tout c’était exceptionnel. Et l’ambiance dans les rues de Neuchâtel, de la pure folie.',
 'INCROYABLE c’est telleme

In [81]:
# Vectorizer for TF-IDF
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer)

# Transform the comments corpus
# into a sparse TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(comments)

# Save the matrix
scipy.sparse.save_npz('data/tfidf_comments.npz', tfidf_matrix)

# Save the vocabulary : dict(term: feature index)
with open('data/vocabulary.pickle', 'wb') as f:
    pickle.dump(tfidf_vectorizer.vocabulary_, f)

# Latent Semantic Analyis : SVD of the TF-IDF matrix

In [82]:
def project_SVD(X, dim=2, seed=0):
    """
    Compute the truncated SVD of the matrix X, keeping
    an approximation of rank dim (i.e. dim features
    in the embedding)
    :param X: TF-IDF sparse matrix
    :param dim: rank of the truncated SVD
    :param seed: seed for the random SVD
    :return: Documents embedding, tokens embedding
    """
    # Compute the truncated SVD
    U, sigmas, Vt = randomized_svd(X, n_components=dim, random_state=seed)

    # Tokens embeddings
    X_emb = U @ np.diag(sigmas)

    return X_emb, Vt

In [83]:
def print_features_description(Vt, index_map, top_num=5):
    """
    Print the most important tokens for each
    features of the tokens embeddings.
    :param Vt: tokens embedding
    :param index_map: dict(feature index: terme)
    :param top_num: number of tokens to print
    :return: -
    """
    for i in range(Vt.shape[0]):
        sord_idx = np.argsort(Vt[i])
        top_min_idx = sord_idx[:top_num]
        top_max_idx = sord_idx[::-1][:top_num]
        print(f'\nThe top {top_num} max values for feature {i} are:')
        for index in top_max_idx:
            print(f'{index_map[index]:<30} {Vt[i,index]:.4f}')

        print(f'\nThe top {top_num} min values for feature {i} are:')
        for index in top_min_idx:
            print(f'{index_map[index]:<30} {Vt[i,index]:.4f}')

        print("\n")

In [84]:
X_emb, Vt = project_SVD(tfidf_matrix, dim=5)

In [85]:
index_map = {v: k for k, v in tfidf_vectorizer.vocabulary_.items()}

In [86]:
print_features_description(Vt, index_map)


The top 5 max values for feature 0 are:
:Switzerland:                  0.6617
match                          0.3043
suisse                         0.1914
Suisse                         0.1690
incroyable                     0.1552

The top 5 min values for feature 0 are:
go                             0.0002
Do                             0.0002
Love                           0.0002
like                           0.0002
in                             0.0002



The top 5 max values for feature 1 are:
:Switzerland:                  0.6117
detre                          0.0744
fierte                         0.0744
magnifiques                    0.0406
joué                           0.0403

The top 5 min values for feature 1 are:
match                          -0.2596
plus                           -0.2047
Suisse                         -0.1891
incroyable                     -0.1820
équipe                         -0.1772



The top 5 max values for feature 2 are:
juste                     