In [1]:
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import scipy.sparse
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd

# English stop words in both their lowercase and capitalized forms: the
# token filters test membership case-sensitively, so the capitalized
# variants are needed to also drop words such as "The".
en_stopwords = {
    variant
    for word in stopwords.words('english')
    for variant in (word, word.capitalize())
}

# Pipeline de preprocessing des commentaires

## Load data

In [2]:
def load_walk(path, mode):
    """
    Read one walk file into a dataframe.

    :param path: path of the bz2-compressed CSV file
    :param mode: which kind of file is read; 'comments' keeps the
    columns 1 to 5 and 'infos' keeps the columns 1 to 9 (column 0 is
    left out in both cases)
    :return: dataframe with the selected columns, or None (after
    printing a message) when the mode is not supported
    """
    columns_by_mode = (
        ('comments', range(1, 6)),
        ('infos', range(1, 10)),
    )

    for supported_mode, columns in columns_by_mode:
        if mode == supported_mode:
            return pd.read_csv(path, compression='bz2', usecols=columns)

    # Unknown mode: report it and fall through to an implicit None
    print('Mode not supported')
    return None


def load_from_folder(folder, processing, mode):
    """
    Load and concatenate the data of every walk file found in a folder.

    A directory entry is picked up when it is a regular file whose name
    ends with '<mode>.csv.bz2'. Files are visited in sorted order so that
    the walk index handed to `processing` is reproducible from one run to
    the next (os.scandir yields entries in arbitrary order).

    :param folder: folder containing the walks
    :param processing: callable `processing(path, walk)` receiving the
    path of a file and its 0-based walk index, and returning a dataframe
    :param mode: what data to load; used as the file name suffix
    :return: dataframe concatenating the result of every processed file
    :raises ValueError: if no matching file is found in the folder
    """
    suffix = f'{mode}.csv.bz2'

    with os.scandir(folder) as it:
        # entry.path is os.path.join(folder, entry.name)
        paths = sorted(
            entry.path for entry in it
            if entry.name.endswith(suffix) and entry.is_file()
        )

    if not paths:
        # pd.concat([]) would raise ValueError as well, but without
        # saying which folder or suffix was searched.
        raise ValueError(f'No file ending with {suffix!r} found in {folder!r}')

    return pd.concat([processing(path, walk) for walk, path in enumerate(paths)])


def load_all_walks_comments(folder, keep_en):
    """
    Load the comments of all the walks
    present in the given folder
    :param folder: folder containing the walks
    :param keep_en: only keeps english comments
    :return: dataframe containing all the comments, with an extra
    'walk' column holding the index of the walk each row comes from
    """
    # NOTE(review): `nlp` is not defined in the visible part of the
    # notebook; presumably a spaCy pipeline with a language-detection
    # extension exposing `doc._.language` - confirm it is created
    # before this function is called with keep_en=True.
    def is_english(text):
        return nlp(str(text))._.language['language'] == 'en'

    def processing(path, walk):
        df = load_walk(path, 'comments')
        if keep_en:
            df = df.loc[df['text'].apply(is_english), :]
        # assign() builds a new frame instead of writing a column into
        # the filtered selection, which avoids SettingWithCopyWarning.
        return df.assign(walk=walk)

    return load_from_folder(folder, processing, 'comments')

def load_all_walks_tags(folder, keep_en):
    """
    Load the tags of all the walks
    present in the given folder
    :param folder: folder containing the walks
    :param keep_en: only keeps english tags
    :return: dataframe containing all the tags; the 'keywords' column
    holds the kept tags joined by spaces (NaN when none is kept) and
    the 'walk' column holds the index of the walk each row comes from
    """
    # NOTE(review): `nlp` is not defined in the visible part of the
    # notebook; presumably a spaCy pipeline with a language-detection
    # extension exposing `doc._.language` - confirm it is created
    # before this function is called with keep_en=True.
    def is_english(text):
        return nlp(str(text))._.language['language'] == 'en'

    def process(tags):
        # A missing CSV cell is read by pandas as a float NaN, which
        # cannot be sliced: treat anything that is not a string as
        # "no tags".
        if not isinstance(tags, str):
            return np.nan

        # Drop the first and last characters, remove the single quotes
        # and split on ', ' (presumably the cell is the string form of
        # a Python list - verify against the data files).
        candidates = tags[1:-1].replace("'", '').split(', ')

        # Empty items (e.g. coming from an empty list '[]') are skipped
        # so that a row without any tag ends up as NaN, not ''.
        kept = [
            tag for tag in candidates
            if tag and (not keep_en or is_english(tag))
        ]
        return ' '.join(kept) if kept else np.nan

    def processing(path, walk):
        df = load_walk(path, 'infos')
        df['keywords'] = df['keywords'].apply(process)
        df['walk'] = walk
        return df

    return load_from_folder(folder, processing, 'infos')

## Preprocessing d'un commentaire

In [2]:
def preprocess(comment):
    """
    Preprocess a comment and return its tokens:
        - remove the numbers
        - lowercase the first word of each sentence
        - tokenize on runs of word characters
        - remove the stop words and the 1-letter words
        - lowercase the words that are fully uppercase
        - apply a lemmatization
        - remove the stop words and the 1-letter words again
    :param comment: string containing the comment
    :return: list of the processed tokens
    """
    def lower_first_word(sentence):
        # Sentences obtained by splitting on '.' usually start with a
        # space, so the first item of split(' ') can be empty: lowercase
        # the first non-empty item instead of blindly taking item 0.
        words = sentence.split(' ')
        for i, word in enumerate(words):
            if word:
                words[i] = word.lower()
                break
        return ' '.join(words)

    def remove_stopwords(tokens):
        return [t for t in tokens if t not in en_stopwords and len(t) > 1]

    # Remove the numbers
    comment = re.sub(r'\d+', '', comment)

    # Lowercase the first word of each sentence
    comment = ' '.join(lower_first_word(sentence) for sentence in comment.split('.'))

    # Tokenize by word
    words_tokens = nltk.RegexpTokenizer(r'\w+').tokenize(comment)

    # Remove the stop words and the 1-letter words
    words_tokens = remove_stopwords(words_tokens)

    # Lowercase the words written fully in uppercase
    words_tokens = [w.lower() if w.isupper() else w for w in words_tokens]

    # Lemmatization with WordNet
    lemmatizer = WordNetLemmatizer()
    words_tokens = [lemmatizer.lemmatize(w) for w in words_tokens]

    # Second pass: lowercasing and lemmatization may have produced
    # new stop words or 1-letter words
    return remove_stopwords(words_tokens)

In [15]:
def preprocessor(comment):
    """
    Preprocess a comment:
        - remove the numbers
        - lowercase the first word of each sentence
    :param comment: comment to process
    :return: processed comment
    """
    def lower_first_word(sentence):
        # Sentences obtained by splitting on '.' usually start with a
        # space, so the first item of split(' ') can be empty: lowercase
        # the first non-empty item instead of blindly taking item 0.
        words = sentence.split(' ')
        for i, word in enumerate(words):
            if word:
                words[i] = word.lower()
                break
        return ' '.join(words)

    # Remove the numbers
    comment = re.sub(r'\d+', '', comment)

    # Lowercase the first word of each sentence; the '.' separators
    # are replaced by spaces when the sentences are joined back
    return ' '.join(lower_first_word(sentence) for sentence in comment.split('.'))

def tokenizer(comment):
    """
    Tokenize a comment and clean up its tokens:
        - split on runs of word characters
        - drop the stop words and the 1-letter words
        - lowercase the words that are fully uppercase
        - lemmatize with WordNet
        - drop the stop words and the 1-letter words again
    :param comment: comment to process
    :return: list of the tokens of this comment
    """
    def is_kept(token):
        return token not in en_stopwords and len(token) > 1

    # Split by word and apply the first filtering pass
    word_splitter = nltk.RegexpTokenizer(r'\w+')
    tokens = [tok for tok in word_splitter.tokenize(comment) if is_kept(tok)]

    # Words fully in uppercase are lowercased, the others are left as is
    tokens = [tok.lower() if tok.isupper() else tok for tok in tokens]

    # Lemmatize, then filter again on the lemmas
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokens]
    return [lemma for lemma in lemmas if is_kept(lemma)]

## Matrice TF-IDF

In [32]:
# Load data
# Folder scanned for the per-walk files ('*infos.csv.bz2' for the tags)
walks_folder = 'data/P3'
# NOTE(review): `walk_path` is not used in the visible cells - confirm it is still needed
walk_path = 'all_p3'
# Alternative input: the comments of the walks instead of their tags
#data = load_all_walks_comments(walks_folder, True)
# Second argument is keep_en=True: only the tags detected as English are kept
data = load_all_walks_tags(walks_folder, True)

In [None]:
# Drop the rows whose 'keywords' value is NaN (walks without any kept tag).
# `subset` is given as a list: a bare string is only accepted by recent
# pandas versions.
data = data.dropna(subset=['keywords'])
# Corpus handed to the vectorizer: one space-joined keyword string per row
comments = data['keywords'].tolist()

In [17]:
# TF-IDF vectorizer using the custom preprocessing and tokenization
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer)

# Turn the corpus into a sparse document-term matrix.
# The vectorizer must be fitted on the list of documents (`comments`):
# iterating over the `data` dataframe itself yields its column names,
# not its rows.
tfidf_matrix = tfidf_vectorizer.fit_transform(comments)

# Save the matrix
scipy.sparse.save_npz('data/tfidf_comments.npz', tfidf_matrix)

# Save the vocabulary: dict(term: feature index)
with open('data/vocabulary.pickle', 'wb') as f:
    pickle.dump(tfidf_vectorizer.vocabulary_, f)

# Latent Semantic Analysis : SVD de la matrice TF-IDF

In [28]:
def project_SVD(X, dim=2, seed=0):
    """
    Compute a truncated SVD of the TF-IDF matrix, keeping a rank-'dim'
    approximation (i.e. 'dim' is the number of features of the
    embedding).
    :param X: TF-IDF sparse matrix
    :param dim: rank of the truncated SVD
    :param seed: seed for the randomized SVD
    :return: embedding of the documents (left singular vectors scaled
    by the singular values), embedding of the tokens (Vt)
    """
    left_vectors, singular_values, right_vectors_t = randomized_svd(
        X, n_components=dim, random_state=seed
    )

    # Embedding of the documents: each left singular vector is scaled
    # by its singular value
    scaling = np.diag(singular_values)
    documents_embedding = left_vectors.dot(scaling)

    return documents_embedding, right_vectors_t

In [29]:
def print_features_description(Vt, index_map, top_num=5):
    """
    Print, for every feature of the token embedding, the tokens with
    the highest and with the lowest weights.
    :param Vt: embedding of the tokens (one row per feature)
    :param index_map: dict(feature index: term)
    :param top_num: number of tokens to print on each side
    :return: -
    """
    for feature, weights in enumerate(Vt):
        # Token indices ordered by increasing weight
        ranking = np.argsort(weights)
        extremes = (
            ('max', ranking[::-1][:top_num]),
            ('min', ranking[:top_num]),
        )

        for label, token_indices in extremes:
            print(f'\nThe top {top_num} {label} values for feature {feature} are:')
            for token_index in token_indices:
                print(f'{index_map[token_index]:<30} {weights[token_index]:.4f}')

        print("\n")

In [30]:
# Project the TF-IDF matrix onto 5 latent features (seed left at its default of 0):
# X_emb is the embedding of the documents, Vt the embedding of the tokens
X_emb, Vt = project_SVD(tfidf_matrix, dim=5)

In [25]:
# Invert the vocabulary (term -> feature index) into feature index -> term
index_map = {
    feature_index: term
    for term, feature_index in tfidf_vectorizer.vocabulary_.items()
}

In [26]:
# For each latent feature, print the tokens with the highest and lowest weights
# (top_num left at its default of 5)
print_features_description(Vt, index_map)


The top 5 max values for feature 0 are:
Nantucket                      0.1891
sea                            0.1854
go                             0.1742
passenger                      0.1570
though                         0.1550

The top 5 min values for feature 0 are:
zephyr                         0.0270
oriental                       0.0270
outside                        0.0270
difference                     0.0270
palsied                        0.0270



The top 5 max values for feature 1 are:
Nantucket                      0.2433
Euroclydon                     0.2155
tempestuous                    0.1293
thou                           0.1293
window                         0.1293

The top 5 min values for feature 1 are:
sea                            -0.1535
upon                           -0.1285
passenger                      -0.1278
image                          -0.0979
broiled                        -0.0971



The top 5 max values for feature 2 are:
Nantucket                 