In [3]:
import pickle

import nltk
import numpy as np
import pandas as pd
import re
from helper_data import *
from helper_preprocess import *
import scipy.sparse
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd

# English stop words, each kept in its original form and in its
# capitalized form (first letter upper-case, rest lower-case).
en_stopwords = {
    variant
    for word in stopwords.words('english')
    for variant in (word, word.capitalize())
}

# Pipeline de preprocessing des commentaires

Code dans helper_preprocess.py

## Matrice TF-IDF

In [32]:
# Load data
# Folder handed to the loader below.
walks_folder = 'data/P3'
# NOTE(review): walk_path is not read anywhere in the visible part of the notebook.
walk_path = 'all_p3'
# Disabled alternative loader (presumably loads the comments rather than the tags — confirm in helper_data).
#data = load_all_walks_comments(walks_folder, True)
# load_all_walks_tags is presumably provided by one of the helper star imports;
# the meaning of the second argument (True) is not visible here — TODO confirm.
# Later cells treat `data` as a DataFrame with a 'keywords' column.
data = load_all_walks_tags(walks_folder, True)

In [None]:
# Drop the rows whose 'keywords' entry is missing (NaN).
# `subset` is given as a list: a bare string label is only accepted by
# recent pandas releases, whereas a list of labels works on every version.
data = data.dropna(subset=['keywords'])
# Plain Python list of the remaining keyword entries: this is the corpus
# (one document per row) that the TF-IDF vectorizer is fitted on.
comments = data['keywords'].tolist()

In [17]:
# TF-IDF vectorizer using the project's own `preprocessor` and `tokenizer`
# callables (presumably defined in helper_preprocess — confirm).
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer)

# Turn the corpus into a sparse TF-IDF matrix (one row per document).
# The vectorizer must be fed the list of documents (`comments`), not the
# DataFrame `data`: iterating over a DataFrame yields its column labels,
# so the matrix would be built from the column names instead of the texts.
tfidf_matrix = tfidf_vectorizer.fit_transform(comments)

# Save the matrix
scipy.sparse.save_npz('data/tfidf_comments.npz', tfidf_matrix)

# Save the vocabulary: dict(term: feature index)
with open('data/vocabulary.pickle', 'wb') as f:
    pickle.dump(tfidf_vectorizer.vocabulary_, f)

# Latent Semantic Analysis : SVD de la matrice TF-IDF

In [28]:
def project_SVD(X, dim=2, seed=0):
    """
    Compute a truncated SVD of the TF-IDF matrix, keeping a rank-'dim'
    approximation ('dim' is the number of features of the embedding).
    :param X: sparse TF-IDF matrix
    :param dim: rank of the truncated SVD
    :param seed: seed for the randomized SVD
    :return: document embedding, token embedding
    """
    # Randomized truncated SVD: X ~ U * diag(sigmas) * Vt
    left_vectors, singular_values, token_components = randomized_svd(
        X, n_components=dim, random_state=seed
    )

    # Document embedding: left singular vectors scaled by the singular values
    scaling = np.diag(singular_values)
    document_embedding = left_vectors @ scaling

    # The right singular vectors are returned untouched as the token embedding
    return document_embedding, token_components

In [29]:
def print_features_description(Vt, index_map, top_num=5):
    """
    Print, for every feature (row) of the token embedding, the tokens
    with the largest and the smallest values.
    :param Vt: token embedding, indexed as Vt[feature, token index]
    :param index_map: dict(token index: term)
    :param top_num: number of tokens to print per ranking
    :return: -
    """
    def print_rows(weights, token_ids):
        # One line per token: term left-aligned on 30 columns, then its value
        for token_id in token_ids:
            print(f'{index_map[token_id]:<30} {weights[token_id]:.4f}')

    for i in range(Vt.shape[0]):
        weights = Vt[i]
        # Token indices ordered from the smallest to the largest value
        ascending = np.argsort(weights)

        print(f'\nThe top {top_num} max values for feature {i} are:')
        print_rows(weights, ascending[::-1][:top_num])

        print(f'\nThe top {top_num} min values for feature {i} are:')
        print_rows(weights, ascending[:top_num])

        print("\n")

In [30]:
# Project the TF-IDF matrix onto a 5-dimensional latent space:
# X_emb is the document embedding, Vt the token embedding (see project_SVD).
X_emb, Vt = project_SVD(tfidf_matrix, dim=5)

In [25]:
# Invert the fitted vocabulary (term -> feature index) into feature index -> term
index_map = {position: term for term, position in tfidf_vectorizer.vocabulary_.items()}

In [26]:
# Print, for each latent feature, the tokens with the highest and lowest values
print_features_description(Vt, index_map)


The top 5 max values for feature 0 are:
Nantucket                      0.1891
sea                            0.1854
go                             0.1742
passenger                      0.1570
though                         0.1550

The top 5 min values for feature 0 are:
zephyr                         0.0270
oriental                       0.0270
outside                        0.0270
difference                     0.0270
palsied                        0.0270



The top 5 max values for feature 1 are:
Nantucket                      0.2433
Euroclydon                     0.2155
tempestuous                    0.1293
thou                           0.1293
window                         0.1293

The top 5 min values for feature 1 are:
sea                            -0.1535
upon                           -0.1285
passenger                      -0.1278
image                          -0.0979
broiled                        -0.0971



The top 5 max values for feature 2 are:
Nantucket                 