In [1]:
import pickle

import nltk
import numpy as np
import pandas as pd
import re

import scipy.sparse
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd

en_stopwords = set(stopwords.words('english'))
en_stopwords.update([s.capitalize() for s in stopwords.words('english')])

# Comments preprocessing pipeline

## Comment preprocessing

In [2]:
# Complete preprocessing
def preprocess(comment):
    """
    Preprocess a comment :
        - remove numbers
        - lower first letter of each sentence
        - remove stopwords and words of 1 letter
        - lower the words entirely in capital
        - lemmatize
        - remove again stopwords and words of 1 letter
    :param comment: string containing the comment
    :return: list of the tokens
    """
    # Remove numbers
    comment = re.sub(r'\d+', '', comment)

    # Lower first letter of each sentence
    lower_first_word = lambda tab: ' '.join(tab[0].lower() + tab[1:])
    comment = ' '.join([lower_first_word(sentence.split(' ')) for sentence in comment.split('.')])

    # Tokenize by word
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    words_tokens = tokenizer.tokenize(comment)

    # Remove stopwords and words of length 1
    remove_stopwords = lambda wts: [w for w in wts if (not w in en_stopwords) and len(w) > 1]
    words_tokens = remove_stopwords(words_tokens)

    # Lower capital words
    for i in range(len(words_tokens)):
        if words_tokens[i].isupper():
            words_tokens[i] = words_tokens[i].lower()

    # Lemmatization with WordNet
    lemmatizer = WordNetLemmatizer()
    words_tokens = [lemmatizer.lemmatize(wt) for wt in words_tokens]

    # Remove stopwords and words of length 1
    words_tokens = remove_stopwords(words_tokens)

    return words_tokens

In [3]:
# Preprocessor and tokenizer to be used directly in TfidfVectorizer
def preprocessor(comment):
    """
    Preprocess a comment:
        - remove numbers
        - lower first letter of each sentence
    :param comment: string containing the comment
    :return: string containing the preprocessed comment
    """
    # Remove numbers
    comment = re.sub(r'\d+', '', comment)

    # Lower first letter of each sentence
    lower_first_word = lambda tab: ' '.join([tab[0].lower()] + tab[1:])
    comment = ' '.join([lower_first_word(sentence.split(' ')) for sentence in comment.split('.')])

    return comment

def tokenizer(comment):
    """
    Tokenize and process the tokens of a comment :
        - tokenize the comment by word
        - remove stopwords and words of 1 letter
        - lower the words entirely in capital
        - lemmatize
        - remove stopwords and words of 1 letter
    :param comment: string containing the comment
    :return: list of tokens for this comment
    """
    # Tokenize by mot
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    words_tokens = tokenizer.tokenize(comment)

    # Remove stopwords and words of lenght 1
    remove_stopwords = lambda wts: [w for w in wts if (not w in en_stopwords) and len(w) > 1]
    words_tokens = remove_stopwords(words_tokens)

    # Lower capital words
    for i in range(len(words_tokens)):
        if words_tokens[i].isupper():
            words_tokens[i] = words_tokens[i].lower()

    # Lemmatization with WordNet
    lemmatizer = WordNetLemmatizer()
    words_tokens = [lemmatizer.lemmatize(wt) for wt in words_tokens]

    # Remove stopwords and words of lenght 1
    words_tokens = remove_stopwords(words_tokens)

    return words_tokens

## TF-IDF Matrix

In [4]:
# Load the data
data = []

In [5]:
# Vectorizer for TF-IDF
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessor, tokenizer=tokenizer)

# Transform the comments corpus
# into a sparse TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(data)

# Save the matrix
scipy.sparse.save_npz('data/tfidf_comments.npz', tfidf_matrix)

# Save the vocabulary : dict(term: feature index)
with open('data/vocabulary.pickle', 'wb') as f:
    pickle.dump(tfidf_vectorizer.vocabulary_, f)

ValueError: empty vocabulary; perhaps the documents only contain stop words

# Latent Semantic Analyis : SVD of the TF-IDF matrix

In [6]:
def project_SVD(X, dim=2, seed=0):
    """
    Compute the truncated SVD of the matrix X, keeping
    an approximation of rank dim (i.e. dim features
    in the embedding)
    :param X: TF-IDF sparse matrix
    :param dim: rank of the truncated SVD
    :param seed: seed for the random SVD
    :return: Documents embedding, tokens embedding
    """
    # Compute the truncated SVD
    U, sigmas, Vt = randomized_svd(X, n_components=dim, random_state=seed)

    # Tokens embeddings
    X_emb = U @ np.diag(sigmas)

    return X_emb, Vt

In [7]:
def print_features_description(Vt, index_map, top_num=5):
    """
    Print the most important tokens for each
    features of the tokens embeddings.
    :param Vt: tokens embedding
    :param index_map: dict(feature index: terme)
    :param top_num: number of tokens to print
    :return: -
    """
    for i in range(Vt.shape[0]):
        sord_idx = np.argsort(Vt[i])
        top_min_idx = sord_idx[:top_num]
        top_max_idx = sord_idx[::-1][:top_num]
        print(f'\nThe top {top_num} max values for feature {i} are:')
        for index in top_max_idx:
            print(f'{index_map[index]:<30} {Vt[i,index]:.4f}')

        print(f'\nThe top {top_num} min values for feature {i} are:')
        for index in top_min_idx:
            print(f'{index_map[index]:<30} {Vt[i,index]:.4f}')

        print("\n")

In [30]:
X_emb, Vt = project_SVD(tfidf_matrix, dim=5)

In [25]:
index_map = {v: k for k, v in tfidf_vectorizer.vocabulary_.items()}

In [26]:
print_features_description(Vt, index_map)


The top 5 max values for feature 0 are:
Nantucket                      0.1891
sea                            0.1854
go                             0.1742
passenger                      0.1570
though                         0.1550

The top 5 min values for feature 0 are:
zephyr                         0.0270
oriental                       0.0270
outside                        0.0270
difference                     0.0270
palsied                        0.0270



The top 5 max values for feature 1 are:
Nantucket                      0.2433
Euroclydon                     0.2155
tempestuous                    0.1293
thou                           0.1293
window                         0.1293

The top 5 min values for feature 1 are:
sea                            -0.1535
upon                           -0.1285
passenger                      -0.1278
image                          -0.0979
broiled                        -0.0971



The top 5 max values for feature 2 are:
Nantucket                 