# Import necessary dependencies and settings

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alberto.Romero\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Sample corpus of text documents

In [2]:
# Toy corpus: six short sentences, each tagged with a hand-assigned category.
corpus = np.array([
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
])
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']

# One row per document; `columns=` fixes the column order explicitly.
corpus_df = pd.DataFrame(
    {'Document': corpus, 'Category': labels},
    columns=['Document', 'Category'],
)
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animals


# Simple text pre-processing

In [3]:
# Tokenizer instance and English stop-word list used by normalize_document below.
# The stop-word list comes from the NLTK 'stopwords' corpus downloaded in the first cell.
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Clean a single document and strip English stop words.

    The text is stripped of every character that is not a letter, a digit or
    whitespace, lower-cased, trimmed, tokenized with the module-level ``wpt``
    tokenizer and filtered against the module-level ``stop_words`` list.
    Both token lists are printed so the notebook can show each stage.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Remove special characters: "^" negates the class and "\s" keeps whitespace.
    # The flag has to be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so a positional re.I (== 2) would cap the substitution
    # at the first two matches instead of enabling case-insensitive matching.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I)

    # lower case and trim surrounding whitespace
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    print(tokens)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    print(filtered_tokens)
    # re-create document from filtered tokens (join concatenates the iterable into one string)
    doc = ' '.join(filtered_tokens)
    return doc

# Element-wise wrapper so the normalizer can be applied to a whole array of documents.
# Without `otypes`, np.vectorize makes one extra call on the first element to infer
# the output type, so that document's debug prints show up twice.
normalize_corpus = np.vectorize(normalize_document)

Veamos qué hace np.vectorize().

Imaginemos una función que acepta un número y devuelve True o False si el número es o no par

In [4]:
def is_even_single(m):
    """Return True when the scalar ``m`` is even, False otherwise.

    ``bool(...)`` keeps the scalar-only contract: a multi-element NumPy array
    has no single truth value, so passing one still raises an error.
    """
    return bool(m % 2 == 0)

In [5]:
# Let's try the function with a couple of values:

is_even_single(1)

False

In [6]:
is_even_single(4)

True

Esta función acepta un escalar, por lo que intentar usarla con un array NumPy devolverá un error. Pero podemos "vectorizarla" con la función np.vectorize

In [7]:
# Wrap the scalar-only function so it can be applied element-wise to arrays.
is_even = np.vectorize(is_even_single)

In [8]:
# Now it can be used with arrays
m = np.array([1, 2, 3, 4, 5])
is_even(m)

array([False,  True, False,  True, False])

Volvamos a procesamiento de Texto.

In [9]:
corpus

array(['The sky is blue and beautiful.',
       'Love this blue and beautiful sky!',
       'The quick brown fox jumps over the lazy dog.',
       'The brown fox is quick and the blue dog is lazy!',
       'The sky is very blue and the sky is very beautiful today',
       'The dog is lazy but the brown fox is quick!'], dtype='<U56')

In [10]:
# Normalize a single sentence to inspect the tokens before and after stop-word removal.
frase = 'The sky is blue and beautiful.'
normalize_document(frase)

['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']


'sky blue beautiful'

In [11]:
# Normalize every document in the corpus with the vectorized wrapper.
norm_corpus = normalize_corpus(corpus)
norm_corpus

['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']
['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']
['love', 'this', 'blue', 'and', 'beautiful', 'sky']
['love', 'blue', 'beautiful', 'sky']
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
['the', 'brown', 'fox', 'is', 'quick', 'and', 'the', 'blue', 'dog', 'is', 'lazy']
['brown', 'fox', 'quick', 'blue', 'dog', 'lazy']
['the', 'sky', 'is', 'very', 'blue', 'and', 'the', 'sky', 'is', 'very', 'beautiful', 'today']
['sky', 'blue', 'sky', 'beautiful', 'today']
['the', 'dog', 'is', 'lazy', 'but', 'the', 'brown', 'fox', 'is', 'quick']
['dog', 'lazy', 'brown', 'fox', 'quick']


array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

In [12]:
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

# Bag of Words Model

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# min_df / max_df are the document-frequency cut-offs for the vocabulary;
# with these values every term is kept.
cv = CountVectorizer(min_df=0., max_df=1.)

# Fit the vocabulary and build the dense document-term count matrix in one step.
cv_matrix = cv.fit_transform(norm_corpus).toarray()
cv_matrix

# 'love' is the eighth column

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [14]:
# Column names for the count matrix. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() (available since 1.0) replaces it,
# so prefer it and fall back only on older installations.
if hasattr(cv, 'get_feature_names_out'):
    vocab = list(cv.get_feature_names_out())
else:
    vocab = cv.get_feature_names()
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,1,1,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,1,0,1,0
2,0,0,1,1,1,1,1,0,1,0,0
3,0,1,1,1,1,0,1,0,1,0,0
4,1,1,0,0,0,0,0,0,0,2,1
5,0,0,1,1,1,0,1,0,1,0,0


# Bag of N-Grams Model

In [15]:
# Count only bigrams (pairs of consecutive tokens) in the normalized corpus.
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it, so prefer it and fall back on older versions.
if hasattr(bv, 'get_feature_names_out'):
    vocab = list(bv.get_feature_names_out())
else:
    vocab = bv.get_feature_names()
bigramas = pd.DataFrame(bv_matrix, columns=vocab)
bigramas.shape



(6, 17)

In [16]:
bigramas

Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog lazy,fox jumps,fox quick,jumps lazy,lazy brown,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0
3,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
5,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0


# TF-IDF Model

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the normalized corpus, keeping every term (no frequency cut-offs).
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it, so prefer it and fall back on older versions.
if hasattr(tv, 'get_feature_names_out'):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)


# TF-IDF = term frequency x inverse document frequency: roughly, how many times the word
# appears x log(N / number of documents that contain it)

# In class:
# if we search for 'fox' in Google, which document comes back?
# document 5
# And 'fox sky'? Document 4, because no document contains both terms at once
# And 'love sky'? Document 4 or document 1? It depends; the scheme is designed around a single term

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0
1,0.46,0.39,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.46,0.0
2,0.0,0.0,0.38,0.38,0.38,0.54,0.38,0.0,0.38,0.0,0.0
3,0.0,0.36,0.42,0.42,0.42,0.0,0.42,0.0,0.42,0.0,0.0
4,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.52
5,0.0,0.0,0.45,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.0


# Document Similarity

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

# Cosine of the angle between the vectors that represent the 6 sentences in the 11 dimensions.
# Sentences 3 and 5 are the closest pair (~0.93).
# Sentences 3 and 4 have the lowest non-zero similarity (~0.11); pairs that share
# no vocabulary at all (e.g. 0 and 2) score exactly 0 and are the farthest apart.


Unnamed: 0,0,1,2,3,4,5
0,1.0,0.753128,0.0,0.185447,0.807539,0.0
1,0.753128,1.0,0.0,0.139665,0.608181,0.0
2,0.0,0.0,1.0,0.784362,0.0,0.839987
3,0.185447,0.139665,0.784362,1.0,0.109653,0.933779
4,0.807539,0.608181,0.0,0.109653,1.0,0.0
5,0.0,0.0,0.839987,0.933779,0.0,1.0


## Clustering documents using similarity features

In [19]:
from sklearn.cluster import KMeans

# Cluster the documents using their rows of the cosine-similarity matrix as features.
# random_state pins the centroid initialisation so a re-run reproduces the same labels.
km = KMeans(n_clusters=2, random_state=42)
# Only the fitted labels are needed, so fit() is enough (the distances returned
# by fit_transform() were being discarded).
km.fit(similarity_df)
cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

# The documents were correctly split into 2 groups that match the original category.


Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1


# Topic models

In [20]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 2-topic LDA model and get the per-document topic weights.
# NOTE(review): the model is fitted on the TF-IDF matrix; LDA is usually fed raw
# term counts (e.g. cv_matrix) — confirm this is intended.
lda = LatentDirichletAllocation(n_components=2, max_iter=100, random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features

# The two topic weights of each document sum to 1
# The dominant topic gives a single class per document
# What is interesting is the proportion of each topic


Unnamed: 0,T1,T2
0,0.190548,0.809452
1,0.176804,0.823196
2,0.846184,0.153816
3,0.814863,0.185137
4,0.180516,0.819484
5,0.839172,0.160828


## Show topics and their weights

In [21]:
# Topic-term weights: one row per topic, one column per vocabulary entry.
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    # Pair every term with its weight and rank by decreasing weight
    # (the sort is stable, so ties keep vocabulary order).
    ranked = sorted(zip(vocab, topic_weights), key=lambda pair: pair[1], reverse=True)
    # Keep only the terms whose weight is above 0.6.
    topic = [(token, weight) for token, weight in ranked if weight > 0.6]
    print(topic)
    print()

# Prints the vocabulary that belongs to each topic.


[('brown', 1.7273638692668465), ('dog', 1.7273638692668465), ('fox', 1.7273638692668465), ('lazy', 1.7273638692668465), ('quick', 1.7273638692668465), ('jumps', 1.0328325272484777), ('blue', 0.7731573162915626)]

[('sky', 2.264386643135622), ('beautiful', 1.9068269319456903), ('blue', 1.7996282104933266), ('love', 1.148127242397004), ('today', 1.0068251160429935)]



## Clustering documents using topic model features

In [22]:
# Cluster the documents using the LDA topic proportions as features,
# instead of the cosine-similarity rows used earlier.
# random_state pins the centroid initialisation so a re-run reproduces the same labels.
km = KMeans(n_clusters=2, random_state=42)
# Only the fitted labels are needed, so fit() is enough (the distances returned
# by fit_transform() were being discarded).
km.fit(features)
cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)


Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,1
1,Love this blue and beautiful sky!,weather,1
2,The quick brown fox jumps over the lazy dog.,animals,0
3,The brown fox is quick and the blue dog is lazy!,animals,0
4,The sky is very blue and the sky is very beaut...,weather,1
5,The dog is lazy but the brown fox is quick!,animals,0


# Word Embeddings

In [23]:
# Now map words to dense vectors (word embeddings).

import gensim
from gensim.models import word2vec

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Set values for various parameters
feature_size = 10    # Word vector dimensionality
window_context = 10  # Context window size
min_word_count = 1   # Minimum word count
sample = 1e-3        # Downsample setting for frequent words

# gensim 4.0 renamed the dimensionality keyword from `size` to `vector_size`;
# pick whichever name the installed version accepts so the cell runs on both.
size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

w2v_model = word2vec.Word2Vec(tokenized_corpus,
                              window=window_context, min_count=min_word_count,
                              sample=sample, **{size_kwarg: feature_size})

size: The number of dimensions of the embeddings; the default is 100. (gensim ≥ 4.0 renamed this parameter to `vector_size`.)

window: The maximum distance between a target word and words around the target word. The default window is 5.

min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.

In [24]:
w2v_model.wv['sky']
# word -> vector lookup

array([ 0.02958306,  0.01569223, -0.0397306 , -0.04635359,  0.02098968,
        0.03433336,  0.04283854, -0.00582395, -0.02875275, -0.03310678],
      dtype=float32)

In [25]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the vectors of the known words in one tokenized document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : mapping
        Object indexable by word (``model[word]``) that yields the word vector.
    vocabulary : container of str
        Words that have a vector; any other token is skipped.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``; all zeros when none of the
        words is in ``vocabulary``.
    """
    known_words = [word for word in words if word in vocabulary]

    # Accumulate in float64, one vector at a time.
    total = np.zeros((num_features,), dtype="float64")
    for word in known_words:
        total = np.add(total, model[word])

    if not known_words:
        return total
    return np.divide(total, float(len(known_words)))
    
   
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word-vector feature row per tokenized document.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenized documents.
    model : trained Word2Vec model
        Its ``wv`` attribute supplies the vocabulary and the word vectors.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        Array with one row per document, as produced by ``average_word_vectors``.
    """
    keyed_vectors = model.wv
    # gensim 4.0 renamed KeyedVectors.index2word to index_to_key; support both.
    if hasattr(keyed_vectors, 'index_to_key'):
        vocabulary = set(keyed_vectors.index_to_key)
    else:
        vocabulary = set(keyed_vectors.index2word)
    features = [average_word_vectors(tokenized_sentence, keyed_vectors, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [26]:
# Document-level features: the average of the word vectors of each tokenized document.
w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,
                                             num_features=feature_size)
pd.DataFrame(w2v_feature_array)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.00544,0.03373,-0.025557,-0.02425,-0.015972,0.008536,0.01559,-0.012016,-0.018383,0.005451
1,-0.004103,0.035245,-0.027572,-0.02528,-0.006564,0.013527,0.015336,-0.004456,-0.010757,-0.001686
2,-0.003408,-0.008352,0.009903,-0.006879,-0.008577,0.002692,-0.000827,-0.011865,0.00809,0.022592
3,-0.009232,0.007042,0.006373,-0.019641,-0.008787,-0.001793,0.003127,-0.015953,-0.0017,0.026942
4,0.00799,0.025386,-0.027527,-0.032535,-0.009221,0.009717,0.024209,-0.012022,-0.016781,-0.003471
5,-0.008839,-0.000949,0.0118,-0.017105,-0.003797,-0.000184,-0.00299,-0.012817,0.003532,0.024598


In [27]:
from sklearn.cluster import AffinityPropagation

# With AffinityPropagation the number of clusters is not specified; it finds it on its own.

ap = AffinityPropagation().fit(w2v_feature_array)
cluster_labels = pd.DataFrame(ap.labels_, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1
