# Import necessary dependencies and settings

In [1]:
import pandas as pd 
import numpy as np
import re
import nltk 

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pilar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Sample corpus of text documents

In [2]:
# Hand-assigned topic for each of the six toy sentences below (same order).
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']

# The documents are stored directly as a NumPy string array.
corpus = np.array([
    'The sky is blue and beautiful.',
    'Love this blue and beautiful sky!',
    'The quick brown fox jumps over the lazy dog.',
    'The brown fox is quick and the blue dog is lazy!',
    'The sky is very blue and the sky is very beautiful today',
    'The dog is lazy but the brown fox is quick!',
])

# One row per document: the raw text next to its category.
corpus_df = pd.DataFrame({'Document': corpus, 'Category': labels})
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animals


# Simple text pre-processing

In [3]:
# Module-level tokenizer and English stop-word list; both are read by
# normalize_document, so they must be defined before it is called.
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Normalize one raw document into a cleaned, stop-word-free string.

    Steps: remove every character that is not an ASCII letter, a digit or
    whitespace; lower-case; trim surrounding whitespace; tokenize with the
    module-level ``wpt`` tokenizer; drop tokens found in the module-level
    ``stop_words`` collection; re-join the surviving tokens with spaces.

    The token lists before and after stop-word filtering are printed so the
    effect of each step can be followed in the notebook output.

    Args:
        doc: The raw document text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Remove special characters everywhere in the document.
    # re.sub's fourth positional parameter is `count`, not `flags`, so a flag
    # must never be passed positionally (re.I == 2 would cap the number of
    # replacements at two). No flag is needed: the character class already
    # lists both upper- and lower-case letters.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)

    doc = doc.lower().strip()

    # Tokenize the cleaned document.
    tokens = wpt.tokenize(doc)
    print(tokens)

    # Filter stop words out of the token list.
    filtered_tokens = [token for token in tokens if token not in stop_words]
    print(filtered_tokens)

    # Re-create the document from the filtered tokens.
    return ' '.join(filtered_tokens)
    

In [4]:
normalize_corpus = np.vectorize(normalize_document)

Veamos qué hace np.vectorize().

Imaginemos una función que acepta un número y devuelve True o False si el número es o no par

In [5]:
def is_even_single(n):
    """Return True when the scalar ``n`` is divisible by 2, otherwise False."""
    return bool(n % 2 == 0)

In [6]:
# Probemos la función con un par de valores:
is_even_single(4)


True

In [7]:
is_even_single(7)

False

Esta función acepta un escalar, por lo que intentar usarla con un array NumPy devolverá un error. Pero podemos "vectorizarla" con la función np.vectorize

In [8]:
is_even = np.vectorize(is_even_single)

In [9]:
# Ahora ya es posible usarla con arrays
arr = np.array([1,2,3,5,7,10,11])

In [10]:
is_even(arr)

array([False,  True, False, False, False,  True, False])

Volvamos a procesamiento de Texto.

In [11]:
corpus

array(['The sky is blue and beautiful.',
       'Love this blue and beautiful sky!',
       'The quick brown fox jumps over the lazy dog.',
       'The brown fox is quick and the blue dog is lazy!',
       'The sky is very blue and the sky is very beautiful today',
       'The dog is lazy but the brown fox is quick!'], dtype='<U56')

In [12]:
frase = 'The sky is blue and beautiful.'
#Que hace la función 'normalize_document(doc)'
normalize_document(frase)

['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']


'sky blue beautiful'

In [16]:
# Normalize every document; np.vectorize applies normalize_document element-wise.
# NOTE: when no `otypes` is given, np.vectorize calls the function one extra
# time on the first element to infer the output dtype, which is why the first
# document's tokens are printed twice in the output.
norm_corpus = normalize_corpus(corpus_df['Document'])
# Same normalization applied to the NumPy array directly; the result is only
# displayed, not stored.
normalize_corpus(corpus)

['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']
['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']
['love', 'this', 'blue', 'and', 'beautiful', 'sky']
['love', 'blue', 'beautiful', 'sky']
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
['the', 'brown', 'fox', 'is', 'quick', 'and', 'the', 'blue', 'dog', 'is', 'lazy']
['brown', 'fox', 'quick', 'blue', 'dog', 'lazy']
['the', 'sky', 'is', 'very', 'blue', 'and', 'the', 'sky', 'is', 'very', 'beautiful', 'today']
['sky', 'blue', 'sky', 'beautiful', 'today']
['the', 'dog', 'is', 'lazy', 'but', 'the', 'brown', 'fox', 'is', 'quick']
['dog', 'lazy', 'brown', 'fox', 'quick']
['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']
['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']
['love', 'this', 'blue', 'and', 'beautiful', 'sky']
['love', 'blue', 'beautiful', 'sky']
['t

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

In [17]:
normalize_corpus(corpus_df['Document'])

['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']
['the', 'sky', 'is', 'blue', 'and', 'beautiful']
['sky', 'blue', 'beautiful']
['love', 'this', 'blue', 'and', 'beautiful', 'sky']
['love', 'blue', 'beautiful', 'sky']
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
['the', 'brown', 'fox', 'is', 'quick', 'and', 'the', 'blue', 'dog', 'is', 'lazy']
['brown', 'fox', 'quick', 'blue', 'dog', 'lazy']
['the', 'sky', 'is', 'very', 'blue', 'and', 'the', 'sky', 'is', 'very', 'beautiful', 'today']
['sky', 'blue', 'sky', 'beautiful', 'today']
['the', 'dog', 'is', 'lazy', 'but', 'the', 'brown', 'fox', 'is', 'quick']
['dog', 'lazy', 'brown', 'fox', 'quick']


array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

In [18]:
norm_corpus

array(['sky blue beautiful', 'love blue beautiful sky',
       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',
       'sky blue sky beautiful today', 'dog lazy brown fox quick'],
      dtype='<U30')

# Bag of Words Model

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0., max_df=1.) 
# min_df: terms whose document frequency is below this fraction of the documents are discarded (0. keeps every term)
# max_df: terms whose document frequency is above this fraction of the documents are discarded (1. keeps every term)
# Learn the vocabulary from the normalized corpus and count each term per document.
cv_matrix = cv.fit_transform(norm_corpus)
# Convert the result to a dense NumPy array so it can be displayed.
cv_matrix = cv_matrix.toarray()
cv_matrix

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1],
       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [20]:
# Label each count column with the term it represents.
# NOTE(review): get_feature_names() was replaced by get_feature_names_out() in
# newer scikit-learn releases -- confirm against the installed version.
vocabulary = cv.get_feature_names()
pd.DataFrame(cv_matrix,columns=vocabulary)

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,1,1,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,1,0,1,0
2,0,0,1,1,1,1,1,0,1,0,0
3,0,1,1,1,1,0,1,0,1,0,0
4,1,1,0,0,0,0,0,0,0,2,1
5,0,0,1,1,1,0,1,0,1,0,0


# Bag of N-Grams Model

In [21]:
# ngram_range=(2,2) makes the vectorizer count two-word sequences (bigrams) only.
bv = CountVectorizer(ngram_range=(2,2))
bv_matrix = bv.fit_transform(norm_corpus)
bv_matrix = bv_matrix.toarray()
# NOTE: this rebinds the notebook-level `vocabulary` to the bigram vocabulary.
vocabulary = bv.get_feature_names()
bigramas = pd.DataFrame(bv_matrix, columns=vocabulary)
# (number of documents, number of distinct bigrams)
bigramas.shape

(6, 17)

In [22]:
bigramas

Unnamed: 0,beautiful sky,beautiful today,blue beautiful,blue dog,blue sky,brown fox,dog lazy,fox jumps,fox quick,jumps lazy,lazy brown,lazy dog,love blue,quick blue,quick brown,sky beautiful,sky blue
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0
3,0,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1
5,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0


# TF-IDF Model

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Fit a TF-IDF model on the normalized corpus and densify the result.
tv =  TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix = tv_matrix.toarray()

# NOTE: rebinds the notebook-level `vocabulary` to the unigram TF-IDF vocabulary;
# later cells that read `vocabulary` rely on this being the most recent assignment.
vocabulary = tv.get_feature_names()

pd.DataFrame(np.round(tv_matrix, 2), columns=vocabulary)

# TF-IDF = term frequency * inverse document frequency
#        = (share of the document taken up by the term) * log(N / number of documents containing the term)
# NOTE(review): scikit-learn's defaults also smooth the IDF and normalise each row,
# so the values shown will not match this textbook formula exactly -- see the TfidfVectorizer docs.

Unnamed: 0,beautiful,blue,brown,dog,fox,jumps,lazy,love,quick,sky,today
0,0.6,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0
1,0.46,0.39,0.0,0.0,0.0,0.0,0.0,0.66,0.0,0.46,0.0
2,0.0,0.0,0.38,0.38,0.38,0.54,0.38,0.0,0.38,0.0,0.0
3,0.0,0.36,0.42,0.42,0.42,0.0,0.42,0.0,0.42,0.0,0.0
4,0.36,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.52
5,0.0,0.0,0.45,0.45,0.45,0.0,0.45,0.0,0.45,0.0,0.0


# Document Similarity

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF row vectors of all documents;
# entry (i, j) compares document i with document j.
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.753128,0.0,0.185447,0.807539,0.0
1,0.753128,1.0,0.0,0.139665,0.608181,0.0
2,0.0,0.0,1.0,0.784362,0.0,0.839987
3,0.185447,0.139665,0.784362,1.0,0.109653,0.933779
4,0.807539,0.608181,0.0,0.109653,1.0,0.0
5,0.0,0.0,0.839987,0.933779,0.0,1.0


## Clustering documents using similarity features

In [29]:
from sklearn.cluster import KMeans

# Cluster the documents, using each row of the similarity matrix as that
# document's feature vector.
# random_state pins the centroid initialisation so the cluster ids are the
# same on every run (same seed as the LDA model); n_init is set explicitly
# because its default differs between scikit-learn releases.
km = KMeans(n_clusters=2, n_init=10, random_state=42)

# Only the fitted labels are needed, so fit() is enough; the distance matrix
# returned by fit_transform() was never used.
km.fit(similarity_df)

cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])

# Show each document and its hand-assigned category next to its cluster id.
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1


# Topic models

In [31]:
from sklearn.decomposition import LatentDirichletAllocation

# Two-topic LDA model with a fixed seed.
lda = LatentDirichletAllocation(n_components=2, random_state=42)

# NOTE(review): the model is fitted on the TF-IDF matrix; LDA is usually fitted
# on raw term counts (cv_matrix) -- confirm this choice is intentional.
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1','T2'])
features


# The topic weights in each row sum to 1.
# The topic with the larger weight is the one the document fits best.


Unnamed: 0,T1,T2
0,0.190548,0.809452
1,0.176804,0.823196
2,0.846184,0.153816
3,0.814862,0.185138
4,0.180516,0.819484
5,0.839172,0.160828


In [None]:
# The LatentDirichletAllocation object assigns each document a weight for topic 1 (T1) and topic 2 (T2).



# Document 0, for example, fits best in topic 2 (T2 = 0.81 versus T1 = 0.19).

## Show topics and their weights

In [34]:
# Print, for every topic, the vocabulary terms that belong to it.
# Each row of lda.components_ holds one weight per vocabulary term.
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    # Pair every term with its weight and rank the pairs by decreasing weight.
    topic = sorted(zip(vocabulary, topic_weights), key=lambda pair: -pair[1])
    # Keep only the terms whose weight is above 0.6.
    topic = list(filter(lambda pair: pair[1] > 0.6, topic))
    print(topic)
    print()

[('fox', 1.7273626327189195), ('quick', 1.7273626327187892), ('dog', 1.7273626327142786), ('brown', 1.7273626327126588), ('lazy', 1.7273626327117604), ('jumps', 1.0328362023173896), ('blue', 0.773150134888545)]

[('sky', 2.264386986885355), ('beautiful', 1.9068272074987158), ('blue', 1.7996353918963441), ('love', 1.1481276316057776), ('today', 1.006825656244487)]



## Clustering documents using topic model features

In [35]:
# Cluster the documents using their LDA topic weights (T1, T2) as features.
# random_state pins the centroid initialisation so the cluster ids are the
# same on every run; n_init is set explicitly because its default differs
# between scikit-learn releases.
km = KMeans(n_clusters=2, n_init=10, random_state=42)

# Only the fitted labels are needed, so fit() is enough; the distance matrix
# returned by fit_transform() was never used.
km.fit(features)

cluster_labels = pd.DataFrame(km.labels_, columns=['ClusterLabel'])

# Show each document and its hand-assigned category next to its cluster id.
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,1
1,Love this blue and beautiful sky!,weather,1
2,The quick brown fox jumps over the lazy dog.,animals,0
3,The brown fox is quick and the blue dog is lazy!,animals,0
4,The sky is very blue and the sky is very beaut...,weather,1
5,The dog is lazy but the brown fox is quick!,animals,0


# Word Embeddings

In [40]:
# Now map words to vectors (word embeddings).

from gensim.models import word2vec

wpt = nltk.WordPunctTokenizer()

# Word2Vec is trained on token lists, so split each normalized document into tokens.
tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]

# Set values for various parameters
feature_size = 10 # word vector dimensionality
windon_context = 10 # context window size
min_word_count = 1 # minimum word count
sample = 1e-3 # downsample setting for frequent words

# NOTE(review): `size` looks like the gensim 3.x keyword (gensim 4 uses
# `vector_size`) -- confirm against the installed gensim version.
w2v_model = word2vec.Word2Vec(tokenized_corpus, size= feature_size, window = windon_context, min_count = min_word_count, sample= sample)

size: The number of dimensions of the embeddings and the default is 100.

window: The maximum distance between a target word and words around the target word. The default window is 5.

min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.

In [42]:
w2v_model.wv['sky']
# de palabra a vector

array([ 0.0116524 , -0.01631303, -0.04014484,  0.04154879, -0.00699574,
       -0.04670414,  0.02437033, -0.0028849 , -0.01926388,  0.01163094],
      dtype=float32)

In [43]:

def average_word_vectors(words, model, vocabulary, num_features):
    """Average the vectors of the words that are present in ``vocabulary``.

    Args:
        words: Iterable of tokens making up one document.
        model: Mapping-like object; ``model[word]`` yields that word's vector.
        vocabulary: Container of the words for which ``model`` has a vector.
        num_features: Length of every word vector.

    Returns:
        numpy.ndarray: float64 vector of length ``num_features`` holding the
        mean of the known words' vectors, or all zeros when no word is known.
    """
    known_words = [token for token in words if token in vocabulary]

    # Accumulate in float64, one vector at a time, starting from zeros.
    total = np.zeros((num_features,), dtype="float64")
    for token in known_words:
        total = total + model[token]

    # Leave the zero vector untouched when there is nothing to average.
    if not known_words:
        return total
    return total / float(len(known_words))
    
   
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized document.

    Args:
        corpus: Iterable of token lists, one per document.
        model: Trained model exposing its word vectors as ``model.wv``.
        num_features: Length of every word vector.

    Returns:
        numpy.ndarray: Array with one row per document, as produced by
        ``average_word_vectors``.
    """
    # Set of words known to the model, used for fast membership checks.
    known_words = set(model.wv.index2word)

    document_vectors = []
    for tokenized_sentence in corpus:
        document_vectors.append(
            average_word_vectors(tokenized_sentence, model.wv, known_words, num_features)
        )
    return np.array(document_vectors)


In [44]:
w2v_feature_array = averaged_word_vectorizer(corpus= tokenized_corpus, model= w2v_model, num_features=feature_size)

pd.DataFrame(w2v_feature_array)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.000613,0.012141,-0.024237,0.019238,-0.027568,-0.032937,0.002668,-0.011774,-0.025998,0.005634
1,-0.001656,0.012552,-0.01673,0.026087,-0.029727,-0.018031,-0.002355,-0.020488,-0.010081,0.016111
2,0.00181,-0.017766,0.01428,0.014213,-0.008555,0.009012,0.017099,-0.007861,-0.011502,-0.000223
3,0.005197,-0.009226,0.019654,0.016757,-0.014604,0.00262,0.011715,-0.011213,-0.017233,-0.006948
4,0.005083,-0.005123,-0.01873,0.019763,-0.025132,-0.036749,0.013528,-0.001899,-0.018994,0.013772
5,0.011953,-0.017411,0.02214,0.012934,-0.011733,0.012206,0.011145,-0.004556,-0.014085,-0.000244


In [None]:
# Euclidean distance between two points:
# sum the squared differences of their coordinates and take the square root of that sum; the result is the length of the straight line (the shortest path) between them.


In [46]:
from sklearn.cluster import AffinityPropagation

# AffinityPropagation is not given a number of clusters; it determines that itself.

ap = AffinityPropagation()
# Cluster the documents using their averaged word2vec vectors as features.
ap.fit(w2v_feature_array)
cluster_labels = ap.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])

# Show each document and its hand-assigned category next to its cluster id.
pd.concat([corpus_df, cluster_labels], axis=1)

Unnamed: 0,Document,Category,ClusterLabel
0,The sky is blue and beautiful.,weather,0
1,Love this blue and beautiful sky!,weather,0
2,The quick brown fox jumps over the lazy dog.,animals,1
3,The brown fox is quick and the blue dog is lazy!,animals,1
4,The sky is very blue and the sky is very beaut...,weather,0
5,The dog is lazy but the brown fox is quick!,animals,1
