In [66]:
import pandas as pd
import nltk
from nltk import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/sofia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
# Load the interim dataset (path is relative to the kernel's working directory).
originData = pd.read_csv("../data/interim/dataset_v2.csv")

In [68]:
# Preview the first rows of the freshly loaded frame.
originData.head()

Unnamed: 0.1,Unnamed: 0,manually_assigned_tag,url,content
0,0,supermarket,https://www.walmart.com.ar,nueva nueva correctamente puede cerrar lacteos...
1,0,supermarket,https://www.makro.com.ar/ofertas,logo entra cuenta quiero fornecedor sustentabi...
2,0,supermarket,https://www.alvearsupermercados.com.ar/ofertas/,alvearsupermercados logo blanco alvearsupermer...
3,0,supermarket,https://www.cotodigital3.com.ar/sitios/cdigi/b...,experiencia descuentos descuentos comparacione...
4,0,telephone,https://www.personal.com.ar/,micuenta destinos cuenta micuenta cuenta cuota...


In [69]:
# Remove the 'Unnamed: 0' column (presumably the index written by an earlier
# to_csv call -- confirm against the script that produced dataset_v2.csv).
# errors='ignore' keeps the cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
originData = originData.drop(columns=['Unnamed: 0'], errors='ignore')

In [70]:
# Check which columns remain after the drop.
originData.columns

Index(['manually_assigned_tag', 'url', 'content'], dtype='object')

In [71]:
# Preview the frame again now that the leftover column has been removed.
originData.head()

Unnamed: 0,manually_assigned_tag,url,content
0,supermarket,https://www.walmart.com.ar,nueva nueva correctamente puede cerrar lacteos...
1,supermarket,https://www.makro.com.ar/ofertas,logo entra cuenta quiero fornecedor sustentabi...
2,supermarket,https://www.alvearsupermercados.com.ar/ofertas/,alvearsupermercados logo blanco alvearsupermer...
3,supermarket,https://www.cotodigital3.com.ar/sitios/cdigi/b...,experiencia descuentos descuentos comparacione...
4,telephone,https://www.personal.com.ar/,micuenta destinos cuenta micuenta cuenta cuota...


In [72]:
%%writefile Stemmer.py
import nltk
from collections import defaultdict
from nltk import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
class Stemmer(): # naive class
    '''
    Minimal Spanish tokenizer/stemmer built on NLTK's SnowballStemmer.

    Attributes:
        tokens: tokens of the most recently tokenized document.
        stems: stems produced by the most recent call to stemm().
        stemmed_freqs: defaultdict(list) created empty; no method of this
            class fills it.
    '''
    def __init__(self):
        self.stems = list()
        self.stemmed_freqs = defaultdict(list)
        self.tokens = list()

    def tokenize(self, document):
        '''
        Store the tokens of `document` in self.tokens.

        document: a str, which is split on single spaces, or a list, which is
                  taken as already tokenized and stored as is.

        No filtering of any kind is applied to the tokens.

        raises: TypeError if document is neither a str nor a list.
        '''
        if isinstance(document, list):
            self.tokens = document
        elif isinstance(document, str):
            self.tokens = document.split(' ')
        else:
            raise TypeError('str or list expected, {} found.'.format(type(document)))

    def stemm(self, document):
        '''
        Tokenize `document` and store the Spanish stem of every token in self.stems.

        document: str or list (see tokenize). Passing None re-stems the tokens
                  already held in self.tokens, if there are any.

        raises: TypeError (from tokenize) if document is of an unsupported type.
        '''
        # Always tokenize the document that was handed in. Tokenizing only when
        # self.tokens was empty meant the first call never produced stems and
        # every later call re-stemmed the tokens of the first document.
        if document is not None or len(self.tokens) == 0:
            self.tokenize(document)
        spanishstemmer = SnowballStemmer('spanish')
        self.stems = [spanishstemmer.stem(token) for token in self.tokens]

Overwriting Stemmer.py


In [73]:
# Keep only the rows that actually have a value in the 'content' field
originData = originData.dropna(subset=["content"])

In [74]:
def remove_short_words(stringObj, wordLong = 4):
    '''
    Keep only the words that are longer than wordLong characters.

    stringObj: text whose words are separated by single spaces, or a list of
               words (for instance the result of a previous call), which is
               filtered as is.
    wordLong:  words with length <= wordLong are removed.

    return: list
    '''
    # Accepting a list makes it safe to apply the function twice to the same
    # column; calling .split on a list would raise AttributeError.
    listObj = stringObj if isinstance(stringObj, list) else stringObj.split(' ')
    return [word for word in listObj if len(word) > wordLong]

In [75]:
def remove_less_frequent_words(myList, frequency=10):
    '''
    Keep only the words that occur at least `frequency` times in myList.

    Equal words end up grouped together: every kept word is emitted as many
    times as it occurred, and the groups follow the order in which each word
    first appeared in myList.

    return: list
    '''
    from collections import Counter
    occurrences = Counter(myList)
    return [
        word
        for word, count in occurrences.items()
        if count >= frequency
        for _ in range(count)
    ]

In [76]:
# Replace every document's text by the list of its words that pass the default length filter
originData['content'] = originData['content'].map(remove_short_words)

In [77]:
# Independent copy of the data without the manually assigned labels
myDF = originData.copy(deep=True).drop(columns=['manually_assigned_tag'])

In [78]:
# key: url, value: content (list); a url present in several rows keeps only its last content
myDict = {url: content for url, content in zip(myDF['url'], myDF['content'])}

# Sparse document-term matrix

In [79]:
vocab = set()       # every distinct term seen in any document
n_nonzero = 0       # number of (document, term) pairs = non-zero cells of the matrix
for docterms in myDict.values():
    unique_terms = set(docterms)        # distinct terms of this document
    n_nonzero += len(unique_terms)      # one non-zero cell per distinct term
    vocab.update(unique_terms)          # grow the global vocabulary

# document names, in the same order in which the dict yields its items
docnames = list(myDict)

In [80]:
import numpy as np
docnames = np.array(docnames)
# Sort the vocabulary before freezing it into an array. Iterating a set of str
# follows hash order, and string hashing is randomised per interpreter run, so
# list(vocab) would give the matrix columns a different order after every
# kernel restart; that can change the fitted topics even with a fixed
# random_state. A sorted vocabulary makes the column order reproducible.
vocab = np.array(sorted(vocab))

In [81]:
# Passed as the `sorter` argument of np.searchsorted when the matrix is filled.
vocab_sorter = np.argsort(vocab)    # indices that sort "vocab"

In [82]:
# Matrix shape: one row per document name, one column per vocabulary term
ndocs, nvocab = len(docnames), len(vocab)

In [83]:
# Three separate, uninitialised buffers with one slot per non-zero matrix entry:
#   data[k] -> term frequency of the kth entry
#   rows[k] -> row index of the kth entry
#   cols[k] -> column index of the kth entry
data, rows, cols = (np.empty(n_nonzero, dtype=np.intc) for _ in range(3))

In [84]:
ind = 0     # current index in the sparse matrix data
# go through all documents with their terms; `docnames` was built from
# myDict.keys(), so the enumeration position is the document's row index and
# no per-document lookup in `docnames` is needed
for doc_idx, (docname, terms) in enumerate(myDict.items()):
    if len(terms) == 0:
        # nothing to write for a document without terms; also keeps
        # np.searchsorted from being called with an empty query
        continue

    # find indices into `vocab` such that, if the corresponding elements in `terms` were
    # inserted before the indices, the order of `vocab` would be preserved
    # -> array of indices of `terms` in `vocab`
    term_indices = vocab_sorter[np.searchsorted(vocab, terms, sorter=vocab_sorter)]

    # count the unique terms of the document and get their vocabulary indices
    uniq_indices, counts = np.unique(term_indices, return_counts=True)
    n_vals = len(uniq_indices)  # = number of unique terms
    ind_end = ind + n_vals  # `ind` to `ind_end` is the slice that we will fill with data

    data[ind:ind_end] = counts                  # save the counts (term frequencies)
    cols[ind:ind_end] = uniq_indices            # save the column index: index in `vocab`
    rows[ind:ind_end] = doc_idx                 # same row index for every term of this document

    ind = ind_end  # resume with next document -> add data to the end

In [85]:
from scipy.sparse import coo_matrix
# Assemble the sparse document-term matrix in COO format:
# entry (rows[k], cols[k]) holds data[k]; the shape is documents x vocabulary.
dtm = coo_matrix((data, (rows, cols)), shape=(ndocs, nvocab), dtype=np.intc)

In [94]:
# Number of distinct manually assigned tags (missing values are not counted).
# Series.nunique() gives this directly; grouping by the column, taking
# nunique per group and counting the groups yields the same number.
tags = originData['manually_assigned_tag'].nunique()

In [95]:
# Show how many distinct tags were found.
tags

20

In [97]:
import lda
# One topic per distinct manual tag; fixed random_state for the sampler
model = lda.LDA(n_topics=tags, n_iter=1000, random_state=1)
model.fit(dtm)
topic_word = model.topic_word_
n_top_words = 10
for i, topic_dist in enumerate(topic_word):
    # positions of the n_top_words largest weights of this topic, largest first
    top_indices = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_words = np.array(vocab)[top_indices]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 63
INFO:lda:vocab_size: 2120
INFO:lda:n_words: 18673
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1000
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -209409
INFO:lda:<10> log likelihood: -112653
INFO:lda:<20> log likelihood: -108564
INFO:lda:<30> log likelihood: -106970
INFO:lda:<40> log likelihood: -105691
INFO:lda:<50> log likelihood: -105243
INFO:lda:<60> log likelihood: -104789
INFO:lda:<70> log likelihood: -104609
INFO:lda:<80> log likelihood: -104280
INFO:lda:<90> log likelihood: -103627
INFO:lda:<100> log likelihood: -103060
INFO:lda:<110> log likelihood: -103040
INFO:lda:<120> log likelihood: -102659
INFO:lda:<130> log likelihood: -102151
INFO:lda:<140> log likelihood: -101763
INFO:lda:<150> log likelihood: -101616
INFO:lda:<160> log likelihood: -101051
INFO:lda:<170> log likelihood: -101075
INFO:lda:<180> log likelihood: -101180
INFO:lda:<190> log likelihood: -101016
INFO:lda:<200> log likelihood: -100999
INFO:lda:<210> lo

Topic 0: suplementos ciento libros presidente logos fuente universidad elena nuevo nicolas
Topic 1: buenos aeropuerto deseos barcelona ciudad internacional destinos cuadras aeropuertos nueva
Topic 2: banco beneficios exterior financiacion canales internacionalizacion personas cuenta productos acerca
Topic 3: servicios sucesos internacionales escenarios nacionales actualidad tabla deportes quedo muerte
Topic 4: mercadolibre report close contact respond cancel espera videos reported design
Topic 5: cordoba florida places puerto barcelona mexico london carnes carlos social
Topic 6: elgatoylacaja video edcep edcio cambio nueva mexico libros unidos escuela
Topic 7: deshacer gracias cuenta cerrar respuestas informacion puede enlace responder cualquier
Topic 8: internacional nuevo basicas esencial tambien despues precios hacer nicolas celebridades
Topic 9: macro conocenos canales cparentrq financiaciones propuestas exportacion sueldos equipment servicios
Topic 10: servicios productos cuotas p

In [None]:
# `data` is bound to the NumPy buffer of term frequencies by this point and an
# ndarray has no to_csv; the processed DataFrame is `originData`.
# NOTE(review): assumes the labelled frame (originData) is the one to export
# rather than the unlabelled copy myDF -- confirm.
# NOTE: the index is written as an unnamed first column, which comes back as
# 'Unnamed: 0' when the file is read with pd.read_csv.
originData.to_csv('../data/processed/processed_dataset.csv')

In [None]:
# `data` is the NumPy buffer of term frequencies by this point and cannot be
# indexed with 'content'; the word lists live in originData['content'].
# Keep only the words that occur at least twice within their document.
originData['content'] = originData['content'].map(lambda x: remove_less_frequent_words(x, 2))

In [None]:
# `data` is the NumPy buffer of term frequencies by this point and an ndarray
# has no to_csv; export the DataFrame `originData` instead.
# NOTE(review): assumes originData is the frame meant to be exported -- confirm.
originData.to_csv('../data/processed/processed_dataset_less_frequent_words_removed.csv')