# Tidene Códigos - Vetorizadores

In [68]:
import gensim
import nltk
import numpy as np
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
nltk.download('punkt')
nltk.download('wordnet')

pd_corpus =  pd.read_csv('data/train_min.csv',encoding='utf8')
pd_corpus , corpus_test =  train_test_split(pd_corpus, train_size=0.7)
classes_train = pd_corpus['sentiment']
corpus = pd_corpus['review']
corpus_test = corpus_test['review']



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Classe que encapsula o tipo de tokenização e sequencia de limpeza a ser realizada nos textos

#### Utilitarios - lê corpus e tokenizadores

In [3]:

import csv

class readCorpus(object):
    def __init__(self,csvfile,list_of_fields_to_read=[],tokenizer=None,encoding='utf8'):
        self.csvfile = csvfile
        self.fields = list_of_fields_to_read
        self.tokenizer = tokenizer
        self.encoding = encoding
    
    def __iter__(self):
        f = open(self.csvfile,encoding=self.encoding, errors='ignore')
        reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_MINIMAL) #separador dos campos\n",
        headers = next(reader, None)
        if (len(self.fields) <= 0):
            self.fields = headers
        selected_field_indexes = []
        for idx,field in enumerate(headers):
            if field in self.fields:
                selected_field_indexes.append(idx)

        for line in reader:
            if line:
                yield [line[idx] for idx in selected_field_indexes] if (len(selected_field_indexes)>1) else (line[selected_field_indexes[0]] if not self.tokenizer else tokenizer.tokenize(line[selected_field_indexes[0]]))
                        

In [64]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import *    #https://www.nltk.org/api/nltk.tokenize.html

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc) if (len(t)>2)]

class StemTokenizer(object):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")
    def __call__(self, doc):
        
        return [self.stemmer.stem(t) for t in self.tokenizer.tokenize(doc) if (len(t)>2)]


In [14]:
from nltk.corpus import stopwords
from nltk import download
download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Define um vetorizador do tipo contagem de frequência (bag-of-words)

In [30]:
#http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words=stop_words) # usa a classe de tokenizacao definida acima


#### Passa os textos pelo vetorizador

In [31]:
#corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])

X_train_counts = count_vectorizer.fit_transform(corpus) # resulta em uma matriz sparsa numpy

In [32]:
print(X_train_counts[0])

  (0, 29484)	1
  (0, 17179)	1
  (0, 32787)	1
  (0, 14532)	1
  (0, 33716)	1
  (0, 7838)	1
  (0, 7630)	1
  (0, 7038)	1
  (0, 16959)	1
  (0, 25475)	2
  (0, 15513)	1
  (0, 28997)	1
  (0, 34656)	1
  (0, 15071)	1
  (0, 38371)	1
  (0, 13245)	1
  (0, 20060)	1
  (0, 5530)	1
  (0, 19036)	2
  (0, 12892)	1
  (0, 14944)	1
  (0, 30161)	1
  (0, 34601)	1
  (0, 17796)	1
  (0, 14028)	1
  :	:
  (0, 30158)	1
  (0, 17180)	1
  (0, 24456)	1
  (0, 32270)	1
  (0, 4432)	1
  (0, 32765)	1
  (0, 37290)	1
  (0, 10716)	1
  (0, 18841)	1
  (0, 9767)	1
  (0, 12765)	1
  (0, 1666)	1
  (0, 37446)	1
  (0, 3055)	1
  (0, 30454)	1
  (0, 25923)	1
  (0, 30601)	1
  (0, 20938)	3
  (0, 11278)	1
  (0, 30789)	1
  (0, 36682)	1
  (0, 26061)	1
  (0, 26692)	1
  (0, 1339)	1
  (0, 34344)	1


#### O vetorizador também pode ser utilizado para transformar um texto não visto

In [33]:
texto_nao_visto = count_vectorizer.transform(['New film review'])
print(texto_nao_visto)

  (0, 13204)	1
  (0, 23873)	1
  (0, 29017)	1


#### A partir daí temos

In [34]:
print("Formato Matriz sparsa gerada (numdocs,features) ==> ", X_train_counts.shape)
print("Representacao de um documento (o 6º) ==> ", X_train_counts[5])
print("Indice de uma palavra ('film') ==>", count_vectorizer.vocabulary_.get('film'))


Formato Matriz sparsa gerada (numdocs,features) ==>  (5250, 38662)
Representacao de um documento (o 6º) ==>    (0, 22097)	1
  (0, 7439)	1
  (0, 20334)	1
  (0, 17742)	1
  (0, 17295)	2
  (0, 3621)	1
  (0, 18988)	1
  (0, 26390)	1
  (0, 25647)	1
  (0, 27225)	1
  (0, 6299)	1
  (0, 13754)	1
  (0, 34586)	1
  (0, 7833)	1
  (0, 5558)	1
  (0, 18771)	3
  (0, 10086)	1
  (0, 11379)	1
  (0, 5704)	1
  (0, 23562)	1
  (0, 5435)	1
  (0, 22800)	1
  (0, 13624)	2
  (0, 3307)	1
  (0, 15308)	1
  :	:
  (0, 24093)	1
  (0, 7832)	2
  (0, 6766)	3
  (0, 20398)	1
  (0, 7842)	1
  (0, 3954)	1
  (0, 17205)	1
  (0, 21225)	1
  (0, 32808)	1
  (0, 1649)	1
  (0, 3222)	1
  (0, 7144)	4
  (0, 16147)	1
  (0, 21577)	3
  (0, 35546)	1
  (0, 20904)	2
  (0, 1942)	1
  (0, 28112)	1
  (0, 31237)	1
  (0, 35634)	1
  (0, 34608)	2
  (0, 29876)	1
  (0, 14630)	1
  (0, 24638)	1
  (0, 20478)	1
Indice de uma palavra ('film') ==> 13204


#### Explorando as contagens (palavras mais frequentes no texto)

In [35]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
sum_words = X_train_counts.sum(axis=0)    #bag_of_words
words_freq = [(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

In [36]:
words_freq

[('movie', 10845),
 ('wa', 10274),
 ('film', 9744),
 ('one', 5853),
 ('like', 4396),
 ('ha', 3562),
 ('time', 3389),
 ('good', 3193),
 ('character', 2944),
 ('story', 2805),
 ('even', 2658),
 ('make', 2645),
 ('would', 2599),
 ('get', 2547),
 ('see', 2536),
 ('really', 2421),
 ('scene', 2312),
 ('well', 2295),
 ('much', 2123),
 ('people', 2033),
 ('first', 1964),
 ('also', 1945),
 ('bad', 1940),
 ('great', 1909),
 ('show', 1884),
 ('way', 1852),
 ('made', 1821),
 ('thing', 1780),
 ('life', 1719),
 ('think', 1641),
 ('could', 1640),
 ('know', 1536),
 ('love', 1533),
 ('plot', 1518),
 ('two', 1483),
 ('watch', 1444),
 ('seen', 1432),
 ('acting', 1427),
 ('end', 1426),
 ('year', 1416),
 ('actor', 1414),
 ('look', 1412),
 ('many', 1408),
 ('best', 1402),
 ('say', 1391),
 ('never', 1377),
 ('little', 1359),
 ('doe', 1318),
 ('ever', 1295),
 ('come', 1232),
 ('better', 1225),
 ('still', 1217),
 ('take', 1197),
 ('man', 1191),
 ('work', 1183),
 ('find', 1130),
 ('back', 1095),
 ('part', 1086)

### Define um vetorizador do tipo contagem de frequência de bi-gramas (na tentativa de juntar palavras que aparecem sempre juntas)


In [37]:
#http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import CountVectorizer

bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), tokenizer=LemmaTokenizer(), stop_words='english', min_df=1)

In [38]:
#corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
X_train_bigram_counts = bigram_vectorizer.fit_transform(corpus) # resulta em uma matriz sparsa numpy
#print('Dicionario: %s' %bigram_vectorizer.get_feature_names())

#### Explorando as contagens (palavras mais frequentes no texto)

In [39]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
sum_bigrams = X_train_bigram_counts.sum(axis=0)    #bag_of_words
bigram_freq = [(bigram, sum_bigrams[0, idx]) for bigram, idx in bigram_vectorizer.vocabulary_.items()]
bigram_freq =sorted(bigram_freq, key = lambda x: x[1], reverse=True)

In [40]:
bigram_freq

[('movie wa', 614),
 ('film wa', 459),
 ("i've seen", 377),
 ('look like', 329),
 ('year old', 260),
 ('special effect', 233),
 ("don't know", 222),
 ('movie like', 206),
 ('wa just', 200),
 ('film ha', 200),
 ('movie ha', 188),
 ('low budget', 181),
 ('main character', 180),
 ('wa good', 179),
 ('waste time', 176),
 ('good movie', 173),
 ("i'm sure", 173),
 ('horror movie', 171),
 ("it's just", 169),
 ('thought wa', 168),
 ('horror film', 163),
 ('make movie', 159),
 ('movie just', 152),
 ('watch movie', 150),
 ("don't think", 146),
 ('bad guy', 146),
 ('like movie', 145),
 ('film like', 142),
 ('year ago', 140),
 ('new york', 136),
 ('real life', 135),
 ('acting wa', 135),
 ('bad movie', 135),
 ('wa really', 133),
 ('worst movie', 129),
 ('watching movie', 128),
 ('make sense', 126),
 ('good film', 125),
 ('high school', 125),
 ("film it's", 125),
 ('feel like', 122),
 ("movie it's", 121),
 ('saw movie', 119),
 ('pretty good', 116),
 ('time wa', 116),
 ('make film', 115),
 ('movie mo

### Define um vetorizador do tipo TFIDF

In [41]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', min_df=1)

#### Cria um vetorizador já ajustado ao texto

In [45]:
import pandas as pd
import pickle

#corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
classes = pd_corpus['sentiment'].values.tolist()

## NOS OUTROS EXEMPLOS ESSAS DUAS ACOES (FIT E TRANSFORM) SAO FEITAS EM UMA SO, ATRAVES DO FIT_TRANSFORM ..
##   FIZ A SEPARACAO AQUI PARA PODER GRAVAR NO DISCO O VETORIZADOR E PODER USA-LO PARA VETORIZAR TEXTOS NOVOS DEPOIS
tfidf_vectorizer = tfidf_vectorizer.fit(corpus,classes) # treina o vetorizador
X_train_tfidf = tfidf_vectorizer.transform(corpus) # transforma os textos em uma matriz sparsa numpy

# gravando no disco vetorizador e a matriz vetorizada
pickle.dump(tfidf_vectorizer, open("tfidf_vectorizer.pickle", "wb"))
pickle.dump(X_train_tfidf, open("X_train_tfidf.pickle", "wb"))

In [46]:
print("Formato Matriz sparsa gerada (numdocs,features) ==> ", X_train_tfidf.shape)
print("Indice de uma palavra ==>", tfidf_vectorizer.vocabulary_.get('film'))


Formato Matriz sparsa gerada (numdocs,features) ==>  (5250, 38515)
Indice de uma palavra ==> 13152


In [47]:
print("Representacao de um documento ==> ", X_train_tfidf[5])

Representacao de um documento ==>    (0, 37019)	0.1949139674373265
  (0, 36435)	0.076968898640846
  (0, 35498)	0.060520568074789574
  (0, 35410)	0.061846737724247085
  (0, 34482)	0.0881957374000952
  (0, 34462)	0.10090544103623779
  (0, 34188)	0.060964316958851746
  (0, 32690)	0.10474356340279536
  (0, 29774)	0.06375327544739248
  (0, 28646)	0.1085409815238562
  (0, 28579)	0.1429898381151196
  (0, 28010)	0.03949709403107791
  (0, 27125)	0.12448663333228342
  (0, 25690)	0.1695775205591872
  (0, 25550)	0.10390755607675858
  (0, 24495)	0.08457762886434839
  (0, 24006)	0.07771391378917519
  (0, 22724)	0.1429898381151196
  (0, 22528)	0.0642857860139621
  (0, 22025)	0.06285774200961394
  (0, 20834)	0.09403524264584336
  (0, 20408)	0.03135081567054927
  (0, 20328)	0.11406676345073721
  (0, 20264)	0.08543354856550149
  (0, 18921)	0.10432069879383643
  :	:
  (0, 17138)	0.0519649259724265
  (0, 16086)	0.082600811939102
  (0, 15242)	0.10390755607675858
  (0, 13698)	0.09361886135210568
  (0, 13568

In [51]:
print('Dicionario: %s' %tfidf_vectorizer.get_feature_names())



#### Explorando os dados (ordenando as palavras com maior tfidf nos documentos)

In [52]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
sum_idfs = X_train_tfidf.sum(axis=0)    #tfidfs
words_idfs = [(word, sum_idfs[0, idx]) for word, idx in tfidf_vectorizer.vocabulary_.items()]
words_idfs =sorted(words_idfs, key = lambda x: x[1], reverse=True)

In [53]:
words_idfs

[('movie', 294.1303382547381),
 ('wa', 250.43559356691551),
 ('film', 242.64181870683996),
 ('like', 124.21612599663841),
 ('just', 116.5410511790058),
 ("it's", 116.43253564234844),
 ('good', 109.33239072859257),
 ('time', 105.1190785284768),
 ('ha', 100.40894449039195),
 ('story', 99.12119578645323),
 ('character', 97.80445823391307),
 ('really', 91.57097032193151),
 ('make', 88.85342144428904),
 ('bad', 85.25440959482644),
 ('scene', 81.53353409919012),
 ('people', 80.27693513231416),
 ('great', 80.18280951251361),
 ("don't", 75.97585993552111),
 ('think', 69.4820483627028),
 ('watch', 68.81553855491336),
 ('thing', 68.23020474933736),
 ('plot', 67.83359733319487),
 ('way', 67.50895072240871),
 ('acting', 66.56382087376645),
 ('life', 65.53853879510385),
 ('seen', 65.21951594217937),
 ('actor', 64.60118874608791),
 ('love', 64.48105510169788),
 ('know', 62.68270049529318),
 ('best', 59.92991745818262),
 ('did', 59.16828430406264),
 ('say', 58.59161727465288),
 ('year', 58.1284504782

#### Montando vetorizador tfidf para bigramas

In [55]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_bigram_vectorizer = TfidfVectorizer(ngram_range=(2, 2), tokenizer=LemmaTokenizer(), stop_words='english') 


In [56]:
#corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
X_train_bigram_tfidf = tfidf_bigram_vectorizer.fit_transform(corpus) # resulta em uma matriz sparsa numpy

In [57]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
sum_bigrams_tfidf = X_train_bigram_tfidf.sum(axis=0)    #bag_of_words
bigram_tfidf_freq = [(bigram, sum_bigrams_tfidf[0, idx]) for bigram, idx in tfidf_bigram_vectorizer.vocabulary_.items()]
bigram_tfidf_freq =sorted(bigram_tfidf_freq, key = lambda x: x[1], reverse=True)

In [58]:
bigram_tfidf_freq

[('movie wa', 26.78675035157244),
 ('film wa', 18.707187097915426),
 ("i've seen", 17.508337750469924),
 ('look like', 13.24917218301878),
 ('year old', 12.026691143304337),
 ("don't know", 11.293124706093264),
 ('special effect', 11.025675566406107),
 ('waste time', 11.012536019265674),
 ('movie like', 10.924593989755063),
 ('wa just', 10.43606114712343),
 ('good movie', 10.195599436998735),
 ('wa good', 10.061686585109188),
 ('movie ha', 9.86039699931686),
 ('horror movie', 9.573153229461258),
 ('watch movie', 9.484931903446977),
 ('film ha', 9.41821225323887),
 ('main character', 9.31262009811545),
 ('worst movie', 9.192374577012625),
 ('thought wa', 9.157316105046869),
 ('low budget', 9.104425677262544),
 ('make movie', 9.077205306099824),
 ('movie just', 8.810577176674173),
 ('acting wa', 8.669111660888655),
 ("it's just", 8.589526240486364),
 ('bad movie', 8.365135188520485),
 ('horror film', 8.338367018074058),
 ('saw movie', 8.15976255615218),
 ("don't think", 8.10653009277941)

## Vetorização com o word2vec do Gensim

In [60]:
# utilizando um modelo word2vec previamente construido
import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-100")

In [61]:
#https://radimrehurek.com/gensim/models/word2vec.html
# representacao de uma palavra no formato word2vec = representado na dimensao 100 do modelo utilizado
print(word_vectors.get_vector('film'))

[ 0.19916  -0.049702  0.24579  -0.32281   0.89768  -0.1278   -0.49506
  0.20814  -0.20046  -0.20604   0.038292 -0.67277  -0.12689  -0.18766
 -0.10277   0.73128   0.82408   0.087288  0.69255   1.3107    0.49113
 -0.38097   0.24338  -0.27813   0.62506   0.35978   0.42041  -0.24529
  0.14861  -0.26726  -0.56262   0.63843  -0.54153   0.36537   0.20545
 -0.16604   0.72434   0.29961  -0.42501  -0.35932  -0.089288  0.48752
 -1.0927    0.88818   0.89941  -0.7541   -0.35492  -0.76396   0.27468
  0.2757   -0.48152  -0.41399   0.64489   1.148    -0.29131  -2.9387
 -0.83162   0.95586   1.1623   -0.42502   0.15486   2.2326   -0.31339
 -0.030228  0.79802  -0.41302   0.72885   0.7296   -0.31909   0.8956
  0.34625   0.2923    0.40056   0.78985  -0.43999   0.24698  -0.46548
  0.055886 -0.62603  -0.036487 -0.65429   0.10563   0.17435   0.35466
 -1.9403   -0.022502 -0.7302   -0.63042  -0.032799 -0.43953  -0.07239
 -0.44875  -0.074689 -0.14426   0.19252   0.27108   0.20325  -0.068109
  0.017651  0.06455 ]

In [65]:
## NAO UTILIZAR COM TOKENS STEMMIZADOS = PQ ELE NAO ACHARA MUITAS PALAVRAS
import numpy as np
class Word2VecVectorizer:
  def __init__(self,word_vectors):
    self.word_vectors = word_vectors

  def fit(self, lst_tokens):
    pass

  # para cada sentenca tokenizada ele representa cada palavra segundo a 
    #representacao w2v e depois tira a media de todas as palavras da sentenca
  def transform(self, lst_tokens): #pega uma lista de tokens
    self.D = word_vectors.get_vector(word_vectors.index2word[0]).shape[0]
    X = np.zeros((len(lst_tokens), self.D))
    n = 0
    emptycount = 0
    for tokens in lst_tokens:
      vecs = []
      m = 0
      for word in tokens:
        try:
            vec = self.word_vectors.get_vector(word)
            vecs.append(vec)
            m += 1
        except KeyError:
            #print('Palavra ',word,' nao pode ser representada')
            pass
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    return X


  def fit_transform(self, lst_tokens):
    self.fit(lst_tokens)
    return self.transform(lst_tokens)

### Utilizando para vetorizar os textos

In [66]:
# usa o dictionario e o bowcorpus para recuperar os tokens
# lembrando que o bowcorpus nao devera ser construido com stems ..ele pode nao conseguir representar muitas palavras
dictionary = gensim.corpora.Dictionary.load("dictionary.dict") #carrega o dicionario do disco
idx2wordDictionary = {k:v for k, v in dictionary.iteritems()} #mudando a ordem dos indices
lem = LemmaTokenizer()
bowcorpus = gensim.corpora.MmCorpus('bowcorpus.mm') # le o corpus representado em bag-of-words
tokens = [[idx2wordDictionary[idx] for idx,freq in text] for text in bowcorpus]
print(tokens[0])

NameError: name 'lem' is not defined

In [None]:
w2v_vectorizer_glove_wiki_gigaword_100 = Word2VecVectorizer(word_vectors)

X_train_w2v_glove_wiki_gigaword_100 = w2v_vectorizer_glove_wiki_gigaword_100.fit_transform(tokens)

#np.save('w2vmatrix.npy', textos_vetorizados) # salvando no disco

# gravando no disco vetorizador e a matriz vetorizada
pickle.dump(X_train_w2v_glove_wiki_gigaword_100, open("X_train_w2v_glove_wiki_gigaword_100.pickle", "wb"))



In [None]:
print(X_train_w2v_glove_wiki_gigaword_100[0]) #textos_vetorizados[0])

In [None]:
X_train_w2v_glove_wiki_gigaword_100.shape

#### Transformando textos de teste

In [None]:
corpus_test = readCorpus("testtoy.csv",list_of_fields_to_read=['data'])

lem = LemmaTokenizer()

test_lem_tokens = [lem(texto) for texto in corpus_test]
print(test_lem_tokens[0])


In [None]:
X_test_w2v_glove_wiki_gigaword_100 = w2v_vectorizer_glove_wiki_gigaword_100.fit_transform(test_lem_tokens)

# gravando no disco vetorizador e a matriz vetorizada
pickle.dump(X_test_w2v_glove_wiki_gigaword_100, open("X_test_w2v_glove_wiki_gigaword_100.pickle", "wb"))

