# Tidene Códigos - Vetorizadores

In [1]:
import gensim
import nltk
import numpy as np
import sklearn
# Fetch the NLTK 'punkt' and 'wordnet' resources; per the cell output they are
# left untouched when already up to date.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/andreiabonfante/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/andreiabonfante/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Classes que encapsulam o tipo de tokenização e a sequência de limpeza a ser realizada nos textos

#### Utilitários - leitor de corpus e tokenizadores

In [2]:

import csv

class readCorpus(object):
    """Re-iterable reader over selected columns of a CSV file with a header row.

    Every iteration re-opens the file, so one instance can be handed to several
    fit/transform calls in a row.

    Parameters
    ----------
    csvfile : str
        Path of a comma-separated file whose first row holds the column names.
    list_of_fields_to_read : sequence of str, optional
        Column names to keep. ``None`` or an empty sequence keeps every column.
    tokenizer : object, optional
        Applied only when exactly one column is selected. May be an object
        exposing ``tokenize(text)`` or a plain callable taking the text.
    encoding : str
        Text encoding of the file; undecodable bytes are dropped
        (``errors='ignore'``).

    Yields
    ------
    list of str
        When more than one column is selected: the raw values, in header order.
    str or tokenizer output
        When a single column is selected: the raw value, or the tokenizer
        result if a tokenizer was given.

    Blank rows are skipped. An empty file yields nothing.

    Raises
    ------
    IndexError
        If none of the requested fields exists in the header, or if a row is
        shorter than a selected column index.
    """

    def __init__(self, csvfile, list_of_fields_to_read=None, tokenizer=None, encoding='utf8'):
        self.csvfile = csvfile
        # None replaces the former mutable [] default; an explicit [] still works.
        self.fields = list_of_fields_to_read if list_of_fields_to_read is not None else []
        self.tokenizer = tokenizer
        self.encoding = encoding

    def __iter__(self):
        # The context manager closes the file even when the consumer stops early;
        # newline='' is what the csv module requires for quoted embedded newlines.
        with open(self.csvfile, encoding=self.encoding, errors='ignore', newline='') as f:
            reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)  # field separator
            headers = next(reader, None)
            if headers is None:
                return  # empty file: nothing to yield

            # Resolve the wanted columns locally instead of overwriting self.fields.
            wanted_fields = self.fields if len(self.fields) > 0 else headers
            selected_field_indexes = [
                idx for idx, field in enumerate(headers) if field in wanted_fields
            ]
            if not selected_field_indexes:
                raise IndexError(
                    "none of the requested fields %r were found in the header %r of %r"
                    % (wanted_fields, headers, self.csvfile)
                )

            several_columns = len(selected_field_indexes) > 1
            tokenize = None
            if self.tokenizer and not several_columns:
                # Accept NLTK-style tokenizers (.tokenize) as well as plain callables
                # such as the LemmaTokenizer/StemTokenizer classes of this notebook.
                tokenize = getattr(self.tokenizer, 'tokenize', self.tokenizer)

            for line in reader:
                if not line:
                    continue  # csv.reader returns [] for blank lines
                if several_columns:
                    yield [line[idx] for idx in selected_field_indexes]
                else:
                    value = line[selected_field_indexes[0]]
                    yield tokenize(value) if tokenize is not None else value
                        

In [3]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import *    #https://www.nltk.org/api/nltk.tokenize.html

class LemmaTokenizer(object):
    """Callable tokenizer: regex split, short-token filter, WordNet lemmatization.

    Calling an instance on a string returns a list of lemmas. Tokens are runs of
    ASCII letters and apostrophes; tokens of two characters or fewer are dropped
    before lemmatization, so a lemma itself may end up shorter than three
    characters.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")

    def __call__(self, doc):
        lemmas = []
        for token in self.tokenizer.tokenize(doc):
            if len(token) <= 2:
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

class StemTokenizer(object):
    """Callable tokenizer: regex split, short-token filter, Porter stemming.

    Calling an instance on a string returns a list of stems. Tokens are runs of
    ASCII letters and apostrophes; tokens of two characters or fewer are dropped
    before stemming.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")

    def __call__(self, doc):
        stems = []
        for token in self.tokenizer.tokenize(doc):
            if len(token) <= 2:
                continue
            stems.append(self.stemmer.stem(token))
        return stems


In [4]:
from nltk.corpus import stopwords
from nltk import download
# Make sure the NLTK 'stopwords' corpus is available, then keep its English list;
# it is passed to count_vectorizer as stop_words further down.
download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andreiabonfante/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Define um vetorizador do tipo contagem de frequência (bag-of-words)

In [5]:
#http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): this vectorizer filters with the NLTK stop list, while the bigram and
# TF-IDF vectorizers below use scikit-learn's built-in 'english' list — confirm the
# difference is intended. Lemmas such as 'ha' still show up in the frequency output.
count_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words=stop_words) # uses the tokenizer class defined above


#### Passa os textos pelo vetorizador

In [6]:
# Stream the 'data' column of toy.csv; with one field and no tokenizer, readCorpus yields raw strings.
corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])

X_train_counts = count_vectorizer.fit_transform(corpus) # learns the vocabulary and returns a sparse document-term count matrix

In [17]:
# Show the stored (row, column) -> count entries of the first document.
print(X_train_counts[0])

  (0, 29)	1
  (0, 436)	1
  (0, 859)	1
  (0, 99)	1
  (0, 486)	1
  (0, 470)	1
  (0, 525)	1
  (0, 617)	1
  (0, 37)	1
  (0, 30)	1
  (0, 891)	1
  (0, 235)	1
  (0, 137)	1
  (0, 160)	1
  (0, 584)	1
  (0, 403)	2
  (0, 400)	1
  (0, 858)	2
  (0, 18)	2
  (0, 357)	2
  (0, 782)	1
  (0, 421)	1
  (0, 6)	2
  (0, 480)	1
  (0, 369)	1
  :	:
  (0, 97)	1
  (0, 857)	1
  (0, 365)	1
  (0, 608)	1
  (0, 549)	2
  (0, 734)	2
  (0, 721)	2
  (0, 591)	2
  (0, 107)	4
  (0, 84)	1
  (0, 429)	1
  (0, 602)	3
  (0, 394)	1
  (0, 257)	2
  (0, 1016)	3
  (0, 1052)	3
  (0, 963)	4
  (0, 469)	2
  (0, 600)	10
  (0, 566)	3
  (0, 610)	1
  (0, 803)	2
  (0, 535)	6
  (0, 98)	2
  (0, 860)	9


#### O vetorizador também pode ser utilizado para transformar um texto não visto

In [7]:
# Vectorize a text that was not part of the fitted corpus with the already fitted
# vectorizer, then densify the result so it can be printed as a plain array.
texto_nao_visto = count_vectorizer.transform(['New patente document']).toarray()
print(texto_nao_visto)

[[0 0 0 ... 0 0 0]]


#### A partir daí temos

In [8]:
# Shape of the document-term matrix: (number of documents, vocabulary size).
print("Formato Matriz sparsa gerada (numdocs,features) ==> ", X_train_counts.shape)
# Dense count vector of the 6th document (index 5).
print("Representacao de um documento (o 6º) ==> ", X_train_counts.toarray()[5])
# Column index assigned to one vocabulary word (None if the word is unknown).
print("Indice de uma palavra ('applicable') ==>", count_vectorizer.vocabulary_.get('applicable'))
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Ordering the vocabulary by column index yields the same list on every version.
count_feature_names = sorted(count_vectorizer.vocabulary_, key=count_vectorizer.vocabulary_.get)
print('Dicionario: %s' %count_feature_names)


Formato Matriz sparsa gerada (numdocs,features) ==>  (17, 1015)
Representacao de um documento (o 6º) ==>  [0 0 0 ... 0 0 0]
Indice de uma palavra ('applicable') ==> 52
Dicionario: ['ability', 'able', 'accelerating', 'accompanying', 'accomplished', 'accordance', 'according', 'achievable', 'achieve', 'achieved', 'achieving', 'across', 'action', 'active', 'actuated', 'addition', 'address', 'adequately', 'adhere', 'adhering', 'adhesion', 'adopted', 'advance', 'advanced', 'advantage', 'affect', 'aforementioned', 'agricultural', 'aim', 'aimed', 'air', 'albeit', 'algorithm', 'align', 'allows', 'almost', 'along', 'also', 'alternate', 'although', 'always', 'among', 'amongst', 'amount', 'amp', 'amplified', 'amplifier', 'amplitude', 'analog', 'andw', 'another', 'apparatus', 'applicable', 'application', 'applied', 'appreciated', 'area', 'arecircumferentially', 'arithmetic', 'around', 'arrangement', 'art', 'aspect', 'assembly', 'associated', 'atmosphere', 'attempt', 'attenuated', 'attractive', 'aud

#### Explorando as contagens (palavras mais frequentes no texto)

In [9]:
# Reference:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# Column sums give the total count of every vocabulary term over the whole corpus.
sum_words = X_train_counts.sum(axis=0)
# Pair each term with its total and order the pairs from most to least frequent.
words_freq = sorted(
    ((term, sum_words[0, column]) for term, column in count_vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [10]:
# Display the (word, total count) pairs, highest count first.
words_freq

[('invention', 61),
 ('particle', 54),
 ('signal', 52),
 ('data', 44),
 ('mean', 28),
 ('water', 28),
 ('compression', 27),
 ('separation', 25),
 ('relates', 21),
 ('method', 21),
 ('present', 21),
 ('digital', 21),
 ('material', 19),
 ('sand', 19),
 ('length', 18),
 ('ha', 17),
 ('object', 17),
 ('communication', 16),
 ('amplifier', 16),
 ('apparatus', 15),
 ('size', 15),
 ('system', 14),
 ('output', 14),
 ('run', 14),
 ('used', 13),
 ('first', 13),
 ('fluidized', 13),
 ('audio', 13),
 ('variable', 13),
 ('heavier', 12),
 ('provide', 12),
 ('particularly', 12),
 ('video', 12),
 ('time', 11),
 ('bed', 11),
 ('density', 11),
 ('analog', 11),
 ('transmission', 11),
 ('encoded', 11),
 ('according', 10),
 ('mixture', 10),
 ('medium', 10),
 ('background', 10),
 ('field', 10),
 ('sieve', 10),
 ('switching', 10),
 ('also', 9),
 ('separate', 9),
 ('using', 9),
 ('said', 9),
 ('device', 9),
 ('fluid', 9),
 ('chamber', 9),
 ('power', 9),
 ('mode', 9),
 ('decoder', 9),
 ('mpeg', 9),
 ('type', 8),

### Define um vetorizador do tipo contagem de frequência de bi-gramas (na tentativa de juntar palavras que aparecem sempre juntas)


In [11]:
#http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2, 2) keeps only bigrams as features (no unigrams).
# NOTE(review): stop words come from scikit-learn's built-in 'english' list here,
# whereas count_vectorizer was built with the NLTK list — confirm this is intended.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), tokenizer=LemmaTokenizer(), stop_words='english', min_df=1)

In [12]:
corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
X_train_bigram_counts = bigram_vectorizer.fit_transform(corpus) # returns a sparse document-bigram count matrix
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Ordering the vocabulary by column index yields the same list on every version.
bigram_feature_names = sorted(bigram_vectorizer.vocabulary_, key=bigram_vectorizer.vocabulary_.get)
print('Dicionario: %s' %bigram_feature_names)

Dicionario: ['ability provide', 'able used', 'accelerating particularly', 'accompanying drawing', 'accomplished conveyor', 'accordance invention', 'accordance moving', 'accordance particle', 'accordance present', 'accordance various', 'according density', 'according embodiment', 'according invention', 'according present', 'according previously', 'according settling', 'according size', 'achievable traditional', 'achieve attempt', 'achieve best', 'achieve efficient', 'achieve separation', 'achieve spatial', 'achieve temporal', 'achieve using', 'achieved furthermore', 'achieved mean', 'achieved relatively', 'achieving algorithm', 'action ragging', 'active doe', 'actuated respective', 'addition invention', 'addition switch', 'address deficiency', 'adequately end', 'adhere difficult', 'adhere particle', 'adhering sand', 'adhering water', 'adhesion large', 'adopted people', 'advance increasing', 'advance portable', 'advanced mobile', 'advantage come', 'advantage motion', 'advantage transmiss

#### Explorando as contagens (bigramas mais frequentes no texto)

In [13]:
# Reference:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# Column sums give the total count of every bigram over the whole corpus.
sum_bigrams = X_train_bigram_counts.sum(axis=0)
# Pair each bigram with its total and order the pairs from most to least frequent.
bigram_freq = sorted(
    ((term, sum_bigrams[0, column]) for term, column in bigram_vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [14]:
# Display the (bigram, total count) pairs, highest count first.
bigram_freq

[('data compression', 22),
 ('invention relates', 21),
 ('present invention', 17),
 ('fluidized bed', 10),
 ('run length', 10),
 ('fluid medium', 8),
 ('field invention', 8),
 ('background invention', 7),
 ('audio signal', 7),
 ('analog signal', 7),
 ('digital signal', 7),
 ('variable length', 7),
 ('hutch chamber', 6),
 ('invention present', 6),
 ('power amplifier', 6),
 ('output signal', 6),
 ('switching mean', 6),
 ('variable run', 6),
 ('length encoded', 6),
 ('encoded data', 6),
 ('conveyor belt', 5),
 ('mixture particle', 5),
 ('control signal', 5),
 ('information signal', 5),
 ('digital video', 5),
 ('separation table', 4),
 ('sea sand', 4),
 ('invention invention', 4),
 ('axial region', 4),
 ('lighter particle', 4),
 ('heavier particle', 4),
 ('relates generally', 4),
 ('particle according', 4),
 ('size density', 4),
 ('compression parameter', 4),
 ('communication link', 4),
 ('compression expansion', 4),
 ('length decoder', 4),
 ('length decoding', 4),
 ('run value', 4),
 ('va

### Define um vetorizador do tipo TFIDF

In [15]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import TfidfVectorizer

# Same lemma tokenizer as the count vectorizers, but weighting terms by TF-IDF.
# Stop words come from scikit-learn's built-in 'english' list.
tfidf_vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', min_df=1)

#### Cria um vetorizador já ajustado ao texto

In [16]:
import pandas as pd
import pickle

corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
classes = pd.read_csv('toy.csv',encoding='utf8')['subgroup'].values.tolist()

# In the other examples fit and transform are done in one step with fit_transform.
# They are separated here so the fitted vectorizer can be written to disk and
# reused later to vectorize new texts.
# NOTE(review): `classes` is passed as the y argument of fit(); presumably the
# TF-IDF vectorizer does not use it — confirm before relying on it.
tfidf_vectorizer = tfidf_vectorizer.fit(corpus,classes) # fits the vectorizer
X_train_tfidf = tfidf_vectorizer.transform(corpus) # turns the texts into a sparse TF-IDF matrix

# Save the fitted vectorizer and the vectorized matrix to disk. The with-blocks
# guarantee that each file is flushed and closed as soon as the dump finishes.
# The pickled vectorizer holds a LemmaTokenizer instance, so that class must be
# defined (or importable) wherever the pickle is loaded.
with open("tfidf_vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open("X_train_tfidf.pickle", "wb") as matrix_file:
    pickle.dump(X_train_tfidf, matrix_file)

In [17]:
# Shape of the TF-IDF matrix: (number of documents, vocabulary size).
print("Formato Matriz sparsa gerada (numdocs,features) ==> ", X_train_tfidf.shape)
# Column index assigned to the word 'applicable' (None if the word is unknown).
print("Indice de uma palavra ==>", tfidf_vectorizer.vocabulary_.get('applicable'))


Formato Matriz sparsa gerada (numdocs,features) ==>  (17, 943)
Indice de uma palavra ==> 42


In [18]:
# Dense TF-IDF vector of the 6th document (index 5); the whole matrix is densified first.
print("Representacao de um documento ==> ", X_train_tfidf.toarray()[5])

Representacao de um documento ==>  [0.         0.         0.         0.         0.         0.03991597
 0.         0.         0.036984   0.04762799 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.1824353  0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.05309975 0.         0.         0.
 0.         0.05309975 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.06081177 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0

In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Ordering the vocabulary by column index yields the same list on every version.
tfidf_feature_names = sorted(tfidf_vectorizer.vocabulary_, key=tfidf_vectorizer.vocabulary_.get)
print('Dicionario: %s' %tfidf_feature_names)

Dicionario: ['ability', 'able', 'accelerating', 'accompanying', 'accomplished', 'accordance', 'according', 'achievable', 'achieve', 'achieved', 'achieving', 'action', 'active', 'actuated', 'addition', 'address', 'adequately', 'adhere', 'adhering', 'adhesion', 'adopted', 'advance', 'advanced', 'advantage', 'affect', 'aforementioned', 'agricultural', 'aim', 'aimed', 'air', 'albeit', 'algorithm', 'align', 'allows', 'alternate', 'amp', 'amplified', 'amplifier', 'amplitude', 'analog', 'andw', 'apparatus', 'applicable', 'application', 'applied', 'appreciated', 'area', 'arecircumferentially', 'arithmetic', 'arrangement', 'art', 'aspect', 'assembly', 'associated', 'atmosphere', 'attempt', 'attenuated', 'attractive', 'audible', 'audio', 'auto', 'autocalibrating', 'axial', 'axis', 'background', 'baffle', 'band', 'bandwidth', 'barrier', 'base', 'baseband', 'based', 'basis', 'battery', 'bean', 'bed', 'belt', 'beneficiation', 'benefit', 'best', 'bias', 'bin', 'bit', 'blinded', 'block', 'board', 'bo

#### Explorando os dados (ordenando as palavras com maior tfidf nos documentos)

In [20]:
# Reference:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# Column sums of the TF-IDF matrix: despite the "idfs" in the names, these are
# summed TF-IDF weights per term, not IDF values.
sum_idfs = X_train_tfidf.sum(axis=0)
# Pair each term with its summed weight and order the pairs from highest to lowest.
words_idfs = sorted(
    ((term, sum_idfs[0, column]) for term, column in tfidf_vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [21]:
# Display the (word, summed TF-IDF weight) pairs, highest weight first.
words_idfs

[('particle', 2.169805797865955),
 ('signal', 1.6817003706033522),
 ('data', 1.2889835585706229),
 ('water', 1.115427482004786),
 ('invention', 1.0121411687545774),
 ('compression', 0.9706513675022193),
 ('separation', 0.9623513964977484),
 ('sand', 0.8412236138996114),
 ('mean', 0.831698451076702),
 ('digital', 0.7704264859657949),
 ('amplifier', 0.6901226693160083),
 ('material', 0.6855520705391883),
 ('size', 0.6355129865019954),
 ('communication', 0.6116426477244803),
 ('length', 0.5781633508521442),
 ('fluidized', 0.5459952761644483),
 ('audio', 0.5458318841173877),
 ('output', 0.5383337523086837),
 ('object', 0.5256523947926989),
 ('method', 0.5255775078333834),
 ('bed', 0.5244706957034525),
 ('run', 0.5241528416396466),
 ('sieve', 0.5115231595558617),
 ('analog', 0.5084148565807611),
 ('heavier', 0.5035971671408396),
 ('video', 0.4984932695867639),
 ('present', 0.47982582563144166),
 ('density', 0.4789738438761441),
 ('variable', 0.47218734915463856),
 ('medium', 0.4695388196669

#### Montando vetorizador tfidf para bigramas

In [22]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting over bigrams only: ngram_range=(2, 2) excludes unigrams.
tfidf_bigram_vectorizer = TfidfVectorizer(ngram_range=(2, 2), tokenizer=LemmaTokenizer(), stop_words='english') 


In [23]:
# Stream the 'data' column of toy.csv as raw strings.
corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
X_train_bigram_tfidf = tfidf_bigram_vectorizer.fit_transform(corpus) # returns a sparse document-bigram TF-IDF matrix

In [24]:
# Reference:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# Column sums of the bigram TF-IDF matrix: summed weights per bigram, not raw counts.
sum_bigrams_tfidf = X_train_bigram_tfidf.sum(axis=0)
# Pair each bigram with its summed weight and order the pairs from highest to lowest.
bigram_tfidf_freq = sorted(
    ((term, sum_bigrams_tfidf[0, column]) for term, column in tfidf_bigram_vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [25]:
# Display the (bigram, summed TF-IDF weight) pairs, highest weight first.
bigram_tfidf_freq

[('data compression', 1.0029153208191701),
 ('fluidized bed', 0.592051974695919),
 ('present invention', 0.5608455428064585),
 ('fluid medium', 0.504717968456992),
 ('run length', 0.48283963144449693),
 ('invention relates', 0.4505300113967071),
 ('analog signal', 0.4499267319693905),
 ('digital signal', 0.4499267319693905),
 ('audio signal', 0.4239695262321941),
 ('power amplifier', 0.40076109477008126),
 ('hutch chamber', 0.3957551993426972),
 ('variable length', 0.3559016377011121),
 ('digital video', 0.3284245281945347),
 ('output signal', 0.32739374008841965),
 ('switching mean', 0.32564534139341594),
 ('mixture particle', 0.31544873028562),
 ('separation table', 0.30187226881284523),
 ('sea sand', 0.2980614132328945),
 ('field invention', 0.29683157890454775),
 ('conveyor belt', 0.2960259873479595),
 ('lighter particle', 0.29584157142954537),
 ('heavier particle', 0.29584157142954537),
 ('variable run', 0.2897037788666981),
 ('length encoded', 0.2897037788666981),
 ('encoded data

## Vetorização com word embeddings pré-treinados (GloVe, carregados pelo Gensim)

In [26]:
# Use previously built word vectors, fetched through gensim's downloader.
# NOTE(review): the requested name is "glove-wiki-gigaword-100", which reads as GloVe
# vectors rather than a word2vec-trained model — confirm the wording used in this notebook.
import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-100")

In [27]:
#https://radimrehurek.com/gensim/models/word2vec.html
# Embedding of a single word; per the printed output it has 100 components.
print(word_vectors.get_vector('decontaminating'))

[-0.061385  0.12561  -0.56048   0.6487   -0.32806   0.043879 -0.13959
  0.26864   0.14186   0.4154   -0.10722  -0.26249   0.19023  -0.11717
  0.068604 -0.10567  -0.21894   0.30015   0.35531  -0.37637  -0.42679
 -0.15778   0.15872  -0.41037  -0.69795  -0.20353  -0.26681   0.17732
 -0.50042  -0.24031   0.07537  -0.61731   0.31157   0.046592  0.3206
 -0.46434  -0.28834  -0.7473   -0.33093   0.10785  -0.053039 -0.17338
  0.19379   0.37051   0.044132  0.46138  -0.23005  -0.093229 -0.44662
 -0.22174   0.075787  0.016844 -0.62861  -0.04128  -0.32616   0.85561
  0.20586   0.06525  -0.3154   -0.25991  -0.18751   0.088262  0.64952
  0.05752   0.082394  0.36783   0.27744  -0.07469  -0.43565  -0.46485
 -0.29916   0.10817   0.053593  0.74381   0.11528   0.47941   0.1998
  0.43718   0.3574    0.087149  0.20455   0.16908   0.030611  0.97797
  0.43876   0.19856   0.3481    0.30082  -0.76387   0.14109  -0.19316
  0.044695  0.221    -0.18177  -0.10391  -0.6672    0.099318  0.64056
  0.25906   0.36643 ]


In [28]:
## DO NOT USE WITH STEMMED TOKENS: many stems will not be found in the embedding vocabulary
import numpy as np
class Word2VecVectorizer:
    """Represent each tokenized document as the mean of its word embeddings.

    Parameters
    ----------
    word_vectors : object
        Embedding lookup exposing ``get_vector(word)`` (raising ``KeyError`` for
        unknown words) and, preferably, a ``vector_size`` attribute, as gensim
        ``KeyedVectors`` do.
    """

    def __init__(self, word_vectors):
        self.word_vectors = word_vectors

    def fit(self, lst_tokens):
        """Nothing is learned from the data; returns ``self`` so calls can be chained."""
        return self

    def transform(self, lst_tokens):
        """Average the embeddings of the known words of every document.

        Parameters
        ----------
        lst_tokens : sequence of sequences of str
            One token list per document; must support ``len()``.

        Returns
        -------
        numpy.ndarray of shape (len(lst_tokens), D)
            Row i is the mean embedding of document i. Documents without any
            known word keep an all-zero row. A message is printed for every
            token that has no embedding.
        """
        # Read the dimensionality from the instance's own vectors (the original
        # used the notebook-global `word_vectors`). `vector_size` exists in both
        # gensim 3.x and 4.x; `index2word` is only probed as a fallback because
        # gensim 4 removed it.
        size = getattr(self.word_vectors, 'vector_size', None)
        if size is None:
            first_word = self.word_vectors.index2word[0]
            size = self.word_vectors.get_vector(first_word).shape[0]
        self.D = size

        X = np.zeros((len(lst_tokens), self.D))
        for row, tokens in enumerate(lst_tokens):
            vecs = []
            for word in tokens:
                try:
                    vecs.append(self.word_vectors.get_vector(word))
                except KeyError:
                    # Out-of-vocabulary word: report it and leave it out of the mean.
                    print('Palavra ',word,' nao pode ser representada')
            if len(vecs) > 0:
                X[row] = np.array(vecs).mean(axis=0)
        return X

    def fit_transform(self, lst_tokens):
        """Call ``fit`` and then ``transform`` on the same token lists."""
        self.fit(lst_tokens)
        return self.transform(lst_tokens)

### Utilizando para vetorizar os textos

In [29]:
# Use the dictionary and the bag-of-words corpus to recover the tokens of each text.
# Reminder: the bag-of-words corpus should not be built from stems, otherwise many words may have no embedding.
dictionary = gensim.corpora.Dictionary.load("dictionary.dict") # loads the dictionary from disk
idx2wordDictionary = {k:v for k, v in dictionary.iteritems()} # id -> token lookup used to decode the ids below

bowcorpus = gensim.corpora.MmCorpus('bowcorpus.mm') # reads the corpus stored in bag-of-words form
# Each text is a list of (id, frequency) pairs; the frequency is discarded, so every
# distinct word appears once per document in `tokens`.
tokens = [[idx2wordDictionary[idx] for idx,freq in text] for text in bowcorpus]
print(tokens[0])

['accompanying', 'according', 'air', 'also', 'another', 'apparatus', 'applicable', 'arrangement', 'atmosphere', 'belt', 'case', 'closed', 'closely', 'contains', 'conveyor', 'course', 'cyclone', 'damaging', 'described', 'design', 'drawing', 'eliminate', 'embodiment', 'end', 'environment', 'equipment', 'essentially', 'fan', 'fed', 'feeding', 'flute', 'following', 'found', 'front', 'furthermore', 'granulate', 'heavier', 'heavy', 'horizontal', 'idea', 'instance', 'intended', 'invention', 'known', 'let', 'light', 'like', 'made', 'making', 'material', 'mean', 'mentioned', 'metal', 'method', 'mm', 'often', 'one', 'particularly', 'possible', 'preferred', 'previously', 'problem', 'provide', 'purpose', 'rear', 'reference', 'referring', 'relates', 'separate', 'separated', 'separating', 'separation', 'shall', 'shown', 'size', 'society', 'stone', 'substance', 'sucked', 'suitable', 'system', 'table', 'technique', 'today', 'type', 'unsatisfying', 'vibrator']


In [30]:
w2v_vectorizer_glove_wiki_gigaword_100 = Word2VecVectorizer(word_vectors)

# One row per training document: the mean embedding of its tokens.
X_train_w2v_glove_wiki_gigaword_100 = w2v_vectorizer_glove_wiki_gigaword_100.fit_transform(tokens)

# Save the vectorized matrix to disk (only the matrix is written; the vectorizer is not).
# The with-block makes sure the file is flushed and closed after the dump.
with open("X_train_w2v_glove_wiki_gigaword_100.pickle", "wb") as matrix_file:
    pickle.dump(X_train_w2v_glove_wiki_gigaword_100, matrix_file)



Palavra  granulate  nao pode ser representada
Palavra  knownas  nao pode ser representada
Palavra  andw  nao pode ser representada
Palavra  arecircumferentially  nao pode ser representada
Palavra  elutriators  nao pode ser representada
Palavra  fractionator  nao pode ser representada
Palavra  elutriators  nao pode ser representada
Palavra  crystalliser  nao pode ser representada
Palavra  maimoni  nao pode ser representada
Palavra  destoners  nao pode ser representada
Palavra  destoning  nao pode ser representada
Palavra  autocalibrating  nao pode ser representada
Palavra  intermods  nao pode ser representada
Palavra  linearizing  nao pode ser representada
Palavra  highfidelity  nao pode ser representada
Palavra  relatesto  nao pode ser representada
Palavra  seriaxo  nao pode ser representada
Palavra  macroblock  nao pode ser representada
Palavra  standardand  nao pode ser representada
Palavra  vlds  nao pode ser representada
Palavra  variablelength  nao pode ser representada


In [31]:
# Mean embedding computed for the first training document.
print(X_train_w2v_glove_wiki_gigaword_100[0]) #textos_vetorizados[0])

[-0.23921558  0.2493951   0.16395316  0.05970823  0.00532105  0.0370189
 -0.03392747  0.09929323  0.01442124  0.07689162  0.05401036 -0.18074922
  0.2329855   0.06702612  0.18081895 -0.15344752  0.08472463 -0.01564027
 -0.08325448 -0.06679938  0.08063343 -0.16584939  0.03957288 -0.0980863
  0.15577067 -0.07036316 -0.12082733 -0.24162252 -0.13095909  0.05381934
  0.03505604  0.2674365  -0.14877279 -0.09853061  0.15421015  0.08440775
  0.14060508 -0.01055216 -0.00754607 -0.14585456 -0.24540444 -0.20722479
  0.07392792 -0.1478743  -0.00953007 -0.05369928  0.12556256 -0.03529881
 -0.142076   -0.50880647  0.18693753  0.04949445  0.1444823   0.82280952
 -0.05083755 -1.62376773  0.0267164  -0.14820509  1.20019162  0.33381328
 -0.08156557  0.52314335 -0.02158269  0.19629997  0.69431192 -0.06532827
  0.22603481  0.04286725  0.14514609 -0.20213999 -0.06763455 -0.13415229
  0.25371772  0.02119819  0.17736264  0.02424286  0.04225962 -0.04253327
 -0.47848541  0.03005515  0.25102863 -0.02733909 -0.3

In [32]:
# (number of documents, embedding dimensions) of the training matrix.
X_train_w2v_glove_wiki_gigaword_100.shape

(17, 100)

#### Transformando textos de teste

In [33]:
# Raw strings from the 'data' column of the test file (no tokenizer is given to readCorpus).
corpus_test = readCorpus("testtoy.csv",list_of_fields_to_read=['data'])

lem = LemmaTokenizer()

# Tokenize and lemmatize every test document.
# NOTE(review): the printed test tokens still contain stop words ('and', 'for', 'the')
# and repeated words, while the training tokens decoded from bowcorpus.mm show neither —
# confirm that train and test texts are meant to be preprocessed differently.
test_lem_tokens = list(map(lem, corpus_test))
print(test_lem_tokens[0])


['method', 'and', 'apparats', 'for', 'decontaminating', 'liquid', 'suspension', 'field', 'the', 'invention', 'this', 'invention', 'relates', 'general', 'the', 'decontamination', 'liquid', 'suspension', 'and', 'particular', 'the', 'decontamination', 'aqueous', 'paper', 'pulp', 'clay', 'slurry', 'background', 'the', 'invention', 'the', 'art', 'paper', 'manufacturing', 'decontamination', 'the', 'paper', 'pulp', 'primary', 'importance', 'achieve', 'consistent', 'paper', 'product', 'particular', 'recycling', 'waste', 'paper', 'requires', 'extensive', 'cleansing', 'the', 'aqueous', 'paper', 'pulp', 'remove', 'extraneous', 'contaminates', 'waste', 'paper', 'material', 'present', 'challenge', 'provide', 'economically', 'feasible', 'mean', 'recycling', 'which', 'yield', 'acceptable', 'paper', 'product', 'contaminates', 'may', 'grouped', 'into', 'one', 'three', 'class', 'first', 'elongated', 'flexible', 'material', 'such', 'piece', 'cord', 'fabric', 'and', 'wire', 'can', 'removed', 'from', 'the'

In [34]:
# Test data is only transformed. Word2VecVectorizer.fit learns nothing, so the result
# is the same as with fit_transform, but this makes explicit that nothing is fitted on test texts.
X_test_w2v_glove_wiki_gigaword_100 = w2v_vectorizer_glove_wiki_gigaword_100.transform(test_lem_tokens)

# Save the vectorized test matrix to disk (only the matrix is written; the vectorizer is not).
# The with-block makes sure the file is flushed and closed after the dump.
with open("X_test_w2v_glove_wiki_gigaword_100.pickle", "wb") as matrix_file:
    pickle.dump(X_test_w2v_glove_wiki_gigaword_100, matrix_file)



Palavra  apparats  nao pode ser representada
Palavra  ragger  nao pode ser representada
Palavra  ragger  nao pode ser representada
Palavra  pulper  nao pode ser representada
Palavra  therethrough  nao pode ser representada
Palavra  pseudophasic  nao pode ser representada
Palavra  polyvinylpyrrolidone  nao pode ser representada
Palavra  interelectrode  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  databits  nao pode ser representada
Palavra  backaround  nao pode ser representada
Palavra  multibit  nao pode ser r