# Tidene Códigos - Vetorizadores

In [16]:
import gensim
import nltk
import numpy as np
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
nltk.download('punkt')
nltk.download('wordnet')

# Load the labelled reviews and keep 70% of the rows for training; the
# remaining rows become the test split.
pd_corpus = pd.read_csv('data/train_complete.csv', encoding='utf8')
pd_corpus, corpus_test = train_test_split(pd_corpus, train_size=0.7)
classes_train = pd_corpus['sentiment']
# Strip HTML markup from each training review individually.  Calling
# str() on the whole Series would only give its truncated printed form and
# collapse the corpus into one single string, while the vectorizers below
# expect an iterable with one text per document.
corpus = [
    BeautifulSoup(str(review), "html.parser").get_text()
    for review in pd_corpus['review']
]
corpus_test = corpus_test['review']



[nltk_data] Downloading package punkt to /home/bruno/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/bruno/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Classe que encapsula o tipo de tokenização e a sequência de limpeza a ser realizada nos textos

#### Utilitários - leitor de corpus e tokenizadores

In [8]:

import csv

class readCorpus(object):
    """Stream selected columns of a CSV file, one data row at a time.

    The first row of the file is treated as the header.  The object can be
    iterated several times; the file is reopened on every iteration and
    closed again when the iteration ends.

    Parameters
    ----------
    csvfile : str
        Path of the CSV file to read.
    list_of_fields_to_read : list of str, str or None
        Header names of the columns to yield.  Empty or ``None`` selects
        every column; a single string selects that one column.
    tokenizer : object or callable, optional
        Either an object exposing ``tokenize(text)`` or a plain callable.
        It is applied only when exactly one column is selected.
    encoding : str
        Text encoding of the file; undecodable bytes are ignored.

    Yields
    ------
    list of str
        The selected cells of the row, when more than one column is selected.
    str or tokenizer output
        The single selected cell (tokenized when a tokenizer was given).

    Raises
    ------
    ValueError
        If none of the requested fields is present in the header.
    """

    def __init__(self, csvfile, list_of_fields_to_read=None, tokenizer=None, encoding='utf8'):
        self.csvfile = csvfile
        if isinstance(list_of_fields_to_read, str):
            # A bare string means "this one column", not a sequence of characters.
            list_of_fields_to_read = [list_of_fields_to_read]
        self.fields = list(list_of_fields_to_read) if list_of_fields_to_read else []
        self.tokenizer = tokenizer
        self.encoding = encoding

    def __iter__(self):
        # newline='' lets the csv module handle line breaks inside quoted fields.
        with open(self.csvfile, encoding=self.encoding, errors='ignore', newline='') as f:
            reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_MINIMAL)
            headers = next(reader, None)
            if headers is None:
                # Empty file: nothing to yield.
                return

            wanted = self.fields if self.fields else headers
            selected = [idx for idx, field in enumerate(headers) if field in wanted]
            if not selected:
                raise ValueError(
                    "none of the requested fields %r found in header %r" % (wanted, headers)
                )

            # Accept both nltk-style tokenizers (with .tokenize) and plain callables.
            tokenize = getattr(self.tokenizer, 'tokenize', self.tokenizer) if self.tokenizer else None

            for line in reader:
                if not line:
                    continue  # skip blank lines
                if len(selected) > 1:
                    yield [line[idx] for idx in selected]
                elif tokenize is None:
                    yield line[selected[0]]
                else:
                    yield tokenize(line[selected[0]])
                        

In [9]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import *    #https://www.nltk.org/api/nltk.tokenize.html

class LemmaTokenizer(object):
    """Callable tokenizer: regex-split a text, then lemmatize each token.

    Tokens are runs of ASCII letters and apostrophes; tokens with two
    characters or fewer are dropped before lemmatization.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")

    def __call__(self, doc):
        """Return the list of lemmas of the tokens of *doc* longer than 2 chars."""
        lemmas = []
        for token in self.tokenizer.tokenize(doc):
            if len(token) <= 2:
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

class StemTokenizer(object):
    """Callable tokenizer: regex-split a text, then Porter-stem each token.

    Tokens are runs of ASCII letters and apostrophes; tokens with two
    characters or fewer are dropped before stemming.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z']+")

    def __call__(self, doc):
        """Return the list of stems of the tokens of *doc* longer than 2 chars."""
        long_enough = (tok for tok in self.tokenizer.tokenize(doc) if len(tok) > 2)
        return [self.stemmer.stem(tok) for tok in long_enough]


In [10]:
from nltk.corpus import stopwords
from nltk import download
# Make sure the NLTK stop-word lists are available locally, then keep the
# English list for the vectorizers defined below.
download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/bruno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Define um vetorizador do tipo contagem de frequência (bag-of-words)

In [None]:
#http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: tokenizes with the LemmaTokenizer class defined
# above and filters with the NLTK English stop-word list.
count_vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words=stop_words,
)


#### Passa os textos pelo vetorizador

In [None]:
#corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])

X_train_counts = count_vectorizer.fit_transform(corpus) # learns the vocabulary and yields a sparse document-term count matrix

In [None]:
print(X_train_counts[0])

#### O vetorizador também pode ser utilizado para transformar um texto não visto

In [None]:
# Vectorize a text that was not part of the fitted corpus, reusing the
# vocabulary already learned by count_vectorizer (transform only, no refit).
texto_nao_visto = count_vectorizer.transform(['New film review'])
print(texto_nao_visto)

#### A partir daí temos

In [None]:
# Shape of the count matrix, i.e. (number of documents, number of features).
print("Formato Matriz sparsa gerada (numdocs,features) ==> ", X_train_counts.shape)
# Row 5 is the sixth document (0-based indexing).
print("Representacao de um documento (o 6º) ==> ", X_train_counts[5])
# Column index assigned to the term 'film'; None when the term is not in the vocabulary.
print("Indice de uma palavra ('film') ==>", count_vectorizer.vocabulary_.get('film'))


#### Explorando as contagens (palavras mais frequentes no texto)

In [None]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# Column sums of the count matrix: total occurrences of each term in the corpus.
sum_words = X_train_counts.sum(axis=0)
words_freq = [
    (word, sum_words[0, idx])
    for word, idx in count_vectorizer.vocabulary_.items()
]
# Most frequent terms first.
words_freq.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
words_freq

### Define um vetorizador do tipo contagem de frequência de bi-gramas (na tentativa de juntar palavras que aparecem sempre juntas)


In [None]:
#http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer restricted to bigrams (pairs of consecutive tokens).
bigram_vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    tokenizer=LemmaTokenizer(),
    stop_words='english',
    min_df=1,
)

In [None]:
#corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
X_train_bigram_counts = bigram_vectorizer.fit_transform(corpus) # learns the bigram vocabulary and yields a sparse count matrix
#print('Dicionario: %s' %bigram_vectorizer.get_feature_names())

#### Explorando as contagens (palavras mais frequentes no texto)

In [None]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# Column sums of the bigram count matrix: total occurrences of each bigram.
sum_bigrams = X_train_bigram_counts.sum(axis=0)
bigram_freq = [
    (bigram, sum_bigrams[0, idx])
    for bigram, idx in bigram_vectorizer.vocabulary_.items()
]
# Most frequent bigrams first.
bigram_freq.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
bigram_freq

### Define um vetorizador do tipo TFIDF

In [12]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer over lemmatized unigrams.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words='english',
    min_df=1,
)

#### Cria um vetorizador já ajustado ao texto

In [13]:
import pandas as pd
import pickle

#corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
classes = pd_corpus['sentiment'].values.tolist()

# In the other examples fit and transform are done in a single step through
# fit_transform.  They are kept separate here so the fitted vectorizer can be
# written to disk and reused later to vectorize new texts.
tfidf_vectorizer = tfidf_vectorizer.fit(corpus, classes)  # fit the vectorizer
X_train_tfidf = tfidf_vectorizer.transform(corpus)  # turn the texts into a sparse matrix

# Persist the fitted vectorizer and the vectorized matrix.  The context
# managers guarantee the files are flushed and closed even if pickling fails.
with open("tfidf_vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open("X_train_tfidf.pickle", "wb") as matrix_file:
    pickle.dump(X_train_tfidf, matrix_file)

  sorted(inconsistent))


In [None]:
# Shape of the TF-IDF matrix, i.e. (number of documents, number of features).
print("Formato Matriz sparsa gerada (numdocs,features) ==> ", X_train_tfidf.shape)
# Column index assigned to the term 'film'; None when the term is not in the vocabulary.
print("Indice de uma palavra ==>", tfidf_vectorizer.vocabulary_.get('film'))


In [None]:
print("Representacao de um documento ==> ", X_train_tfidf[5])

In [None]:
# Recent scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever one the installed version provides.
_feature_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
print('Dicionario: %s' % list(map(str, _feature_names())))

#### Explorando os dados (ordenando as palavras com maior tfidf nos documentos)

In [None]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# Column sums of the TF-IDF matrix: accumulated tf-idf weight of each term.
sum_idfs = X_train_tfidf.sum(axis=0)
words_idfs = [
    (word, sum_idfs[0, idx])
    for word, idx in tfidf_vectorizer.vocabulary_.items()
]
# Terms with the largest accumulated weight first.
words_idfs.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
words_idfs

#### Montando vetorizador tfidf para bigramas

In [None]:
# http://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer restricted to bigrams (pairs of consecutive tokens).
tfidf_bigram_vectorizer = TfidfVectorizer(
    ngram_range=(2, 2),
    tokenizer=LemmaTokenizer(),
    stop_words='english',
)


In [None]:
#corpus = readCorpus("toy.csv",list_of_fields_to_read=['data'])
X_train_bigram_tfidf = tfidf_bigram_vectorizer.fit_transform(corpus) # learns the bigram vocabulary and yields a sparse tf-idf matrix

In [None]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
# Column sums of the bigram TF-IDF matrix: accumulated tf-idf weight of each bigram.
sum_bigrams_tfidf = X_train_bigram_tfidf.sum(axis=0)
bigram_tfidf_freq = [
    (bigram, sum_bigrams_tfidf[0, idx])
    for bigram, idx in tfidf_bigram_vectorizer.vocabulary_.items()
]
# Bigrams with the largest accumulated weight first.
bigram_tfidf_freq.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
bigram_tfidf_freq

## Vetorização com o word2vec do Gensim

In [None]:
# use a previously built word-embedding model, fetched through the gensim
# downloader (GloVe vectors, judging by the model name)
import gensim.downloader as api
word_vectors = api.load("glove-wiki-gigaword-100")

In [None]:
#https://radimrehurek.com/gensim/models/word2vec.html
# embedding of a single word in the loaded model (presumably 100-dimensional,
# going by the model name -- TODO confirm)
print(word_vectors.get_vector('film'))

In [None]:
## DO NOT USE WITH STEMMED TOKENS: many stems will not be found in the embedding vocabulary
import numpy as np
class Word2VecVectorizer:
    """Represent each tokenized text as the mean of its word embeddings.

    Do not feed stemmed tokens: many stems are not present in the embedding
    vocabulary and would simply be skipped.

    Parameters
    ----------
    word_vectors
        Pre-trained keyed vectors exposing ``get_vector(word)`` (raising
        ``KeyError`` for unknown words) and a ``vector_size`` attribute.

    Attributes
    ----------
    D : int
        Embedding dimensionality, set by ``transform``.
    empty_count : int
        Number of texts in the last ``transform`` call for which no token
        had an embedding (their row is left as all zeros).
    """

    def __init__(self, word_vectors):
        self.word_vectors = word_vectors

    def fit(self, lst_tokens):
        """Do nothing (the embeddings are pre-trained) and return ``self``."""
        return self

    def transform(self, lst_tokens):
        """Return an array of shape (number of texts, D) of averaged embeddings.

        ``lst_tokens`` is an iterable of token lists, one per text.  Words
        without an embedding are ignored; a text with no known word at all
        gets a zero vector.
        """
        # Materialize so generators are accepted and len() is available.
        lst_tokens = list(lst_tokens)
        # Read the dimensionality from the instance's own vectors rather than
        # from a module-level variable.
        self.D = self.word_vectors.vector_size
        X = np.zeros((len(lst_tokens), self.D))
        self.empty_count = 0
        for n, tokens in enumerate(lst_tokens):
            vecs = []
            for word in tokens:
                try:
                    vecs.append(self.word_vectors.get_vector(word))
                except KeyError:
                    # Out-of-vocabulary word: skip it.
                    continue
            if vecs:
                X[n] = np.mean(vecs, axis=0)
            else:
                self.empty_count += 1
        return X

    def fit_transform(self, lst_tokens):
        """Equivalent to ``fit(lst_tokens)`` followed by ``transform(lst_tokens)``."""
        lst_tokens = list(lst_tokens)
        self.fit(lst_tokens)
        return self.transform(lst_tokens)

### Utilizando o vetorizador para vetorizar os textos

In [None]:
# Recover the tokens of every document from the saved dictionary and
# bag-of-words corpus.
# Reminder: the bag-of-words corpus should not be built from stems, otherwise
# many words may not be representable by the embeddings.
dictionary = gensim.corpora.Dictionary.load("dictionary.dict")  # load the dictionary from disk
idx2wordDictionary = dict(dictionary.iteritems())  # plain id -> token mapping
lem = LemmaTokenizer()
bowcorpus = gensim.corpora.MmCorpus('bowcorpus.mm')  # read the corpus stored as bag-of-words
tokens = [
    [idx2wordDictionary[idx] for idx, freq in text]
    for text in bowcorpus
]
print(tokens[0])

In [None]:
w2v_vectorizer_glove_wiki_gigaword_100 = Word2VecVectorizer(word_vectors)

X_train_w2v_glove_wiki_gigaword_100 = w2v_vectorizer_glove_wiki_gigaword_100.fit_transform(tokens)

#np.save('w2vmatrix.npy', textos_vetorizados) # saving to disk

# Persist the vectorized training matrix; the context manager guarantees the
# file is flushed and closed even if pickling fails.
with open("X_train_w2v_glove_wiki_gigaword_100.pickle", "wb") as train_matrix_file:
    pickle.dump(X_train_w2v_glove_wiki_gigaword_100, train_matrix_file)



In [None]:
print(X_train_w2v_glove_wiki_gigaword_100[0]) #textos_vetorizados[0])

In [None]:
X_train_w2v_glove_wiki_gigaword_100.shape

#### Transformando textos de teste

In [None]:
#corpus_test = readCorpus("testtoy.csv",list_of_fields_to_read=['data'])

# Tokenize and lemmatize every test text with the same tokenizer class used
# elsewhere in this notebook.
lem = LemmaTokenizer()

test_lem_tokens = list(map(lem, corpus_test))
print(test_lem_tokens[0])


In [None]:
# Test data must only be transformed, never fitted on.  fit() is a no-op for
# this vectorizer, so the resulting matrix is the same as with fit_transform.
X_test_w2v_glove_wiki_gigaword_100 = w2v_vectorizer_glove_wiki_gigaword_100.transform(test_lem_tokens)

# Persist the vectorized test matrix; the context manager guarantees the file
# is flushed and closed even if pickling fails.
with open("X_test_w2v_glove_wiki_gigaword_100.pickle", "wb") as test_matrix_file:
    pickle.dump(X_test_w2v_glove_wiki_gigaword_100, test_matrix_file)

