In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv

### Tokenização

In [2]:
# Sample sentences used to build the vocabulary
sentences = [
    'eu amo meu chachorro',
    'eu não tenho gatos',
    'gosto de brincar, se divertir e viajar',
]

In [3]:
# Create the tokenizer, configured with num_words=100
tokenizer = Tokenizer(num_words=100)

# Build the word -> index mapping from the sample sentences
tokenizer.fit_on_texts(sentences)

# Show the index assigned to each word
word_index = tokenizer.word_index
print(word_index)

{'eu': 1, 'amo': 2, 'meu': 3, 'chachorro': 4, 'não': 5, 'tenho': 6, 'gatos': 7, 'gosto': 8, 'de': 9, 'brincar': 10, 'se': 11, 'divertir': 12, 'e': 13, 'viajar': 14}


### Texto para sequências

In [4]:
# Same sample sentences as before, plus a fourth one that introduces new words
sentences = [
    'eu amo meu chachorro',
    'eu não tenho gatos',
    'gosto de brincar, se divertir e viajar',
    'Você acha que meu cachorro é bonito',
]

In [5]:
# Create the tokenizer with an out-of-vocabulary token, i.e. the placeholder
# entry used for words that are not found in the index.
# NOTE(review): the OOV token here is an empty string, whereas fit_tokenizer
# further down uses "<OOV>" — confirm the empty string is intended.
tokenizer = Tokenizer(oov_token="", num_words=100)

# Build the word -> index mapping from the sentences
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every sentence as a list of token indices
sequences = tokenizer.texts_to_sequences(sentences)

# Show the results
print("\nWord Index = " , word_index)
print("\nSequences = " , sequences)


Word Index =  {'': 1, 'eu': 2, 'meu': 3, 'amo': 4, 'chachorro': 5, 'não': 6, 'tenho': 7, 'gatos': 8, 'gosto': 9, 'de': 10, 'brincar': 11, 'se': 12, 'divertir': 13, 'e': 14, 'viajar': 15, 'você': 16, 'acha': 17, 'que': 18, 'cachorro': 19, 'é': 20, 'bonito': 21}

Sequences =  [[2, 4, 3, 5], [2, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 3, 19, 20, 21]]


### Padding

In [6]:
# Bring every sequence to the same length (maxlen=7)
padded = pad_sequences(sequences, maxlen=7)

# Show the resulting matrix
print(padded)

[[ 0  0  0  2  4  3  5]
 [ 0  0  0  2  6  7  8]
 [ 9 10 11 12 13 14 15]
 [16 17 18  3 19 20 21]]


### Exploração e processamento do arquivo BBC news

In [7]:
# Load the BBC news CSV into a DataFrame and preview its first three rows
dados = pd.read_csv('dados/bbc-text.csv', sep=',')
dados.head(3)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...


In [11]:
# Category of the first row
dados['category'].iloc[0]

'tech'

In [10]:
# Text of the first row
dados['text'].iloc[0]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also being built-in to high

In [12]:
# Remove stop words

# English stop words. Kept in a module-level frozenset so the collection is
# built once (not on every call) and membership tests are constant time.
_STOPWORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
    "during", "each", "few", "for", "from", "further", "had", "has", "have",
    "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers",
    "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
    "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me",
    "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
    "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same",
    "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than",
    "that", "that's", "the", "their", "theirs", "them", "themselves", "then",
    "there", "there's", "these", "they", "they'd", "they'll", "they're",
    "they've", "this", "those", "through", "to", "too", "under", "until", "up",
    "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who",
    "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll",
    "you're", "you've", "your", "yours", "yourself", "yourselves",
])


def remove_stopwords(sentence):
    """Lower-case a sentence and drop its stop words.

    The sentence is lower-cased, split on whitespace, and every word found in
    the stop-word set is discarded. Punctuation is not stripped, so a word with
    attached punctuation (e.g. ``"the,"``) is not treated as a stop word.

    Args:
        sentence (str): Text to clean.

    Returns:
        str: The remaining lower-cased words joined by single spaces.
    """
    kept_words = (word for word in sentence.lower().split() if word not in _STOPWORDS)
    return " ".join(kept_words)

In [13]:
# Try the function on a sample sentence
remove_stopwords(
    "I am about to go to the store and get any snack"
)

'go store get snack'

In [20]:
# Read the file and remove the stop words

def leitura_arquivo(filename):
    """Read a labelled-text CSV file and clean each text.

    The first row is treated as a header and skipped. For every remaining row
    the first column is collected as the label and the second column is passed
    through ``remove_stopwords`` before being collected.

    Args:
        filename (str): Path of the CSV file to read.

    Returns:
        tuple: ``(sentences, labels)`` — two lists in file order, holding the
        cleaned texts and their labels. Both are empty when the file has no
        data rows.
    """
    sentences = []
    labels = []

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')

        # Skip the header row; the default avoids StopIteration on an empty file
        next(reader, None)

        for row in reader:
            # Blank lines are returned as empty rows; skip them
            if not row:
                continue

            # First column: label
            labels.append(row[0])

            # Second column: text, stored with its stop words removed
            sentences.append(remove_stopwords(row[1]))

    return sentences, labels

In [22]:
# Try the function on the BBC dataset

# Location of the CSV file
filename = 'dados/bbc-text.csv'

# Load the cleaned sentences and their labels
sentences, labels = leitura_arquivo(filename)

# Report dataset size, length of the first cleaned sentence, and number of classes
print(f'Quantidade de sentenças no dataset: {len(sentences)} \n')
print(f'Quantidade de palavras na primeira sentença após retirada das stop words: {len(sentences[0].split())} \n')
print(f'Quantidade de labels distintos: {len(set(labels))}')

Quantidade de sentenças no dataset: 2225 

Quantidade de palavras na primeira sentença após retirada das stop words: 436 

Quantidade de labels distintos: 5


In [23]:
# Sentence tokenization

def fit_tokenizer(sentences, num_words=100, oov_token="<OOV>"):
    """Create a Tokenizer and fit it on the given sentences.

    Args:
        sentences (list[str]): Texts used to build the vocabulary.
        num_words (int | None): Value forwarded to ``Tokenizer(num_words=...)``.
            Defaults to 100, the value previously hard-coded here.
        oov_token (str | None): Value forwarded to ``Tokenizer(oov_token=...)``.
            Defaults to ``"<OOV>"``, the value previously hard-coded here.

    Returns:
        Tokenizer: The tokenizer after ``fit_on_texts(sentences)``.
    """
    # Create the tokenizer with the requested settings
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)

    # Build the vocabulary from the sentences
    tokenizer.fit_on_texts(sentences)

    return tokenizer

In [24]:
# Try the function: fit on the cleaned sentences and inspect the vocabulary
tokenizer = fit_tokenizer(sentences)
word_index = tokenizer.word_index

# Report the vocabulary size and whether the OOV token is part of it
print(f"Vocabulário contém {len(word_index)} palavras\n")
print("<OOV> token incluido no vocabulário" if "<OOV>" in word_index else "<OOV> token não incluido no vocabulário")

Vocabulário contém 29714 palavras

<OOV> token incluido no vocabulário


In [25]:
# Sentence padding (every sentence ends up with the same length)

def padded_sequences(tokenizer, sentences):
    """Encode sentences with the tokenizer and pad them at the end.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        sentences (list[str]): Texts to encode.

    Returns:
        The result of ``pad_sequences`` on the encoded sentences, using
        ``padding='post'`` and ``truncating='post'``.
    """
    # NOTE(review): no maxlen is passed, so presumably the longest sequence
    # sets the common length — confirm against the pad_sequences docs.
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        padding='post',
        truncating='post',
    )

In [27]:
# Try the function

# The result is stored under its own name: rebinding `padded_sequences` to the
# returned array would replace the function, and this cell would fail on re-run.
sentences_padded = padded_sequences(tokenizer, sentences)
print(f"Primeira padded sequence: \n\n{sentences_padded[0]}\n")

Primeira padded sequence: 

[96  1  1 ...  0  0  0]



In [28]:
# Tokenize the labels

def tokenize_labels(labels):
    """Encode the labels as integer sequences with a dedicated Tokenizer.

    Args:
        labels (list[str]): Label of each sample.

    Returns:
        tuple: ``(label_sequences, label_word_index)`` — the output of
        ``texts_to_sequences(labels)`` and the tokenizer's ``word_index``.
    """
    # Separate tokenizer, created with default settings, used only for labels
    tokenizer = Tokenizer()

    # Fit on the labels; the fitted state is read from the tokenizer itself
    # below, so the call's return value is not kept
    tokenizer.fit_on_texts(labels)

    # Label -> index mapping
    label_word_index = tokenizer.word_index

    # Encoded labels
    label_sequences = tokenizer.texts_to_sequences(labels)

    return label_sequences, label_word_index

In [29]:
# Try the function: encode the labels and show the mapping plus the first ten encodings
label_sequences, label_word_index = tokenize_labels(labels)

print(f"Vocabulário de labels {label_word_index}\n")
print(f"Primeiras sequências {label_sequences[:10]}\n")

Vocabulário de labels {'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}

Primeiras sequências [[4], [2], [1], [1], [5], [3], [3], [1], [1], [5]]

