In [63]:
# INDT Developer test
# Author: Wellington Noberto da Silva Araujo

In [1]:
import numpy as np
import pandas as pd
import os
# NLP
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wellington\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Wellington\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read data

In [28]:
# Load the dataset; the relative path is resolved against the notebook's working directory
df = pd.read_csv('Desafio-ML.csv')

In [29]:
# Show the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [30]:
# Drop every row whose 'text' repeats an earlier row, then report how many rows went away
initial_number = len(df)
df = df.drop_duplicates(subset=['text'])
final_number = len(df)
print('{} duplicate rows removed'.format(initial_number - final_number))

99 duplicate rows removed


In [31]:
# Preview the first rows of the de-duplicated frame
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [32]:
# Distinct category names, in order of first appearance in the data
class_names = df['category'].unique()
# How many distinct categories exist
num_classes = df['category'].nunique()
# Row count per category, displayed to judge whether the classes are balanced
df['category'].value_counts()

sport            504
business         503
politics         403
entertainment    369
tech             347
Name: category, dtype: int64

### Preprocessing data

In [33]:
# Build the stop-word set once. The previous version rebuilt it from the NLTK
# word list for every single token, which made preprocessing needlessly slow.
STOP_WORDS = set(stopwords.words('english'))


def process_text(text):
    """Turn a raw document into a list of stemmed, stop-word-free tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Porter-stemmed tokens, in the order they appear in the document.
    """
    # Remove punctuation, special characters and numbers (anything that is not a letter)
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Lower-case the text before tokenizing and stop-word filtering
    text = text.lower()
    # Create tokens, drop stop words, and stem what remains
    return [stemmer.stem(word) for word in word_tokenize(text) if word not in STOP_WORDS]

In [34]:
# Run the preprocessing on every document and store the resulting token list in a new 'tokens' column
df['tokens'] = df.text.apply(process_text)

### Encoding tokens

In [35]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [36]:
# Fixed number of tokens every document is padded/truncated to further down.
# (The 'lenght' spelling is kept on purpose: later cells refer to this exact name.)
max_lenght = 50
# One token list per document, as an array
text_lists = df['tokens'].values

In [38]:
# Learn the vocabulary: each distinct token is mapped to an integer index
# (this is an index encoding, not a one-hot encoding; num_words=None keeps every token)
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(text_lists)
# Replace every token by its integer index, document by document
text_sequences = tokenizer.texts_to_sequences(text_lists)
# token -> index lookup table
word_index = tokenizer.word_index

In [44]:
# Vocabulary size: number of distinct tokens plus one extra slot.
# NOTE(review): the +1 presumably accounts for index 0, which the Keras Tokenizer
# does not assign to any word — confirm against the Keras documentation.
vocab_size = len(tokenizer.word_index) + 1

In [41]:
# Bring every sequence to exactly max_lenght entries:
# longer ones are cut at the end, shorter ones are filled at the end
text_padded = pad_sequences(
    text_sequences,
    maxlen=max_lenght,
    padding='post',
    truncating='post',
)

### Word Embedding

In [42]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup
embeddings_dict = {}
with open("glove.6B.100d.txt", encoding="utf8") as glove_file:
    for entry in glove_file:
        # Split only once: the first field is the word, the remainder holds the coefficients
        token, values = entry.split(maxsplit=1)
        embeddings_dict[token] = np.fromstring(values, "f", sep=" ")

In [45]:
# Build an embedding matrix that only covers the tokens used in our dataset.
# Row i holds the GloVe vector of the token whose index is i.
# NOTE(review): tokens are Porter-stemmed in process_text, so stems that are not
# real words will have no GloVe entry and keep an all-zero row — worth checking coverage.
max_embeddings = 100
embedding_matrix = np.zeros((vocab_size, max_embeddings))
for token, row in word_index.items():
    vector = embeddings_dict.get(token)
    if vector is None:
        # No pre-trained vector for this token: leave its row at zero
        continue
    embedding_matrix[row] = vector

### Split train and test

In [46]:
from sklearn.preprocessing import OneHotEncoder

In [47]:
# One-hot encoding the output labels.
# The encoder is created with default arguments and its output is densified explicitly:
# the `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and later removed,
# so passing it ties the notebook to a narrow range of versions.
one_hot_encoder = OneHotEncoder()
# The encoder expects 2-D input: one row per sample, a single feature column
labels = df.category.values.reshape(-1, 1)
# Encoding labels into a dense (n_samples, n_categories) array
encoded_labels = one_hot_encoder.fit_transform(labels).toarray()

In [61]:
# Positional split: the first 70% of the rows become the training set, the remainder the test set.
# NOTE(review): no shuffling happens here, so this relies on the file's row order
# already mixing the categories — confirm.
train_size = int(0.7 * len(text_padded))
x_train, x_test = np.split(text_padded, [train_size])
y_train, y_test = np.split(encoded_labels, [train_size])

### Creating model

In [30]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.losses import CategoricalCrossentropy

In [31]:
# Architecture: frozen GloVe embeddings -> two stacked LSTMs with dropout -> softmax classifier
model = Sequential()
# Embedding lookup initialised with the GloVe matrix and kept frozen (trainable=False)
model.add(Embedding(vocab_size, max_embeddings, weights=[embedding_matrix], input_length=max_lenght, trainable=False))
# The first LSTM returns the whole sequence so the second LSTM can consume it
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
# The second LSTM returns only its final output
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
# One output unit per category: num_classes is derived from the data instead of a hard-coded 5
model.add(Dense(num_classes, activation='softmax'))

In [32]:
# Multi-class setup: categorical cross-entropy on the one-hot labels, Adam optimizer, accuracy as the tracked metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### Training

In [None]:
# x_val / y_val are never defined in this notebook, so the original call failed with a
# NameError on a fresh kernel. Hold out the last 20% of the training arrays for validation
# instead; the test set stays untouched for the final evaluation.
model.fit(x_train, y_train, batch_size=128, epochs=20, validation_split=0.2)

Epoch 1/5


### Testing

In [None]:
# Predicting class scores for the test sequences
pred = model.predict(x_test)
# Column index of the highest output score for each sample
pred_labels = np.argmax(pred, axis=-1)
# Decoding the output labels: column j of the one-hot encoding corresponds to
# one_hot_encoder.categories_[0][j] (no separate label_encoder exists in this notebook)
y_pred = one_hot_encoder.categories_[0][pred_labels]

In [None]:
# Decoding the real values for testing: inverse_transform already returns the original
# category names as an (n_samples, 1) array, so it only needs flattening
# (the previously referenced label_encoder was never defined)
y_true = np.ravel(one_hot_encoder.inverse_transform(y_test))

#### Visualizing results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Creating a confusion matrix.
# The label order is passed explicitly: by default confusion_matrix orders rows/columns by
# sorted label, while class_names is in order of first appearance, so the tick labels
# would otherwise be attached to the wrong rows and columns.
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_names)
sns.heatmap(conf_matrix, cbar=False, cmap='Blues', xticklabels=class_names, yticklabels=class_names, annot=True, fmt='g')
plt.ylabel('True label')
# Trailing semicolon keeps the text repr of the last call out of the cell output
plt.xlabel('Predicted label');

In [None]:
# Printing a classification report for the test set.
# print() is needed here: the report is returned as a plain multi-line string.
print(classification_report(y_true, y_pred))