# Classification using NN

In [1]:
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
from keras_tqdm import TQDMNotebookCallback
from tensorflow.python.client import device_lib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn_evaluation.plot import confusion_matrix

from keras.utils.np_utils import to_categorical
import collections
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
import re
import string
#nltk.download('punkt')
#nltk.download('stopwords')

pd.set_option('display.max_colwidth',100)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
data = pd.read_csv('News Aggregator/uci-news-aggregator.csv', usecols=['TITLE', 'CATEGORY','PUBLISHER'])

In [3]:
#M class has way less data than the orthers, thus the classes are unbalanced.
data.CATEGORY.value_counts()

e    152462
b    115964
t    108341
m     45639
Name: CATEGORY, dtype: int64

In [4]:
# remove duplicates
data = data.drop_duplicates()
data.isnull().sum()

TITLE        0
PUBLISHER    2
CATEGORY     0
dtype: int64

In [5]:
data['text'] = data.TITLE + " " + data.PUBLISHER
data.info()
#df.text.to_string()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417046 entries, 0 to 422405
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   TITLE      417046 non-null  object
 1   PUBLISHER  417044 non-null  object
 2   CATEGORY   417046 non-null  object
 3   text       417044 non-null  object
dtypes: object(4)
memory usage: 15.9+ MB


In [6]:
# Drop null
data= data.dropna(subset=['text'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417044 entries, 0 to 422405
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   TITLE      417044 non-null  object
 1   PUBLISHER  417044 non-null  object
 2   CATEGORY   417044 non-null  object
 3   text       417044 non-null  object
dtypes: object(4)
memory usage: 15.9+ MB


In [7]:
#I do aspire here to have balanced classes
num_of_categories = 45639
shuffled = data.reindex(np.random.permutation(data.index))
e = shuffled[shuffled['CATEGORY'] == 'e'][:num_of_categories]
b = shuffled[shuffled['CATEGORY'] == 'b'][:num_of_categories]
t = shuffled[shuffled['CATEGORY'] == 't'][:num_of_categories]
m = shuffled[shuffled['CATEGORY'] == 'm'][:num_of_categories]
concated = pd.concat([e,b,t,m], ignore_index=True)
#Shuffle the dataset
concated = concated.reindex(np.random.permutation(concated.index))
concated['LABEL'] = 0

#Converting categories to numbers
concated.loc[concated['CATEGORY'] == 'e', 'LABEL'] = 0
concated.loc[concated['CATEGORY'] == 'b', 'LABEL'] = 1
concated.loc[concated['CATEGORY'] == 't', 'LABEL'] = 2
concated.loc[concated['CATEGORY'] == 'm', 'LABEL'] = 3
print(concated['LABEL'][:10])
labels = to_categorical(concated['LABEL'], num_classes=4)
print(labels[:10])
if 'CATEGORY' in concated.keys():
    concated.drop(['CATEGORY'], axis=1)

48340     1
85649     1
174166    3
58833     1
68375     1
67398     1
89442     1
112976    2
81146     1
5798      0
Name: LABEL, dtype: int64
[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]


In [8]:
concated.head(5)

Unnamed: 0,TITLE,PUBLISHER,CATEGORY,text,LABEL
48340,Chrysler US sales increase at the fastest rate since 2005,Stixs News,b,Chrysler US sales increase at the fastest rate since 2005 Stixs News,1
85649,Fannie Mae to pay US Treasury $5.7 billion on quarterly profit,MSN Money,b,Fannie Mae to pay US Treasury $5.7 billion on quarterly profit MSN Money,1
174166,Local hospital prepares for MERS outbreak,LocalNews8.com,m,Local hospital prepares for MERS outbreak LocalNews8.com,3
58833,ECB Contemplates New Horizon,MarketPulse \(blog\),b,ECB Contemplates New Horizon MarketPulse \(blog\),1
68375,NYMEX crude prices dip in Asia with U.S. industry stocks data eyed,NASDAQ,b,NYMEX crude prices dip in Asia with U.S. industry stocks data eyed NASDAQ,1


In [9]:
symbols = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', 
           ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', 
           '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', 
           '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', 
           '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', 
           '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', 
           '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', 
           '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', 
           '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', 
           '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', 
           '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', 
           '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', 
           '¹', '≤', '‡', '√', ]

def clean_symbol(text):
    text = str(text)
    for symbol in symbols:
        text = text.replace(symbol, '')
    return text

# remove symbols and punctuations 
concated['text'] = concated['text'].apply(lambda x: clean_symbol(x))

In [10]:
# Dictionary of short form words and mispellings
short_forms_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                    "'cause": "because", "could've": "could have", "couldn't": "could not", 
                    "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                    "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", 
                    "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
                    "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                    "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", 
                    "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                    "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                    "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", 
                    "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", 
                    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", 
                    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", 
                    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                    "she'll've": "she will have", "she's": "she is", "should've": "should have", 
                    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                    "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", 
                    "that's": "that is", "there'd": "there would", "there'd've": "there would have", 
                    "there's": "there is", "here's": "here is","they'd": "they would", 
                    "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", 
                    "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                    "we'd": "we would", "we'd've": "we would have", "we'll": "we will", 
                    "we'll've": "we will have", "we're": "we are", "we've": "we have", 
                    "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                    "what're": "what are",  "what's": "what is", "what've": "what have", 
                    "when's": "when is", "when've": "when have", "where'd": "where did", 
                    "where's": "where is", "where've": "where have", "who'll": "who will", 
                    "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", 
                    "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
                    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
                    "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
                    "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                    "you're": "you are", "you've": "you have"}

In [11]:
def clean_shortforms(text):
    clean_text = text
    for shortform in short_forms_dict.keys():
        if re.search(shortform, text):
            clean_text = re.sub(shortform, short_forms_dict[shortform], text)
    return clean_text

# fix short forms
concated['text'] = concated['text'].apply(lambda x: clean_shortforms(x))

In [12]:
def clean_text(s):
    s = re.sub("[0-9]+", "",s)
    s = re.sub(' +',' ', s)        
    return s

concated['text'] = [clean_text(s) for s in concated['text']]

In [13]:
concated['text'] = concated['text'].str.replace('[^\w\s]','')# unpunctuate

In [14]:
concated.head(5) 

Unnamed: 0,TITLE,PUBLISHER,CATEGORY,text,LABEL
48340,Chrysler US sales increase at the fastest rate since 2005,Stixs News,b,Chrysler US sales increase at the fastest rate since Stixs News,1
85649,Fannie Mae to pay US Treasury $5.7 billion on quarterly profit,MSN Money,b,Fannie Mae to pay US Treasury billion on quarterly profit MSN Money,1
174166,Local hospital prepares for MERS outbreak,LocalNews8.com,m,Local hospital prepares for MERS outbreak LocalNewscom,3
58833,ECB Contemplates New Horizon,MarketPulse \(blog\),b,ECB Contemplates New Horizon MarketPulse blog,1
68375,NYMEX crude prices dip in Asia with U.S. industry stocks data eyed,NASDAQ,b,NYMEX crude prices dip in Asia with US industry stocks data eyed NASDAQ,1


In [15]:
maxlen=0
word_freqs = collections.Counter()
num_recs = 0
stop_words = set(stopwords.words('english'))

for sentence in concated["text"]:
    words = nltk.word_tokenize(sentence.lower())
    word = [word for word in words if (len(word) >= 2 and len(word) < 14)]
    if len(words) > maxlen:
        maxlen = len(words)
    for word in words:
        if word in stop_words:
            continue;
        word = lemmatizer.lemmatize(word)
        word_freqs[word] += 1
    num_recs += 1

print("maxlen :", maxlen)
print("len(word_freqs) :", len(word_freqs))

maxlen : 23
len(word_freqs) : 43969


In [16]:
# integer encode text
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(concated["text"])

In [17]:
# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 49387


In [18]:
MAX_VOCAB= 49319
MAX_TITLE_LENGTH = 23

vocab_size = min(MAX_VOCAB, len(word_freqs)) + 1#Somado 1 por causa do UNK
word2index = {x[0]: i+1 for i, x in enumerate(word_freqs.most_common(MAX_VOCAB))}
word2index["UNK"] = 0

In [19]:
X = []
y = []

i = 0

for sentence in concated["text"]:
    words = nltk.word_tokenize
    words = nltk.word_tokenize(sentence.lower())
    word = [word for word in words if (len(word) >= 2 and len(word) < 14)]
    seqs = []
    for word in words:
        if word in stop_words:
            continue
        word = lemmatizer.lemmatize(word)
        if word in word2index:
            seqs.append(word2index[word])
        else:
            seqs.append(word2index["UNK"])
    X.append(seqs)
    
for category in concated["LABEL"]:
    y.append(category)
X = sequence.pad_sequences(X, maxlen=MAX_TITLE_LENGTH)
y = to_categorical(y)

In [20]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, shuffle=True)

In [21]:
# Import the tools needed and use our previously defined functions to calculate precision and recall
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

In [23]:
EMBEDDING_SIZE= 100
#HIDDEN_LAYER_SIZE = 50
BATCH_SIZE = 256
NUM_EPOCH = 5

# Define Model
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_TITLE_LENGTH))
model.add(LSTM(128, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(128, activation='relu'))
model.add(Dense(4, activation='sigmoid'))
#model.add(Dense(4))
#model.add(Activation("softmax"))

model_adam = model
model_adam.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy', precision_m, recall_m])


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [24]:
history_adam = model_adam.fit(Xtrain, ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCH, 
                    validation_data=(Xtest, ytest), verbose=2, callbacks=[TQDMNotebookCallback()])

Instructions for updating:
Use tf.cast instead.
Train on 127387 samples, validate on 54595 samples


HBox(children=(FloatProgress(value=0.0, description='Training', max=5.0, style=ProgressStyle(description_width…

Epoch 1/5


HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=127387.0, style=ProgressStyle(description_w…

 - 140s - loss: 0.5219 - acc: 0.7984 - precision_m: 0.7950 - recall_m: 0.8219 - val_loss: 0.2188 - val_acc: 0.9268 - val_precision_m: 0.9316 - val_recall_m: 0.9233
Epoch 2/5


HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=127387.0, style=ProgressStyle(description_w…

 - 133s - loss: 0.2033 - acc: 0.9324 - precision_m: 0.9359 - recall_m: 0.9282 - val_loss: 0.1862 - val_acc: 0.9361 - val_precision_m: 0.9336 - val_recall_m: 0.9390
Epoch 3/5


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=127387.0, style=ProgressStyle(description_w…

 - 132s - loss: 0.1551 - acc: 0.9479 - precision_m: 0.9491 - recall_m: 0.9454 - val_loss: 0.1845 - val_acc: 0.9388 - val_precision_m: 0.9393 - val_recall_m: 0.9381
Epoch 4/5


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=127387.0, style=ProgressStyle(description_w…

 - 132s - loss: 0.1296 - acc: 0.9558 - precision_m: 0.9596 - recall_m: 0.9495 - val_loss: 0.1842 - val_acc: 0.9384 - val_precision_m: 0.9445 - val_recall_m: 0.9327
Epoch 5/5


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=127387.0, style=ProgressStyle(description_w…

 - 134s - loss: 0.1120 - acc: 0.9614 - precision_m: 0.9654 - recall_m: 0.9505 - val_loss: 0.1945 - val_acc: 0.9378 - val_precision_m: 0.9479 - val_recall_m: 0.9266



In [25]:
# Prediction using test data
predictions=model_adam.predict(Xtest)

In [26]:
# Prediction using train data
predictions1=model_adam.predict(Xtrain)

In [27]:
# Classification report
report = classification_report(ytest, predictions.round())
print(report)

              precision    recall  f1-score   support

           0       0.98      0.95      0.97     13721
           1       0.92      0.89      0.91     13767
           2       0.92      0.92      0.92     13664
           3       0.97      0.95      0.96     13443

   micro avg       0.95      0.93      0.94     54595
   macro avg       0.95      0.93      0.94     54595
weighted avg       0.95      0.93      0.94     54595
 samples avg       0.92      0.93      0.92     54595



  _warn_prf(average, modifier, msg_start, len(result))
