In [1]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

Using TensorFlow backend.


In [2]:
# For reproducibility
# np.random.seed(1237)
 
# Source file directory
path_train = "20news-19997/20_newsgroups"
 
files_train = skds.load_files(path_train,load_content=False)
 
label_index = files_train.target
label_names = files_train.target_names
labelled_files = files_train.filenames
 
data_tags = ["filename","category","news"]
data_list = []
 
# Read and add data from file to a list
i=0
for f in labelled_files:
    data_list.append((f,label_names[label_index[i]],Path(f).read_text()))
    i += 1
 
# We have training data available as dictionary filename, category, data
data = pd.DataFrame.from_records(data_list, columns=data_tags)

In [3]:
# lets take 80% data as training and remaining 20% for test.
train_size = int(len(data) * .8)
 
train_posts = data['news'][:train_size]
train_tags = data['category'][:train_size]
train_files_names = data['filename'][:train_size]
 
test_posts = data['news'][train_size:]
test_tags = data['category'][train_size:]
test_files_names = data['filename'][train_size:]

In [4]:
# 20 news groups
num_labels = 20
vocab_size = 15000
batch_size = 100
 
# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)
 
x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')
 
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [5]:

# encoder = LabelBinarizer()
# encoder.fit(train_tags)
# y_train = encoder.transform(train_tags)
# y_test = encoder.transform(test_tags)

# encoder = LabelBinarizer()
# encoder.fit(train_tags)
# y_train = encoder.transform(train_tags)
# y_test = encoder.transform(test_tags)

In [6]:
model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))



model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))

model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.summary()
 
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
 
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=1,
                    verbose=1,
                    validation_split=0.1)

W0806 16:19:13.484101 10764 deprecation_wrapper.py:119] From C:\Users\coral\Anaconda3\envs\CV\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0806 16:19:14.407003 10764 deprecation_wrapper.py:119] From C:\Users\coral\Anaconda3\envs\CV\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0806 16:19:14.729438 10764 deprecation_wrapper.py:119] From C:\Users\coral\Anaconda3\envs\CV\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0806 16:19:14.982545 10764 deprecation_wrapper.py:119] From C:\Users\coral\Anaconda3\envs\CV\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0806 16:19:15

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               7680512   
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_2 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 20)                10260     
__________

W0806 16:19:16.147249 10764 deprecation.py:323] From C:\Users\coral\Anaconda3\envs\CV\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 14397 samples, validate on 1600 samples
Epoch 1/1


In [8]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test accuracy:', score[1])

text_labels = encoder.classes_

for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction[0])]
    print(test_files_names.iloc[i])
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label)

Test accuracy: 0.9325000047683716
20news-19997/20_newsgroups\alt.atheism\51181
Actual label:alt.atheism
Predicted label: alt.atheism
20news-19997/20_newsgroups\comp.graphics\38985
Actual label:comp.graphics
Predicted label: comp.graphics
20news-19997/20_newsgroups\talk.politics.guns\54710
Actual label:talk.politics.guns
Predicted label: talk.politics.guns
20news-19997/20_newsgroups\rec.sport.baseball\104743
Actual label:rec.sport.baseball
Predicted label: rec.sport.baseball
20news-19997/20_newsgroups\talk.politics.misc\179022
Actual label:talk.politics.misc
Predicted label: talk.politics.misc
20news-19997/20_newsgroups\sci.crypt\15238
Actual label:sci.crypt
Predicted label: sci.crypt
20news-19997/20_newsgroups\rec.sport.hockey\52649
Actual label:rec.sport.hockey
Predicted label: rec.sport.hockey
20news-19997/20_newsgroups\sci.space\60223
Actual label:sci.space
Predicted label: sci.space
20news-19997/20_newsgroups\misc.forsale\76449
Actual label:misc.forsale
Predicted label: misc.forsal

In [12]:
# creates a HDF5 file 'my_model.h5'
model.model.save('my_model.h5')
 
# Save Tokenizer i.e. Vocabulary
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# load our saved model
from keras.models import load_model
model = load_model('my_model.h5')
 
# load tokenizer
tokenizer = Tokenizer()
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
    
    
encoder.classes_ #LabelBinarizer

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype='<U24')

In [14]:
# These are the labels we stored from our training
# The order is very important here.
 
labels = np.array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
 'rec.sport.football', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.technology',
 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
 'talk.politics.misc', 'talk.religion.misc'])
 
test_files = ["news.txt"]
x_data = []
for t_f in test_files:
    t_f_data = Path(t_f).read_text()
    x_data.append(t_f_data)
 
x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')
 
i=0
for x_t in x_tokenized:
    prediction = model.predict(np.array([x_t]))
    predicted_label = labels[np.argmax(prediction[0])]
    print("File ->", test_files[i], "Predicted label: " + predicted_label)

File -> news.txt Predicted label: sci.technology
