In [9]:
import pandas as pd
import numpy as np
import os
from keras.utils import get_file

# Name of the zipped dataset archive and the URL it is downloaded from.
dataset_archive_file = "NewsAggregatorDataset.zip"
data_archive_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00359/" + dataset_archive_file
# Tab-separated headline file that is read once the archive has been extracted.
datafile = "newsCorpora.csv"

# Download and extract the archive. The arguments must be the variables
# defined above; any other spelling raises NameError on a fresh kernel.
download = get_file(fname = dataset_archive_file,
                    origin = data_archive_url,
                    extract = True)

# NOTE(review): this assumes `get_file` returns the path of the downloaded
# archive and that the extracted files sit in the same directory -- confirm
# for the installed Keras version.
# The column names are supplied explicitly via `names`.
# NOTE(review): if the file has one more column than the names listed here
# (e.g. a leading id), pandas uses that leading column as the index -- confirm.
dataset = pd.read_csv(os.path.join(os.path.dirname(download), datafile),
                      sep='\t',
                      names=['TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

In [None]:
# Feature / target selection: the headline text is the model input (X) and
# the news category label is what we want to predict (y).
X, y = dataset['TITLE'], dataset['CATEGORY']

In [None]:
# Split the data randomly into training, test and validation subsets
from sklearn.model_selection import train_test_split

# Fixed seed so that the same rows land in the same subset on every run.
SPLIT_SEED = 42

# Hold out 10% of the samples; the remaining 90% is the training set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED)

# Divide the hold-out: 95% of it stays as the test set and 5% of it
# (0.5% of all samples) becomes the validation set. Splitting from the
# separately named hold-out keeps this cell idempotent when re-run.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.05, random_state=SPLIT_SEED)

In [None]:
# Create a vectorizer for the headlines
from sklearn.feature_extraction.text import TfidfVectorizer

# `sklearn.externals.joblib` is no longer shipped with recent scikit-learn
# releases; prefer the standalone `joblib` package and fall back to the
# bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

vectorizer = TfidfVectorizer(max_df=0.5, sublinear_tf=True, stop_words='english')
# NOTE(review): the vectorizer is fitted on every headline, including the
# ones that end up in the validation and test splits, so the vocabulary and
# IDF weights have seen held-out text. Fitting on the training split only
# would avoid that, but it changes the vocabulary size, which the model's
# input layer has to match.
vectorizer.fit(dataset.TITLE)

# Save the fitted vectorizer so a reloaded model can transform new headlines
# with the same vocabulary
joblib.dump(vectorizer, 'vectorizer.pkl')

In [None]:
# Helper that performs the preprocessing shared by every data split
def process_data(X, y, vectorizer, categories=None):
    """Vectorize the inputs and one-hot encode the labels of one data split.

    Parameters
    ----------
    X : iterable of documents
        Passed unchanged to ``vectorizer.transform``.
    y : pandas.Series or array-like
        Class label of each sample.
    vectorizer : object
        Any object exposing ``transform(X)``.
    categories : sequence, optional
        Complete, ordered list of class labels. When given, the one-hot
        frame always has exactly these columns in this order, even if the
        split does not contain every class. When omitted (the default) the
        columns are whatever labels occur in ``y``.

    Returns
    -------
    tuple
        ``(vectorizer.transform(X), one_hot_labels)``; the one-hot frame
        keeps the index of ``y``.

    Raises
    ------
    ValueError
        If ``categories`` is given and ``y`` contains a label outside it.
        Such a label would otherwise be encoded silently as an all-zero row.
    """
    if categories is not None:
        labels = pd.Series(y)
        known = list(categories)
        unexpected = set(labels.dropna().unique()) - set(known)
        if unexpected:
            raise ValueError(
                "labels not listed in categories: {}".format(sorted(unexpected, key=str)))
        # A categorical Series makes get_dummies emit one column per declared
        # category; rebuilding the Series with labels.index keeps the
        # original row index.
        y = pd.Series(pd.Categorical(labels, categories=known), index=labels.index)
    y = pd.get_dummies(y)
    X = vectorizer.transform(X)
    return X, y

# Write the raw (not yet transformed) splits to CSV before the names are
# rebound to their processed versions below.
raw_splits = (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('X_val.csv', X_val),
    ('y_val.csv', y_val),
    ('X_test.csv', X_test),
    ('y_test.csv', y_test),
)
for csv_name, raw_split in raw_splits:
    raw_split.to_csv(csv_name)

# Replace each split with its vectorized inputs and one-hot encoded labels.
X_train, y_train = process_data(X_train, y_train, vectorizer)  # training
X_val, y_val = process_data(X_val, y_val, vectorizer)          # validation
X_test, y_test = process_data(X_test, y_test, vectorizer)      # test

<center>
<h1>Artificial Neural Network Architecture</h1>
<img src="http://neuralnetworksanddeeplearning.com/images/tikz11.png"/>
</center>
<footer>image source http://neuralnetworksanddeeplearning.com</footer>

In [None]:
# Create the ANN
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

# Size the network from the processed training data instead of hard-coding
# the widths, so the model keeps matching the data when the vocabulary or
# the set of categories changes.
# Assumes this cell runs after preprocessing, i.e. X_train is the vectorized
# matrix and y_train the one-hot label frame.
n_features = int(X_train.shape[1])
n_classes = int(y_train.shape[1])

classifier = Sequential()

# First hidden layer; it also declares the input width. No dropout follows it.
classifier.add(Dense(units = 64, activation='relu',
                     kernel_initializer = 'uniform',
                     input_shape = (n_features,)))

# Four more hidden layers of 32 units, each followed by dropout at the
# listed rate.
for dropout_rate in (.3, .3, .2, .2):
    classifier.add(Dense(units = 32, activation='relu',
                         kernel_initializer='uniform'))
    classifier.add(Dropout(rate = dropout_rate))

# Output layer: one softmax node per news category.
classifier.add(Dense(units = n_classes, activation='softmax',
                     kernel_initializer='uniform'))

classifier.compile(optimizer = 'rmsprop', loss ='categorical_crossentropy', metrics=['accuracy'])


In [None]:
from keras import callbacks

# Abort training as soon as the loss becomes NaN.
terminateOnNanCallback = callbacks.TerminateOnNaN()
# Write TensorBoard logs (graph and images, no histograms) to ./logs.
tbCallback = callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True)
# Stop once the validation loss has not improved for 3 consecutive epochs.
# 'val_loss' is monitored rather than an accuracy key because the name under
# which validation accuracy is logged differs between Keras versions
# ('val_acc' vs 'val_accuracy'); monitoring a key that is not logged means
# early stopping never triggers.
earlyStopCallback = callbacks.EarlyStopping(monitor='val_loss', patience=3)



In [None]:
# Train the network on the training split. The validation split is evaluated
# after every epoch and the callbacks handle logging, NaN termination and
# early stopping.
history = classifier.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=100,
    callbacks=[tbCallback, terminateOnNanCallback, earlyStopCallback],
    validation_data=(X_val, y_val),
)

In [None]:
# Evaluate the trained network on the test split. `scores` is indexed as
# [loss, accuracy], matching the loss and the single metric given to compile().
scores = classifier.evaluate(X_test, y_test)
test_accuracy = scores[1]
print("Model accuracy on test data = {} ".format(test_accuracy))

In [None]:
# Persist the model as two files -- architecture (JSON) and weights (HDF5) --
# so it can be rebuilt later without retraining.
# Thanks to https://machinelearningmastery.com/save-load-keras-deep-learning-models/
import time

# One timestamp is shared by both file names so the pair can be matched up.
timestr = time.strftime("%Y%m%d-%H%M%S")
config_filename = "".join(("model_config_", timestr, ".json"))
weights_filename = "".join(("model_weights_", timestr, ".h5"))

# Architecture -> JSON text file
classifier_json = classifier.to_json()
with open(config_filename, "w") as json_file:
    json_file.write(classifier_json)

# Weights -> HDF5 file
classifier.save_weights(weights_filename)