In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from zipfile import ZipFile
import urllib.request


# Local paths of the extracted CSV and of the zip archive it is shipped in,
# plus the URL the archive is fetched from when neither is present.
datafile = Path("./newsCorpora.csv")
datazipfile = Path("./NewsAggregatorDataset.zip")
urlstring = "http://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip"

if not datafile.exists():
    if not datazipfile.exists():
        # Download straight to the expected local path. Without a target
        # filename urlretrieve writes to a temporary file, which means the
        # exists() check above could never find the archive on a later run.
        urllib.request.urlretrieve(urlstring, str(datazipfile))
        print(datazipfile)
    # The handle is deliberately not called `zip`, which would shadow the builtin.
    with ZipFile(str(datazipfile), 'r') as archive:
        # Extracts every archive member into the current working directory.
        archive.extractall()

# Tab-separated file; column names are supplied here via `names` rather than
# being read from the file.
dataset = pd.read_csv(
    datafile,
    sep='\t',
    names=['TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
)

In [2]:
# Feature set (X) is the TITLE column; the prediction target (y) is the
# CATEGORY column of the same rows.
X, y = dataset['TITLE'], dataset['CATEGORY']

In [3]:
# Split the data randomly into training, test and validation sets
from sklearn.model_selection import train_test_split

# Fixed seed so that a Restart & Run All reproduces exactly the same split.
SPLIT_SEED = 42

# Hold out 10% of the rows; the remaining 90% is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED)

# Carve the validation set out of the hold-out: 5% of it becomes the
# validation set, the other 95% stays as the test set.
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.05, random_state=SPLIT_SEED)

In [4]:
# Create a vectorizer for the headlines
from sklearn.feature_extraction.text import TfidfVectorizer

try:
    # Standalone joblib: the copy vendored as sklearn.externals.joblib was
    # deprecated and later removed from scikit-learn.
    import joblib
except ImportError:
    # Older environments that only ship the vendored copy.
    from sklearn.externals import joblib

# max_df=0.5 drops terms that occur in more than half of the titles,
# sublinear_tf=True scales term frequency as 1 + log(tf), and
# stop_words='english' removes scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(max_df=0.5, sublinear_tf=True, stop_words='english')

# NOTE(review): the vocabulary and IDF weights are fitted on every title,
# including rows that are held out for validation and test. Fitting on the
# training titles only would avoid that leakage, but it changes the number of
# features, so anything that depends on the feature count must change with it.
vectorizer.fit(dataset.TITLE)

# Save the vectorizer for use in loading saved model
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [5]:
# Create a helper function to perform preprocessing on data
def process_data(X, y, vectorizer, categories=None):
    """Convert raw inputs and labels into model-ready features and targets.

    Parameters
    ----------
    X : iterable
        Raw documents; passed unchanged to ``vectorizer.transform``.
    y : array-like
        One label per document.
    vectorizer : object
        Any already-fitted object exposing ``transform(X)``.
    categories : sequence, optional
        Full, ordered list of label values. When given, the one-hot result
        has exactly one column per entry, in this order, even if some labels
        do not occur in ``y``; labels that are not listed encode as an
        all-zero row. When omitted (the default) the columns are whatever
        labels happen to occur in ``y``, so different splits can produce
        different column sets.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the result of ``vectorizer.transform`` and
        ``y`` is a one-hot encoded ``DataFrame``.
    """
    if categories is not None:
        # A categorical dtype makes get_dummies emit a column for every
        # declared category rather than only for the observed ones.
        label_dtype = pd.api.types.CategoricalDtype(categories=list(categories))
        y = pd.Series(y).astype(label_dtype)
    y = pd.get_dummies(y)
    X = vectorizer.transform(X)
    return X, y

# Snapshot the raw (still untransformed) splits to CSV first, because the
# names are rebound to their processed versions further down.
for csv_name, raw_split in (
        ('X_train.csv', X_train), ('y_train.csv', y_train),
        ('X_val.csv', X_val), ('y_val.csv', y_val),
        ('X_test.csv', X_test), ('y_test.csv', y_test)):
    raw_split.to_csv(csv_name)

# Run each split through the same preprocessing helper and vectorizer:
# training, then validation, then test.
X_train, y_train = process_data(X_train, y_train, vectorizer)
X_val, y_val = process_data(X_val, y_val, vectorizer)
X_test, y_test = process_data(X_test, y_test, vectorizer)

<center>
<h1>Artificial Neural Network Architecture</h1>
<img src="http://neuralnetworksanddeeplearning.com/images/tikz11.png"/>
</center>

In [6]:
# Create the ANN
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

# Take the layer widths from the processed training data rather than
# hard-coding them: one input per vectorizer feature column and one output
# node per one-hot label column.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

classifier = Sequential()

# First hidden layer (64 units); its input_shape also declares the width of
# the input vectors.
classifier.add(Dense(units=64, activation='relu',
                     kernel_initializer='uniform',
                     input_shape=(n_features,)))

# Four further hidden layers of 32 units, each followed by dropout
# (rates 0.3, 0.3, 0.2, 0.2 in order).
for dropout_rate in (.3, .3, .2, .2):
    classifier.add(Dense(units=32, activation='relu',
                         kernel_initializer='uniform'))
    classifier.add(Dropout(rate=dropout_rate))

# Output layer: one softmax node per news category.
classifier.add(Dense(units=n_classes, activation='softmax',
                     kernel_initializer='uniform'))

classifier.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
from keras import callbacks

# Early stopping watches the 'val_acc' metric with a patience of 3 epochs.
earlyStopCallback = callbacks.EarlyStopping(patience=3, monitor='val_acc')

# TensorBoard logging goes to ./logs, with graph and image output enabled and
# histograms disabled (histogram_freq=0).
tbCallback = callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

# Keras callback that ends training when a NaN loss is encountered.
terminateOnNanCallback = callbacks.TerminateOnNaN()



Instructions for updating:
Use the retry module or similar alternatives.


In [8]:
# Train the ANN on the training split, reporting on the validation split
# after each epoch. At most 100 epochs in mini-batches of 64; the callbacks
# may end the run earlier.
history = classifier.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[tbCallback, terminateOnNanCallback, earlyStopCallback],
)

Train on 380177 samples, validate on 2113 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [9]:
# Evaluate the trained ANN on the held-out test split
scores = classifier.evaluate(x=X_test, y=y_test)

# The model was compiled with metrics=['accuracy'], so index 0 of the result
# is the loss and index 1 is the accuracy.
message = "Model accuracy on test data = {} "
print(message.format(scores[1]))

Model accuracy on test data = 0.9422861272396521 


In [10]:
# Persist the model as two files -- architecture (JSON) and weights (HDF5) --
# so it can be rebuilt later from disk instead of being retrained.
# Thanks to https://machinelearningmastery.com/save-load-keras-deep-learning-models/
import time

# A single timestamp is shared by both file names so the pair can be matched.
timestr = time.strftime("%Y%m%d-%H%M%S")
config_filename = "".join(["model_config_", timestr, ".json"])
weights_filename = "".join(["model_weights_", timestr, ".h5"])

# Architecture: serialize the model configuration to JSON
classifier_json = classifier.to_json()
with open(config_filename, "w") as config_out:
    config_out.write(classifier_json)

# Weights: serialize to HDF5
classifier.save_weights(weights_filename)