In [18]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('uci-news-aggregator.csv')


In [19]:
print("{} observations \n{} features".format(dataset.shape[0], dataset.shape[1]))
print("{} unique sites from which the training data was extrapolated".format(len(dataset.HOSTNAME.unique())))

422419 observations 
8 features
11236 unique sites from which the training data was extrapolated


In [20]:
print(dataset.columns)

Index(['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME',
       'TIMESTAMP'],
      dtype='object')


In [21]:
# Pick our feature set (X) and what we are trying to predict (y)
X = dataset.TITLE
y = dataset.CATEGORY

In [22]:
print("\t\t Sample of X\n")
print(X.sample(5))
print("\n\n\t\t Sample of y (No relation to the X sample)\n")
print(y.sample(5))

		 Sample of X

254791    ECB says it is ready to take action if low inf...
394087    Facebook Forces iPhone Users To Separate Messa...
158243    FDA to hit e-cigarettes with ban on sales to a...
286386    Destiny is “Good Fit” for PC – Activision Publ...
93248     Peter Mayhew Set to Return as Chewbacca in Sta...
Name: TITLE, dtype: object


		 Sample of y (No relation to the X sample)

57900     t
180126    b
420392    e
145967    e
89686     e
Name: CATEGORY, dtype: object


In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [24]:
# Create a helper function perform preprocessing on data
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, sublinear_tf=True)
vectorizer.fit(dataset.TITLE)

def process_data(X, y, vectorizer):
    y = pd.get_dummies(y)
    X = vectorizer.transform(X)
    return X, y
    

In [28]:
X_train, y_train = process_data(X_train, y_train, vectorizer)
y_train.sample(5)

Unnamed: 0,b,e,m,t
254802,1,0,0,0
355755,0,0,1,0
133464,0,0,0,1
314011,0,0,1,0
83420,0,0,1,0


In [29]:
from keras.models import Sequential
from keras.layers import Dense

classifier = Sequential()

# Input layer
classifier.add(Dense(units = 50, activation='relu', 
                     kernel_initializer = 'uniform',
                     input_shape = (54637,)))

# Four Hidden Layers
classifier.add(Dense(units =50, activation='relu', 
                     kernel_initializer='uniform'))
classifier.add(Dense(units =50, activation='relu', 
                     kernel_initializer='uniform'))
classifier.add(Dense(units =50, activation='relu', 
                     kernel_initializer='uniform'))
classifier.add(Dense(units =50, activation='relu', 
                     kernel_initializer='uniform'))

# Output Layer. Four output nodes for our four classification types of news headlines.
classifier.add(Dense(units = 4, activation='sigmoid', 
                     kernel_initializer='uniform'))

classifier.compile(optimizer = 'rmsprop', loss ='categorical_crossentropy', metrics=['accuracy'])


In [30]:
classifier.fit(X_train, y_train, epochs=5, batch_size=100)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f17abc1e908>

In [31]:
# Create files to store model configuration and weights
# such that the model can be built back up from the files
# rather than having to retrain.
# Thanks to https://machinelearningmastery.com/save-load-keras-deep-learning-models/
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
config_filename = "model_config" + timestr + ".json"
weights_filename = "model_weights" + timestr + ".h5"

# Serialize model to JSON
classifier_json = classifier.to_json()
with open(config_filename, "w") as json_file:
    json_file.write(classifier_json)

# Serialize weights to HDF5
classifier.save_weights(weights_filename)


In [32]:
X_test, y_test = process_data(X_test, y_test, vectorizer)

scores = classifier.evaluate(X_test, y_test)
print("Model accuracy on test data = {} ".format(scores[1]))

Model accuracy on test data = 0.9453032526868993 


In [33]:
# A function that takes in a prediction from our classifier and prints out the
# associated category
def get_predicted_headline_category(prediction):
    type_mapping = {0: 'Business', 1: 'Entertainment', 2: 'Health', 3: 'Science and Technology'}
    return type_mapping[prediction.argmax()]

In [43]:
# Scrape some top story article headlines and run them through our classifier
# Thanks to https://www.w3resource.com/python-exercises/basic/python-basic-1-exercise-8.php

import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

# URLS of known topics
business_news_url="https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en&gl=US"
tech_news_url = "https://news.google.com/news/rss/headlines/section/topic/TECHNOLOGY?ned=us&hl=en&gl=US"
science_news_url = "https://news.google.com/news/rss/headlines/section/topic/SCIENCE?ned=us&hl=en&gl=US"
health_news_url = "https://news.google.com/news/rss/headlines/section/topic/HEALTH?ned=us&hl=en&gl=US"
entertainment_news_url = "https://news.google.com/news/rss/headlines/section/topic/ENTERTAINMENT?ned=us&hl=en&gl=US"

news_url = science_news_url

Client = urlopen(news_url)
xml_page = Client.read()
Client.close()

soup_page=soup(xml_page,"xml")
news_list=soup_page.findAll("item")

for news in news_list:
    # Store the headline string
    headline = news.title.text
    
    # Vectorize the headline string such that the classifier can make a prediction
    vectorized_headline = vectorizer.transform([headline])
    
    # Make a prediction and get the resulting category
    prediction = classifier.predict(vectorized_headline)
    predicted_category = get_predicted_headline_category(prediction)
    
    # Print [<Prediction>] <Headline>
    print("[{}] {} \n".format(predicted_category, headline))

[Science and Technology] SpaceX Not to Blame for Zuma Spy-Satellite Launch Failure: Report 

[Entertainment] An Ultra-Powerful Flare Erupted From Our Nearest Neighbor Star 

[Science and Technology] NASA Begins Building Next Mars Rover for 2020 Launch 

[Science and Technology] Hubble Telescope Discovers a Light-Bending 'Einstein Ring' in Space 

[Science and Technology] If 'Oumuamua Is an Alien Spacecraft, It's Keeping Quiet So Far 

[Science and Technology] Antarctic snowfall increasing, study finds 

[Science and Technology] Giant Ichthyosaur: 205-Million-Year-Old Jawbone Discovered From 'One of Largest Animals Ever' 

[Science and Technology] No, Buzz Aldrin didn't see a UFO on his way to the moon 

[Science and Technology] The new biggest marine reptile ever found 

[Science and Technology] Prehistoric Sea Monster Is One Of The Largest Animals Ever 

[Entertainment] The heart of the Milky Way teems with black holes 

[Science and Technology] NASA Sends Human Sperm to the Internati