In [1]:
from keras.models import Sequential
from keras.models import model_from_json
import numpy as np
import os
from sklearn.externals import joblib

# Load the saved vectorizer.
# NOTE(review): joblib.load is pickle-based, and unpickling can execute
# arbitrary code -- only load 'vectorizer.pkl' from a source you trust.
vectorizer = joblib.load('vectorizer.pkl')

# Read in the saved model configuration. The context manager closes the file
# even if the read raises, which the manual open()/close() pair did not.
with open('model_config.json', 'r') as json_file:
    classifier_saved_configuration = json_file.read()

# Create the ANN from the saved configuration, then load the saved weights
classifier = model_from_json(classifier_saved_configuration)
classifier.load_weights("model_weights.h5")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# A function that takes in a prediction from our classifier and returns the
# name of the associated category (it does not print anything itself)
def get_predicted_headline_category(prediction):
    """Return the category name for the highest-scoring entry of ``prediction``.

    ``prediction`` must provide an ``argmax()`` method; the index it returns
    is looked up in a fixed table of four category names. An index outside
    0-3 raises ``KeyError``.
    """
    category_by_index = {
        0: 'Business',
        1: 'Entertainment',
        2: 'Health',
        3: 'Science and Technology',
    }
    top_index = prediction.argmax()
    return category_by_index[top_index]

# A function to scrape headlines
# Thanks to https://www.w3resource.com/python-exercises/basic/python-basic-1-exercise-8.php
def get_headlines(url):
    """Fetch the XML document at ``url`` and return the titles of its items.

    Parameters
    ----------
    url : str
        Address of the feed, passed unchanged to ``urllib.request.urlopen``.

    Returns
    -------
    list of str
        Title text of each ``<item>`` element, in document order. Items
        that have no ``<title>`` are skipped instead of raising
        ``AttributeError``.

    Notes
    -----
    Errors raised by ``urlopen`` or ``read`` (for example on network
    failure) are not caught here and propagate to the caller.
    """
    from bs4 import BeautifulSoup as soup
    from urllib.request import urlopen

    # The context manager closes the connection even when read() raises
    with urlopen(url) as client:
        xml_page = client.read()

    soup_page = soup(xml_page, "xml")
    # find_all is the current spelling of the legacy findAll alias
    return [
        item.title.text
        for item in soup_page.find_all("item")
        if item.title is not None
    ]

# A function to get a prediction from a headline text
def get_prediction(headline):
    """Run the classifier on a single headline string.

    The headline is wrapped in a one-element list and passed through the
    module-level ``vectorizer``; the transformed result is handed to
    ``classifier.predict`` and that call's return value is returned as-is.
    """
    return classifier.predict(vectorizer.transform([headline]))

# A function to print a prediction category along with the headline
def print_prediction_category(prediction, headline):
    """Print ``[<category>] <headline>`` for one prediction/headline pair.

    The category name comes from ``get_predicted_headline_category``. The
    template ends in a newline and ``print`` appends another, so each entry
    is followed by a blank line.
    """
    line = "[{}] {}\n".format(get_predicted_headline_category(prediction), headline)
    print(line)

# A function to input a list of headlines and print out the category name and the headline for each input    
def batch_prediction(headline_list):
    """Classify and print every headline in ``headline_list``, one at a time.

    Each headline is passed to ``get_prediction`` and the result is printed
    together with the headline via ``print_prediction_category``. Nothing is
    returned.
    """
    for text in headline_list:
        scores = get_prediction(text)
        print_prediction_category(scores, text)

In [3]:
# Google News RSS feed URLs, one per known topic (listed alphabetically).
# Every URL carries the same "ned=us&hl=en&gl=US" query string.
business_news_url = "https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en&gl=US"
entertainment_news_url = "https://news.google.com/news/rss/headlines/section/topic/ENTERTAINMENT?ned=us&hl=en&gl=US"
health_news_url = "https://news.google.com/news/rss/headlines/section/topic/HEALTH?ned=us&hl=en&gl=US"
science_news_url = "https://news.google.com/news/rss/headlines/section/topic/SCIENCE?ned=us&hl=en&gl=US"
tech_news_url = "https://news.google.com/news/rss/headlines/section/topic/TECHNOLOGY?ned=us&hl=en&gl=US"

In [4]:
# Run the loaded model on live headlines from feeds whose topic is known, so
# each printed prediction can be compared by eye with the section banner.
# NOTE: the feeds list 'Technology' and 'Science' separately, whereas the
# classifier's category table has a single 'Science and Technology' entry.
news_dict = {'Business': business_news_url, 'Technology': tech_news_url, 'Science': science_news_url,
             'Health': health_news_url, 'Entertainment': entertainment_news_url}

for key, value in news_dict.items():
    # Banner naming the feed topic, then one "[category] headline" entry per item
    print("===========================  {}  ========================================\n".format(key))
    batch_prediction(get_headlines(value))
    # The footer has no placeholder, so the former .format(key) call was a no-op
    print("\n==========================================================================\n\n")


[Science and Technology] The unspoken factor in Amazon's search for a new home: Jeff Bezos' support for gay rights

[Business] US Growth Cools to 2.3% While Compensation Costs Accelerate

[Business] Amazon Surges After Stunning Wall Street With Its Near-Perfect Quarter

[Business] Exxon Mobil shares fall 3.5% on earnings miss fueled by weak refining and chemicals profits

[Business] Nasdaq futures surge on strong earnings from Amazon, Microsoft

[Business] Wall Street Set for Tech Boost as FAANGs Surge After Amazon, Microsoft Earnings

[Science and Technology] Jury Hits Pork Giant for $50M for Hog Operation's Nuisance

[Science and Technology] 6 Reasons Ford CEO Jim Hackett Decided To Deep-Six Its Sedans

[Science and Technology] Report: T-Mobile and Sprint may agree to merger as soon as next week

[Science and Technology] 'Extremely drunk' woman smashed Popeyes window when $4 deal didn't come with soda, NY cops say

[Business] UK economy in weakest growth since 2012

[Business] Worke

In [5]:
# Print arrays with three decimal places and without scientific notation
np.set_printoptions(suppress=True, precision=3)
# Last expression of the cell, so the returned prediction array is displayed
get_prediction("Orlando area may get Wyndham Worldwide headquarters, 200 jobs")


array([[0.734, 0.064, 0.087, 0.115]], dtype=float32)