In [None]:
import os

import numpy as np
from keras.models import Sequential
from keras.models import model_from_json

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back for older environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Load the saved vectorizer.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
vectorizer = joblib.load('vectorizer.pkl')

# Read in the saved configuration. The context manager guarantees the file is
# closed even if reading it fails.
with open('model_config.json', 'r', encoding='utf-8') as json_file:
    classifier_saved_configuration = json_file.read()

# Create the ANN from the saved configuration, then restore its trained weights.
classifier = model_from_json(classifier_saved_configuration)
classifier.load_weights("model_weights.h5")

# Translate a classifier prediction into the name of its most likely category.
def get_predicted_headline_category(prediction):
    """Return the category label for the largest entry of ``prediction``.

    ``prediction`` must provide an ``argmax()`` method; the index it returns
    is looked up among the four known category labels.
    """
    labels_by_index = dict(enumerate((
        'Business',
        'Entertainment',
        'Health',
        'Science and Technology',
    )))
    best_index = prediction.argmax()
    return labels_by_index[best_index]

# A function to scrape headlines
# Thanks to https://www.w3resource.com/python-exercises/basic/python-basic-1-exercise-8.php
def get_headlines(url):
    """Download an RSS feed and return the title text of each of its items.

    Args:
        url: Address of the RSS/XML feed to download.

    Returns:
        A list of strings, one per ``<item>`` element that has a ``<title>``,
        in document order. The list is empty when the feed has no items.

    Raises:
        urllib.error.URLError: If the feed cannot be retrieved.
    """
    # Imported inside the function, so these modules are only loaded when
    # headlines are actually scraped.
    from bs4 import BeautifulSoup
    from urllib.request import urlopen

    # The context manager closes the connection even if read() raises.
    with urlopen(url) as response:
        xml_page = response.read()

    feed = BeautifulSoup(xml_page, "xml")

    # find_all is the current name of the deprecated findAll alias.
    # Items without a <title> are skipped rather than raising AttributeError.
    return [
        item.title.text
        for item in feed.find_all("item")
        if item.title is not None
    ]

# Run the classifier on a single headline string.
def get_prediction(headline):
    """Return the classifier's raw prediction for one headline.

    The headline is wrapped in a one-element list, transformed by the
    module-level ``vectorizer``, and the result is passed to
    ``classifier.predict``.
    """
    single_headline_batch = [headline]
    return classifier.predict(vectorizer.transform(single_headline_batch))

# Print the predicted category next to the headline it was computed for.
def print_prediction_category(prediction, headline):
    """Print ``[<predicted category>] <headline>`` followed by a blank line.

    The category name is derived from ``prediction`` via
    ``get_predicted_headline_category``.
    """
    output_line = "[{}] {}\n".format(
        get_predicted_headline_category(prediction),
        headline,
    )
    print(output_line)

# Classify every headline in a list and print one "[category] headline" entry per headline.
def batch_prediction(headline_list):
    """Predict and print the category of each headline in ``headline_list``.

    Headlines are processed one at a time, in the order given.
    """
    for text in headline_list:
        scores = get_prediction(text)
        print_prediction_category(scores, text)
        
        
# URLs of known topics.
# Every Google News topic feed shares one address pattern; only the topic name
# differs, so the pattern is written once and filled in per topic.
_TOPIC_FEED_URL = "https://news.google.com/news/rss/headlines/section/topic/{}?ned=us&hl=en&gl=US"

business_news_url = _TOPIC_FEED_URL.format("BUSINESS")
tech_news_url = _TOPIC_FEED_URL.format("TECHNOLOGY")
science_news_url = _TOPIC_FEED_URL.format("SCIENCE")
health_news_url = _TOPIC_FEED_URL.format("HEALTH")
entertainment_news_url = _TOPIC_FEED_URL.format("ENTERTAINMENT")

In [None]:
# Retrieves current headlines from the known topic URLs (Google News) and prints each
# group of headlines under the section it came from, in the format:
#
# ===========================  <Actual Classification of Articles>  ========================================
#
# [model predicted classification] <headline_1>
# [model predicted classification] <headline_2>
# [model predicted classification] <headline_3>
# ...
# [model predicted classification] <headline_n>
#
# ==========================================================================
# Maps each section label (the actual classification of its articles) to the
# feed URL the headlines are fetched from.
news_dict = {
    'Business': business_news_url,
    'Technology': tech_news_url,
    'Science': science_news_url,
    'Health': health_news_url,
    'Entertainment': entertainment_news_url,
}

for section, feed_url in news_dict.items():
    # Header naming the section the following headlines were fetched from.
    print("===========================  {}  ========================================\n".format(section))
    batch_prediction(get_headlines(feed_url))
    # The footer contains no placeholders, so it needs no .format() call.
    print("\n==========================================================================\n\n")

In [None]:
# Demonstrates the raw prediction output by the model.
# The output is a row vector of the probabilities the model assigns to the input headline
# for each of the categories ["Business", "Entertainment", "Health", "Science and Technology"].
# In the above cell, we simply took the highest probability from a prediction like this and
# displayed the corresponding category as the predicted classification.

# Print arrays with three decimal places and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
get_prediction("Orlando area may get Wyndham Worldwide headquarters, 200 jobs")
