In [None]:
from keras.models import Sequential
from keras.models import model_from_json
import numpy as np
import os
from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Load the saved vectorizer.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code,
# so only load a 'vectorizer.pkl' that comes from a trusted source.
vectorizer = joblib.load('vectorizer.pkl')

# Read in the saved configuration.
# The context manager closes the file even if read() raises.
with open('model_config.json', 'r') as json_file:
    classifier_saved_configuration = json_file.read()

# Create the ANN from the saved weights and configuration
classifier = model_from_json(classifier_saved_configuration)
classifier.load_weights("model_weights.h5")

# Create a Label Encoder for our headline categories
categories = ["b", "m", "t", "e"]
category_encoder = LabelEncoder()
category_encoder.fit(categories)

# Create a pipeline for predictions: raw headline text goes through the
# vectorizer first and the resulting features are handed to the classifier
pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])

In [None]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import pandas as pd

# Google News RSS feed URL for each topic whose category is known in advance
business_news_url = "https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en&gl=US"
tech_news_url = "https://news.google.com/news/rss/headlines/section/topic/TECHNOLOGY?ned=us&hl=en&gl=US"
science_news_url = "https://news.google.com/news/rss/headlines/section/topic/SCIENCE?ned=us&hl=en&gl=US"
health_news_url = "https://news.google.com/news/rss/headlines/section/topic/HEALTH?ned=us&hl=en&gl=US"
entertainment_news_url = "https://news.google.com/news/rss/headlines/section/topic/ENTERTAINMENT?ned=us&hl=en&gl=US"

# Lookup table: known topic name -> feed URL from which that topic's
# current headlines can be harvested (insertion order is preserved)
topics_to_headlines_url_dict = dict(zip(
    ('Business', 'Technology', 'Science', 'Health', 'Entertainment'),
    (business_news_url, tech_news_url, science_news_url, health_news_url, entertainment_news_url),
))

# A function to scrape current headlines and the google news category
def get_headlines(url):
    """Download an XML feed and return its headlines as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the feed to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per ``<item>`` element with two columns: ``"Headline"`` (the
        item's title text) and ``"Actual Category"`` (the first
        space-separated word of the first ``<title>`` element in the
        document, or ``None`` when the document has no ``<title>``).
        The frame has zero rows when the feed contains no items.

    Notes
    -----
    Shows a one-line "N items in category X" summary via ``display``.
    """
    # The context manager closes the connection even if read() raises.
    with urlopen(url) as client:
        xml_page = client.read()

    soup_page = soup(xml_page, "xml")
    items = soup_page.findAll("item")
    title = soup_page.find("title")

    # The category is identical for every item, so derive it once instead of
    # re-splitting the title inside the loop.
    category = title.text.split(' ')[0] if title is not None else None

    headlines = [item.title.text for item in items]
    categories = [category] * len(headlines)

    df_headlines = pd.DataFrame({"Headline": headlines, "Actual Category": categories})
    # Report `category` rather than categories[0] so that a feed with no
    # items does not raise IndexError.
    display("{} items in category {}".format(df_headlines.shape[0], category))

    return df_headlines

In [None]:
# Scrape current google news headlines
# of all known categories into a dataframe
# for prediction.
# The per-topic frames are collected first and concatenated once:
# DataFrame.append is deprecated/removed in recent pandas and re-copies the
# accumulated frame on every iteration.
df_inference = pd.concat(
    [get_headlines(url) for url in topics_to_headlines_url_dict.values()],
    ignore_index=True,
)

# Cut out all except 20 of the science and technology category rows
# to bring it more in line with other categories
sci_tech_rows = df_inference[df_inference['Actual Category'].isin(['Science', 'Technology'])]
# Clamp at zero: sample() raises on a negative count, which would happen
# whenever fewer than 20 science/technology rows were scraped.
number_to_remove = max(sci_tech_rows.shape[0] - 20, 0)
# Drop by index label rather than by headline text, so an identical headline
# that also appears under another category is not removed along with it.
df_inference = (
    df_inference
    .drop(index=sci_tech_rows.sample(number_to_remove).index)
    .reset_index(drop=True)
)

# A function to convert category names to
# the model representations of the category names
def convert_category_name(x):
    """Map a scraped topic name to the model's single-letter category code.

    "Business" -> 'b', "Science"/"Technology" -> 't', "Health" -> 'm',
    "Entertainment" -> 'e'. Any other value yields None.
    """
    topic_groups = (
        (("Business",), 'b'),
        (("Science", "Technology"), 't'),
        (("Health",), 'm'),
        (("Entertainment",), 'e'),
    )
    for topic_names, code in topic_groups:
        if x in topic_names:
            return code
    return None
    
# Swap the scraped topic names for the model representations of the categories
df_inference['Actual Category'] = df_inference['Actual Category'].apply(convert_category_name)

# Add model predictions to the dataframe:
# run every headline through the pipeline, take the position of the largest
# value in each prediction row, then decode those positions to category letters
class_scores = pipeline.predict(df_inference.Headline.values)
predicted_indices = [scores.argmax() for scores in class_scores]
df_inference['Predicted Category'] = category_encoder.inverse_transform(predicted_indices)

In [None]:
# Create a heatmap to display prediction accuracy
%matplotlib inline
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

# Tick labels for the heatmap, one per encoded category
LABELS = ["Business", "Entertainment", "Health", "Science & Tech"]

with tf.Graph().as_default():
    # Pin num_classes explicitly: when it is omitted the matrix size is
    # inferred from the labels/predictions actually present, so a batch that
    # lacks the highest-coded category would produce a smaller matrix that no
    # longer lines up with the four tick LABELS.
    cm = tf.confusion_matrix(category_encoder.transform(df_inference['Actual Category'].values),
                             category_encoder.transform(df_inference['Predicted Category']),
                             num_classes=len(LABELS))
    with tf.Session() as session:
        cm_out = session.run(cm)

# Normalise each row (true category) to fractions of that row's total.
# Rows with no samples are left as zeros instead of dividing by zero,
# which would fill them with NaN.
row_totals = cm_out.sum(axis=1, keepdims=True)
cm_out = np.divide(cm_out.astype(float),
                   row_totals,
                   out=np.zeros(cm_out.shape, dtype=float),
                   where=row_totals != 0)


sns.set(rc={'figure.figsize':(12,10),
            'xtick.labelsize':16,
            'ytick.labelsize': 16,
            'axes.labelsize': 20,
            'font.size': 20})

sns.heatmap(cm_out,
                 annot=True,
                 xticklabels=LABELS,
                 yticklabels=LABELS,
                 cmap='rocket_r')

plt.xlabel("Prediction")
plt.ylabel("Truth")
plt.show()

In [None]:
# List the errors made by the model.
# Take an explicit copy so that later column assignments on df_errors act on
# an independent frame rather than on a slice of df_inference.
df_errors = df_inference[df_inference['Actual Category'] != df_inference['Predicted Category']].copy()
pd.set_option('display.max_colwidth', 100)

# A function to center output in a dataframe
# This function is intended to be given to the styler
def center_fn(data):
    """Return one "text-align: center" CSS string per element of 1-D `data`.

    Anything that is not one-dimensional yields None.
    """
    if data.ndim != 1:
        return None
    return ["text-align: center"] * len(data)

# A function to left align output in a dataframe
# This function is intended to be given to the styler
def left_fn(data):
    """Return one "text-align: left" CSS string per element of 1-D `data`.

    Anything that is not one-dimensional yields None.
    """
    if data.ndim != 1:
        return None
    return ["text-align: left"] * len(data)

# A function to translate model readable category
# names to human readable category names
def translate_category(x):
    """Map a single-letter category code to its display name.

    "b" -> 'Business', "t" -> 'Science & Technology', "m" -> 'Health',
    "e" -> 'Entertainment'. Any other value yields None.
    """
    code_to_label = (
        ("b", 'Business'),
        ("t", 'Science & Technology'),
        ("m", 'Health'),
        ("e", 'Entertainment'),
    )
    return next((label for code, label in code_to_label if x == code), None)

# Replace the model's single-letter codes with human readable names.
# assign() builds a new frame, so nothing is written into a slice of
# df_inference and the chained-assignment warning no longer has to be
# switched off globally (which would also hide genuine problems elsewhere).
df_errors = df_errors.assign(**{
    'Actual Category': df_errors['Actual Category'].map(translate_category),
    'Predicted Category': df_errors['Predicted Category'].map(translate_category),
})

# Build the styled table: centered category columns, left-aligned headlines
styler = df_errors.style.apply(center_fn, subset=['Actual Category', 'Predicted Category'])
styler.apply(left_fn, subset=['Headline'])
# Styler.hide_index() was replaced by Styler.hide(axis="index") in newer
# pandas releases; call whichever one this pandas version provides.
if hasattr(styler, "hide"):
    styler.hide(axis="index")
else:
    styler.hide_index()
styler.set_properties(**{"font-size": "150%"})
styler.set_table_styles([{'selector': 'th','props': [('text-align', 'center'), ('font-size', '150%')]},
                         {'selector': 'caption','props': [('text-align', 'center'), ('font-size', '200%')]}])
styler.set_caption("Misclassified Headlines")
display(styler)