In [None]:
from keras.models import Sequential
from keras.models import model_from_json
import numpy as np
import os
from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Load the saved vectorizer.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so
# 'vectorizer.pkl' must only ever come from a trusted source.
vectorizer = joblib.load('vectorizer.pkl')

# Read in the saved model configuration. The context manager closes the file
# even if reading it raises.
with open('model_config.json', 'r') as json_file:
    classifier_saved_configuration = json_file.read()

# Create the ANN from the saved weights and configuration
classifier = model_from_json(classifier_saved_configuration)
classifier.load_weights("model_weights.h5")

# Create a Label Encoder for our headline categories.
# LabelEncoder stores its classes in sorted order, so the integer codes are
# assigned alphabetically (b, e, m, t) rather than in the order listed here.
categories = ["b", "m", "t", "e"]
category_encoder = LabelEncoder()
category_encoder.fit(categories)

# Create a pipeline for predictions: raw headline text -> vectorizer -> ANN
pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])

In [None]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import pandas as pd

# Google News RSS feed for each known topic
business_news_url = "https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en&gl=US"
tech_news_url = "https://news.google.com/news/rss/headlines/section/topic/TECHNOLOGY?ned=us&hl=en&gl=US"
science_news_url = "https://news.google.com/news/rss/headlines/section/topic/SCIENCE?ned=us&hl=en&gl=US"
health_news_url = "https://news.google.com/news/rss/headlines/section/topic/HEALTH?ned=us&hl=en&gl=US"
entertainment_news_url = "https://news.google.com/news/rss/headlines/section/topic/ENTERTAINMENT?ned=us&hl=en&gl=US"

# Lookup from topic name to the feed its current headlines are harvested from.
# Built from ordered (topic, url) pairs so iteration order matches the listing.
topics_to_headlines_url_dict = dict([
    ('Business', business_news_url),
    ('Technology', tech_news_url),
    ('Science', science_news_url),
    ('Health', health_news_url),
    ('Entertainment', entertainment_news_url),
])

# A function to scrape current headlines and the google news category
def get_headlines(url):
    """Download an RSS feed and return its headlines as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the RSS/XML feed to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per ``<item>`` element, with two columns: ``Headline`` (the
        text of the item's ``<title>``) and ``Category`` (the text of the
        first ``<category>`` element found anywhere in the feed, repeated on
        every row, or ``None`` when the feed contains no ``<category>``).

    Notes
    -----
    Errors raised by ``urlopen`` are not caught here and propagate to the
    caller.
    """
    # The context manager closes the connection even if reading fails.
    with urlopen(url) as client:
        xml_page = client.read()

    soup_page = soup(xml_page, "xml")

    # Only the first <category> in the document is used; it labels every row.
    # Resolve its text once, and tolerate feeds that carry no such element
    # instead of failing with AttributeError on ``None.text``.
    category_tag = soup_page.find("category")
    category_text = category_tag.text if category_tag is not None else None

    headlines = [item.title.text for item in soup_page.find_all("item")]
    return pd.DataFrame({"Headline": headlines,
                         "Category": [category_text] * len(headlines)})

In [None]:
# Scrape current google news headlines
# of all known categories into a dataframe
# for prediction.
# The per-topic frames are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and calling it in a loop
# re-copied the accumulated frame on every iteration.
topic_frames = [get_headlines(url) for url in topics_to_headlines_url_dict.values()]

# pd.concat raises on an empty list, so fall back to an empty frame as the
# original accumulator did when there were no topics.
df_google = pd.concat(topic_frames, ignore_index=True) if topic_frames else pd.DataFrame()

def convert_google_category(x):
    """Map a Google News topic name to its single-letter category code.

    "Business" -> 'b', "Science" and "Technology" -> 't', "Health" -> 'm',
    "Entertainment" -> 'e'. Any other value yields ``None``.
    """
    # Checked in order with ``==``; the first matching topic wins.
    topic_codes = (
        ("Business", 'b'),
        ("Science", 't'),
        ("Technology", 't'),
        ("Health", 'm'),
        ("Entertainment", 'e'),
    )
    for topic, code in topic_codes:
        if x == topic:
            return code
    return None
    
# Replace Google's topic names with the single-letter category codes
df_google["Category"] = df_google["Category"].apply(convert_google_category)

# Add model predictions to the dataframe: run the headlines through the
# pipeline, take the index of the largest output per headline, and map that
# index back to its category code.
class_scores = pipeline.predict(df_google["Headline"].values)
predicted_indices = [scores.argmax() for scores in class_scores]
df_google["Model Prediction"] = category_encoder.inverse_transform(predicted_indices)

In [None]:
# Create a heatmap to display prediction accuracy

import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

# Encode truth and prediction as integer class indices.
true_idx = np.asarray(category_encoder.transform(df_google.Category.values), dtype=int)
pred_idx = np.asarray(category_encoder.transform(df_google['Model Prediction']), dtype=int)

# Count (truth, prediction) pairs with NumPy rather than tf.confusion_matrix /
# tf.Session, which exist only in TensorFlow 1.x. Rows are truth, columns are
# predictions. Sizing the matrix from the encoder (instead of inferring it from
# the largest label seen) keeps it aligned with LABELS even if the last class
# never occurs in the scraped data.
n_classes = len(category_encoder.classes_)
cm = np.zeros((n_classes, n_classes), dtype=int)
np.add.at(cm, (true_idx, pred_idx), 1)  # unbuffered add: repeated pairs accumulate

# Normalise each truth row to fractions. A class with no true samples has a
# zero row sum and therefore a NaN row; only the resulting warning is silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    cm_out = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]

# Tick labels in the encoder's class order (b, e, m, t).
LABELS = ["Business", "Entertainment", "Health", "Science & Tech"]


sns.set(rc={'figure.figsize':(12,10),
            'xtick.labelsize':16,
            'ytick.labelsize': 16,
            'axes.labelsize': 20,
            'font.size': 20})

sns.heatmap(cm_out,
            annot=True,
            xticklabels=LABELS,
            yticklabels=LABELS,
            cmap='rocket_r')

plt.xlabel("Prediction")
plt.ylabel("Truth")

In [None]:
# Collect the headlines whose predicted category differs from the scraped one
is_misclassified = df_google['Category'].ne(df_google['Model Prediction'])
df_errors = df_google.loc[is_misclassified]

# Widen the column display limit, then render the table with centred rows
pd.set_option('display.max_colwidth', 100)
centered_rows = [{'selector': 'tr', 'props': [('text-align', 'center')]}]
df_errors.style.set_table_styles(centered_rows)