In [1]:
from pathlib import Path
from zipfile import ZipFile
import urllib.request
import pandas as pd

# Locations of the extracted CSV, the locally cached zip archive, and the archive's source URL.
datafile = Path("./newsCorpora.csv")
datazipfile = Path("./NewsAggregatorDataset.zip")
urlstring = "http://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip"

# Fetch and unpack the dataset only when the CSV is not already on disk.
if not datafile.exists():
    if not datazipfile.exists():
        # Download next to the target as a ".part" file and rename on success, so that
        # (a) the archive really ends up at `datazipfile` and the exists() check above
        #     skips the download on later runs, and
        # (b) an interrupted download never leaves a truncated zip at the final path.
        partial_download = datazipfile.with_name(datazipfile.name + ".part")
        urllib.request.urlretrieve(urlstring, partial_download)
        partial_download.replace(datazipfile)
        print(datazipfile)
    # Named `archive` rather than `zip` to avoid shadowing the builtin.
    with ZipFile(datazipfile, 'r') as archive:
        archive.extractall()

# Read the file as tab-separated, supplying the column names explicitly.
dataset = pd.read_csv(datafile, sep='\t', names=['TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

# Fit a TF-IDF vectorizer on the headline titles; later cells reuse it to
# turn new headlines into feature vectors.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
# Left as a bare final expression so the notebook displays the fitted vectorizer.
vectorizer.fit(dataset["TITLE"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [2]:
from keras.models import Sequential
from keras.models import model_from_json
import os
import numpy as np

# Read in the saved configuration. The context manager closes the file even
# if read() raises, which the manual open()/close() pair did not guarantee.
with open('model_config_20180411-170208.json', 'r') as json_file:
    classifier_saved_configuration = json_file.read()

# Rebuild the model from its JSON description, then load the saved weights into it.
classifier = model_from_json(classifier_saved_configuration)
classifier.load_weights("model_weights_20180411-170208.h5")

# Compile the restored model with the optimizer, loss and metric given below.
classifier.compile(optimizer = 'adam', loss ='categorical_crossentropy', metrics=['accuracy'])


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Translate a classifier prediction into the name of its news category.
def get_predicted_headline_category(prediction):
    """Return the category name for the highest-scoring entry of `prediction`.

    Args:
        prediction: an object exposing ``argmax()``; the value it returns is
            used as the class index.

    Returns:
        One of 'Business', 'Entertainment', 'Health' or
        'Science and Technology'.

    Raises:
        KeyError: if the index returned by ``argmax()`` is not in 0-3.
    """
    category_names = ('Business', 'Entertainment', 'Health', 'Science and Technology')
    index_to_category = dict(enumerate(category_names))
    winning_index = prediction.argmax()
    return index_to_category[winning_index]

In [7]:
# Scrape some top story article headlines and run them through our classifier
# Thanks to https://www.w3resource.com/python-exercises/basic/python-basic-1-exercise-8.php

import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

# URLs of the RSS feeds for known topics
business_news_url="https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en&gl=US"
tech_news_url = "https://news.google.com/news/rss/headlines/section/topic/TECHNOLOGY?ned=us&hl=en&gl=US"
science_news_url = "https://news.google.com/news/rss/headlines/section/topic/SCIENCE?ned=us&hl=en&gl=US"
health_news_url = "https://news.google.com/news/rss/headlines/section/topic/HEALTH?ned=us&hl=en&gl=US"
entertainment_news_url = "https://news.google.com/news/rss/headlines/section/topic/ENTERTAINMENT?ned=us&hl=en&gl=US"

# Feed to classify; swap in any of the URLs above to try another topic.
news_url = entertainment_news_url

# Download the feed. The context manager closes the connection even if
# read() raises, which the manual close() call did not guarantee.
with urlopen(news_url) as response:
    xml_page = response.read()

# Parse the XML and collect every <item> element.
# find_all is the current name of the legacy findAll alias.
soup_page = soup(xml_page, "xml")
news_list = soup_page.find_all("item")

for news in news_list:
    # Store the headline string
    headline = news.title.text

    # Vectorize the headline string such that the classifier can make a prediction
    vectorized_headline = vectorizer.transform([headline])

    # Make a prediction and get the resulting category
    prediction = classifier.predict(vectorized_headline)
    predicted_category = get_predicted_headline_category(prediction)

    # Print [<Prediction>] <Headline>
    print("[{}] {}\n".format(predicted_category, headline))

[Entertainment] Tristan Thompson: Everything to Know About Khloé Kardashian's Boyfriend Caught in Cheating Scandal

[Entertainment] Mariah Carey and bipolar disorder: What is it and how is it treated?

[Entertainment] Mitzi Shore, Owner of The Comedy Store and Maker and Breaker of Careers, Dies at 87

[Entertainment] This Week's Roseanne Finally Felt Like the Old Roseanne

[Entertainment] TV Ratings: 'Roseanne' Stays on Top With 13.5 Million Viewers

[Entertainment] Emily Blunt on 'Mary Poppins Returns,' 'A Quiet Place' and 'Edge of Tomorrow 2'

[Entertainment] 'Avengers: Infinity War' Beating 'Black Panther' to Set New Presales Record

[Entertainment] Ahead Of Her Return, Carrie Underwood's 'Cry Pretty' Does Some Heavy Lifting

[Entertainment] Brad Pitt and Neri Oxman: Inside Their Romance

[Science and Technology] Cardi B's “Invasion of Privacy” Is as Studious as It Is Bombastic

[Entertainment] Seth Meyers slices and dices Trump's latest legal turmoil

[Entertainment] Before the mov