In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from zipfile import ZipFile
import urllib.request


# Local cache locations and the upstream source of the News Aggregator dataset.
datafile = Path("./newsCorpora.csv")
datazipfile = Path("./NewsAggregatorDataset.zip")
urlstring = "http://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip"

if not datafile.exists():
    if not datazipfile.exists():
        # Download straight to the cache path. Without an explicit target,
        # urlretrieve writes to a throw-away temporary file, so the exists()
        # guard above could never find the archive on a later run.
        try:
            urllib.request.urlretrieve(urlstring, datazipfile)
        except BaseException:
            # Do not leave a truncated archive behind: it would satisfy the
            # exists() guard next time and then fail to unzip.
            if datazipfile.exists():
                datazipfile.unlink()
            raise
        print(datazipfile)
    # Named `archive` rather than `zip` so the built-in zip() is not shadowed
    # for the rest of the notebook.
    with ZipFile(datazipfile, 'r') as archive:
        # Extracts every member of the archive into the current working directory.
        archive.extractall()

# Tab-separated file; `names=` supplies the column labels, so pandas treats the
# first line of the file as data rather than as a header.
dataset = pd.read_csv(datafile, sep='\t', names=['TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

In [None]:
# Headline numbers for the loaded data: row count and number of distinct hosts
n_observations = dataset.shape[0]
n_sites = len(dataset.HOSTNAME.unique())

print("{} observations".format(n_observations))
print("{} unique sites from which the training data was extrapolated".format(n_sites))

In [None]:
# Model input (X) is the headline text; the target (y) is its category label
X = dataset["TITLE"]
y = dataset["CATEGORY"]

In [None]:
# Show five random headlines, then five independently drawn random labels
x_preview = X.sample(5)
print("\t\t Sample of X\n")
print(x_preview)

y_preview = y.sample(5)
print("\n\n\t\t Sample of y (No relation to the X sample)\n")
print(y_preview)

In [None]:
# Split the data randomly for validation: 80% for training, 20% held out.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the shuffle reproducible, so the reported accuracy
# can be compared between runs (Restart Kernel -> Run All gives the same split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Build and fit the TF-IDF vectorizer that turns headline text into features
from sklearn.feature_extraction.text import TfidfVectorizer

# max_df=0.5 ignores terms that appear in more than half of the titles;
# sublinear_tf=True uses 1 + log(tf) in place of the raw term frequency.
# NOTE(review): the vocabulary and IDF weights are learned from every title in
# `dataset`, which includes the rows that end up in the test split.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5).fit(dataset["TITLE"])

def process_data(X, y, vectorizer, categories=None):
    """Turn raw headlines and labels into model-ready inputs.

    Parameters
    ----------
    X : iterable of str
        Headline texts, passed unchanged to ``vectorizer.transform``.
    y : pandas.Series or array-like
        Category label for each headline.
    vectorizer : object with a ``transform`` method
        An already-fitted vectorizer (this function never fits it).
    categories : sequence, optional
        Full list of possible labels, in the desired column order. When given,
        the one-hot frame always has exactly these columns, even if some label
        is absent from ``y``; labels not listed yield an all-zero row. When
        omitted, the columns are whatever labels occur in ``y`` (original
        behaviour), so two different slices of the data may produce
        differently shaped label matrices.

    Returns
    -------
    (X, y) : tuple
        ``vectorizer.transform(X)`` and the one-hot encoded labels as a
        DataFrame.
    """
    if categories is not None:
        # A categorical dtype pins both the set and the order of the dummy
        # columns, independent of which labels this particular slice contains.
        y = pd.Series(y).astype(pd.CategoricalDtype(categories=list(categories)))
    y = pd.get_dummies(y)
    X = vectorizer.transform(X)
    return X, y
    

In [None]:
# Vectorize the training headlines and one-hot encode their labels.
# The names X_train / y_train are rebound to the processed versions here, so
# this cell expects the raw split as its input.
X_train, y_train = process_data(X=X_train, y=y_train, vectorizer=vectorizer)

In [None]:
# Sanity-check the processed training data: feature matrix shape and a few
# randomly chosen one-hot label rows
shape_report = "X_train.shape = {}\n\n".format(X_train.shape)
print(shape_report)

label_preview = y_train.sample(5)
print(label_preview)

In [None]:
# Create the ANN
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

# Width of each input sample = number of TF-IDF features. It is read from the
# fitted vectorizer instead of being hard-coded, so the model still builds when
# the dataset or the vectorizer settings (and therefore the vocabulary) change.
n_features = len(vectorizer.vocabulary_)

# One output node per news category.
n_classes = 4

classifier = Sequential()

# First fully connected layer; input_shape tells Keras the width of one sample
classifier.add(Dense(units=512, activation='relu',
                     kernel_initializer='uniform',
                     input_shape=(n_features,)))

# Four further hidden layers, each followed by dropout: (units, dropout rate)
for units, dropout_rate in [(512, 0.4), (128, 0.2), (128, 0.4), (128, 0.4)]:
    classifier.add(Dense(units=units, activation='relu',
                         kernel_initializer='uniform'))
    classifier.add(Dropout(rate=dropout_rate))

# Output layer: softmax over the news-headline categories
classifier.add(Dense(units=n_classes, activation='softmax',
                     kernel_initializer='uniform'))

# categorical_crossentropy matches the one-hot encoded labels fed to fit()
classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Train the network on the processed training set: 5 passes over the data,
# 128 samples per gradient update
classifier.fit(x=X_train, y=y_train, batch_size=128, epochs=5)

In [None]:
# Apply the same preprocessing to the held-out split before validation.
# X_test / y_test are rebound to the processed versions here, so this cell
# expects the raw split as its input.
X_test, y_test = process_data(X=X_test, y=y_test, vectorizer=vectorizer)

In [None]:
# Score the trained network on the held-out data.
# The model was compiled with metrics=['accuracy'], so `scores` holds the loss
# at position 0 and the accuracy at position 1.
scores = classifier.evaluate(X_test, y_test)
test_accuracy = scores[1]
print("Model accuracy on test data = {} ".format(test_accuracy))

In [None]:
# Persist the model so it can be rebuilt later without retraining:
# the architecture goes into a JSON file, the learned weights into an HDF5 file.
# Thanks to https://machinelearningmastery.com/save-load-keras-deep-learning-models/
import time

# Timestamp in the file names keeps successive runs from overwriting each other
timestr = time.strftime("%Y%m%d-%H%M%S")
config_filename = "model_config_" + timestr + ".json"
weights_filename = "model_weights_" + timestr + ".h5"

# Architecture -> JSON text file
classifier_json = classifier.to_json()
Path(config_filename).write_text(classifier_json)

# Weights -> HDF5 file
classifier.save_weights(weights_filename)

In [None]:
# Translate a classifier prediction into a human-readable category name
def get_predicted_headline_category(prediction):
    """Return the category name for the largest entry of ``prediction``.

    ``prediction`` is any object with an ``argmax()`` method, e.g. the array
    that ``classifier.predict`` returns for a single headline. The flat index
    of its largest value is looked up among the four categories; an index
    outside 0-3 raises ``KeyError``.
    """
    category_names = ['Business', 'Entertainment', 'Health', 'Science and Technology']
    type_mapping = dict(enumerate(category_names))
    winning_index = prediction.argmax()
    return type_mapping[winning_index]

In [None]:
# Scrape some top story article headlines and run them through our classifier
# Thanks to https://www.w3resource.com/python-exercises/basic/python-basic-1-exercise-8.php

import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

# URLS of known topics
business_news_url="https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en&gl=US"
tech_news_url = "https://news.google.com/news/rss/headlines/section/topic/TECHNOLOGY?ned=us&hl=en&gl=US"
science_news_url = "https://news.google.com/news/rss/headlines/section/topic/SCIENCE?ned=us&hl=en&gl=US"
health_news_url = "https://news.google.com/news/rss/headlines/section/topic/HEALTH?ned=us&hl=en&gl=US"
entertainment_news_url = "https://news.google.com/news/rss/headlines/section/topic/ENTERTAINMENT?ned=us&hl=en&gl=US"

# Feed to classify; swap in any of the URLs above to try another topic
news_url = tech_news_url

# The context manager closes the connection even if read() raises.
# The name `Client` is kept so the same module-level name stays bound.
with urlopen(news_url) as Client:
    xml_page = Client.read()

# Parse the RSS XML and collect every <item> element (one per story).
# find_all is the current name of the deprecated findAll alias.
soup_page = soup(xml_page, "xml")
news_list = soup_page.find_all("item")

for news in news_list:
    # Store the headline string
    headline = news.title.text

    # Vectorize the headline string such that the classifier can make a prediction
    # (transform expects an iterable of documents, hence the one-element list)
    vectorized_headline = vectorizer.transform([headline])

    # Make a prediction and get the resulting category
    prediction = classifier.predict(vectorized_headline)
    predicted_category = get_predicted_headline_category(prediction)

    # Print [<Prediction>] <Headline>
    print("[{}] {}\n".format(predicted_category, headline))

In [None]:
# Classify one hand-picked headline
headline = "Orlando area may get Wyndham Worldwide headquarters, 200 jobs"

# The vectorizer works on an iterable of documents, so wrap the string in a list
vectorized_headline = vectorizer.transform([headline])

# Run the network and map its output to a category name
prediction = classifier.predict(vectorized_headline)
predicted_category = get_predicted_headline_category(prediction)

# Report as: [<Prediction>] <Headline>
report_line = "[{}] {} \n".format(predicted_category, headline)
print(report_line)
