In [1]:
import pandas as pd
import numpy as np

# Load the news-aggregator CSV (path is relative to the notebook's working directory)
dataset = pd.read_csv('uci-news-aggregator.csv')


In [2]:
# dataset.shape is (rows, columns); unpack it straight into the two placeholders
print("{} observations \n{} features".format(*dataset.shape))
# Number of distinct hostnames (dropna=False keeps the count equal to len(unique()))
print("{} unique sites from which the training data was extrapolated".format(dataset.HOSTNAME.nunique(dropna=False)))

422419 observations 
8 features
11236 unique sites from which the training data was extrapolated


In [3]:
# List the column names available in the dataset
print(dataset.columns)

Index(['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME',
       'TIMESTAMP'],
      dtype='object')


In [4]:
# Features (X) are the headline titles; the target (y) is the category label
X, y = dataset["TITLE"], dataset["CATEGORY"]

In [5]:
# Each header and its five-row random sample are emitted by a single print call;
# sep="\n" reproduces the line break a separate print() would have produced.
print("\t\t Sample of X\n", X.sample(5), sep="\n")
print("\n\n\t\t Sample of y (No relation to the X sample)\n", y.sample(5), sep="\n")

		 Sample of X

133484                    Try on Dummy Google Glass for $50
47662     Candy Crush maker King Digital loses US$850m o...
17895      Google, Viacom Resolve YouTube Copyright Lawsuit
352668    Dow 101: Dow breaks 17000 for first time; what...
45281     Facebook places $2bn bet on VR headset maker O...
Name: TITLE, dtype: object


		 Sample of y (No relation to the X sample)

241600    t
412212    b
24219     b
33143     e
230110    b
Name: CATEGORY, dtype: object


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so the split (and every
# metric computed from it) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Create a helper function to perform preprocessing on the data
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer and fit it on every title in the dataset in one
# expression (fit() returns the fitted vectorizer itself).
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5).fit(dataset["TITLE"])

def process_data(X, y, vectorizer):
    """Turn raw titles and category labels into model-ready inputs.

    Parameters
    ----------
    X : iterable of str
        Raw headline strings to vectorize.
    y : pandas.Series
        Category labels aligned with ``X``.
    vectorizer : object with a ``transform`` method
        An already-fitted vectorizer; it is only applied here, never refitted.

    Returns
    -------
    tuple
        ``(X_vectorized, y_one_hot)`` where ``X_vectorized`` is whatever
        ``vectorizer.transform(X)`` returns and ``y_one_hot`` is a DataFrame
        with one indicator column per category found in ``y``.
    """
    # Encode the labels that were passed in. The previous version read the
    # global y_train here, so calling this on the test split returned the
    # training labels instead of the test labels.
    # `columns` only applies to DataFrame input and is kept for compatibility.
    y = pd.get_dummies(y, columns=['CATEGORY'])
    X = vectorizer.transform(X)
    return X, y
    

In [8]:
# Split the categories into a binary classification vector
#y_train = pd.get_dummies(y_train, columns=['CATEGORY'])

In [9]:
#from sklearn.feature_extraction.text import TfidfVectorizer

#vectorizer = TfidfVectorizer(max_df=0.5, sublinear_tf=True)
#X = vectorizer.fit_transform(dataset.TITLE)

In [10]:
# X is still the raw TITLE column at this point, hence the 1-D shape
print(X.shape)

(422419,)


In [11]:
# Vectorize the training titles and one-hot encode the training labels.
# Note: this rebinds X_train / y_train to the processed objects, so re-running
# this cell would feed already-processed data back into process_data.
X_train, y_train = process_data(X_train, y_train, vectorizer)
# Display five random rows of the encoded labels as the cell output
y_train.sample(5)

Unnamed: 0,b,e,m,t
262079,0,0,1,0
157083,0,1,0,0
193882,0,1,0,0
311617,0,0,0,1
229906,1,0,0,0


In [None]:
from keras.models import Sequential
from keras.layers import Dense

classifier = Sequential()

# Derive the input width from the fitted vectorizer rather than hard-coding the
# vocabulary size, so the network always matches the TF-IDF feature matrix.
n_features = len(vectorizer.vocabulary_)

# First fully connected layer (it also declares the input shape)
classifier.add(Dense(units=50, activation='relu',
                     kernel_initializer='uniform',
                     input_shape=(n_features,)))

# Four hidden layers, all configured identically
for _layer_idx in range(4):
    classifier.add(Dense(units=50, activation='relu',
                         kernel_initializer='uniform'))

# Output layer: four nodes, one per news category. Each headline belongs to
# exactly one category, so softmax (a probability distribution over classes)
# is the activation that pairs with categorical_crossentropy; sigmoid scores
# each class independently and does not sum to 1.
classifier.add(Dense(units=4, activation='softmax',
                     kernel_initializer='uniform'))

classifier.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
# Train on the processed training set: 7 epochs, mini-batches of 100 rows
classifier.fit(X_train, y_train, epochs=7, batch_size=100)

Epoch 1/7
Epoch 2/7

In [None]:
# Create files to store the model configuration and weights
# so that the model can be rebuilt from the files
# rather than having to be retrained.
# Thanks to https://machinelearningmastery.com/save-load-keras-deep-learning-models/
import time

# Timestamp the filenames so successive runs do not overwrite each other
timestr = time.strftime("%Y%m%d-%H%M%S")
config_filename = "model_config" + timestr + ".json"
weights_filename = "model_weights" + timestr + ".h5"

# Serialize the model architecture to JSON
# (fixed: the original referenced the misspelled name `clasifier`, a NameError)
classifier_json = classifier.to_json()
with open(config_filename, "w") as json_file:
    json_file.write(classifier_json)

# Serialize the weights to HDF5
classifier.save_weights(weights_filename)


In [None]:
# Apply the same preprocessing to the held-out split.
# Note: this rebinds X_test / y_test to the processed objects, so the cell is
# not safe to re-run without first re-running the train/test split cell.
X_test, y_test = process_data(X_test, y_test, vectorizer)

# evaluate() returns [loss, accuracy] here because the model was compiled
# with metrics=['accuracy']; index 1 is therefore the accuracy.
scores = classifier.evaluate(X_test, y_test)
print("Model accuracy on test data = {} ".format(scores[1]))

In [None]:
def get_predicted_headline_category(prediction):
    """Return the category name for a classifier prediction.

    The position of the largest value in ``prediction`` (via ``argmax``) is
    looked up in a fixed index-to-name table and the matching name is returned.
    """
    category_names = ('Business', 'Entertainment', 'Health', 'Science and Technology')
    # Build the index -> name lookup from the ordered tuple of names
    index_to_name = dict(enumerate(category_names))
    winning_index = prediction.argmax()
    return index_to_name[winning_index]

In [None]:
# Scrape some top story article headlines and run them through our classifier
# Thanks to https://www.w3resource.com/python-exercises/basic/python-basic-1-exercise-8.php

import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen

# URLs of known topics
business_news_url="https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en&gl=US"
tech_news_url = "https://news.google.com/news/rss/headlines/section/topic/TECHNOLOGY?ned=us&hl=en&gl=US"
science_news_url = "https://news.google.com/news/rss/headlines/section/topic/SCIENCE?ned=us&hl=en&gl=US"
health_news_url = "https://news.google.com/news/rss/headlines/section/topic/HEALTH?ned=us&hl=en&gl=US"
entertainment_news_url = "https://news.google.com/news/rss/headlines/section/topic/ENTERTAINMENT?ned=us&hl=en&gl=US"

# Feed to classify; swap in any of the URLs above to try another topic
news_url = health_news_url

# The context manager closes the connection even if read() raises,
# which the previous explicit close() call did not guarantee.
with urlopen(news_url) as response:
    xml_page = response.read()

soup_page = soup(xml_page, "xml")
# find_all is the current BeautifulSoup name for the deprecated findAll alias
news_list = soup_page.find_all("item")

for news in news_list:
    # Store the headline string
    headline = news.title.text

    # Vectorize the headline string so that the classifier can make a prediction
    vectorized_headline = vectorizer.transform([headline])

    # Make a prediction and get the resulting category
    prediction = classifier.predict(vectorized_headline)
    predicted_category = get_predicted_headline_category(prediction)

    # Print [<Prediction>] <Headline>
    print("[{}] {} \n".format(predicted_category, headline))