# Assignment <span style="color:red">option Four</span> - News Categorization  using PyTorch 
Download the dataset from https://www.kaggle.com/uciml/news-aggregator-dataset and develop a news classification or categorization model. The dataset contains only the titles of news items and some metadata. Each news item belongs to one of the following categories: –<span  style="color:red"> b</span> : business – <span  style="color:red">t</span> : science and technology – <span  style="color:red">e</span> : entertainment and –<span  style="color:red">m</span> : health. 

1. Prepare training and test dataset: Split the data into training and test set (80% train and 20% test). Make sure they are balanced, otherwise if all <span  style="color:red">b</span> files are on training, your model fails to predict <span  style="color:red">t</span> files in test.
2. Binary classification: produce training data for each two categories, such as <span  style="color:red">b </span> and <span  style="color:red"> t</span>, <span  style="color:red">b</span> and <span  style="color:red"> m</span>, <span  style="color:red">e</span> and <span  style="color:red">t</span> and so on. Evaluate the performance and report which categories are easier for the models.
3. Adapt the Text Categorization PyTorch code (see above) and evaluate the performance of the system for these tasks.
4. Use pre-trained embeddings and compare your results. When you use pre-trained embeddings, you have to average the word embeddings of all tokens in each document to get a single representation of the document: DOC_EMBEDDING = (TOKEN1_EMBEDDING + ... + TOKENn_EMBEDDING) / n. You can also use some of the <span  style="color:red">spacy/FLAIR </span>document embedding methods.
6. Report the recall, precision, and F1 scores for both binary and multi-class classification.
 

# Multiclassification FLASK API

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

# Load the news-aggregator CSV and discard, in place, every metadata column
# that the cells below do not read (they only use TITLE and CATEGORY).
df = pd.read_csv('uci-news-aggregator.csv')

df.drop(
    columns=['URL', 'PUBLISHER', 'STORY', 'HOSTNAME', 'TIMESTAMP', 'ID'],
    inplace=True,
)

In [50]:
# Expand the single-letter category codes into readable names.
#
# The result is assigned back to the column rather than calling
# ``df.CATEGORY.replace(..., inplace=True)``: an in-place method invoked on a
# column obtained from the frame is a chained assignment, which pandas warns
# about and which leaves ``df`` unchanged under Copy-on-Write.
df['CATEGORY'] = df['CATEGORY'].replace({
    'b': 'business',
    't': 'science and technology',
    'e': 'entertainment',
    'm': 'health',
})

In [51]:
from sklearn.preprocessing import LabelEncoder

# Turn the category names into integer class ids: fit the encoder on the
# CATEGORY column first, then transform that same column.
lb_make = LabelEncoder()
lb_make.fit(df.CATEGORY)
labels = lb_make.transform(df.CATEGORY)

In [52]:
# Store the encoded class ids as a new column next to CATEGORY.
df['labels'] = labels

In [53]:
import numpy as np
from sklearn.model_selection import train_test_split

# 80% train / 20% test with a fixed seed for reproducibility.  The split is
# stratified on the class ids so every category keeps the same proportion in
# both sets, as the assignment requires ("make sure they are balanced").
train, test = train_test_split(
    df, test_size=0.20, random_state=42, stratify=df['labels']
)

In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

# Upper bound on the number of most frequent words, passed to the tokenizer
# as ``num_words``.
MAX_NB_WORDS = 10000
# Upper bound on the number of words per row.
# NOTE(review): not referenced anywhere else in this section.
MAX_SEQUENCE_LENGTH = 150

# Fit a lower-casing tokenizer on every headline and keep its
# word -> integer-index mapping for the feature extraction below.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(df['TITLE'].values)
word2index = tokenizer.word_index
print(f'Found {len(word2index)} unique tokens.')

Found 75285 unique tokens.


In [32]:
def pad_sequence(texts):
    """Convert texts into bag-of-words count vectors.

    Each text is split with ``text_to_word_sequence``; for every resulting
    word, the slot at position ``word2index[word]`` of that text's vector is
    incremented by one.

    Args:
        texts: Iterable of strings to vectorise.

    Returns:
        A list with one ``numpy`` float array of length ``total_words`` per
        input text, in input order.

    Words that have no entry in ``word2index``, or whose index does not fit
    into the vector, are reported with a printed message and skipped.
    """
    batches = []
    for text in texts:
        layer = np.zeros(total_words, dtype=float)

        for word in text_to_word_sequence(text):
            # Only the two expected failures are handled: an unknown word
            # (KeyError) or an index past the end of the vector (IndexError).
            # Anything else is a real bug and must propagate.
            # NOTE(review): Keras documents word_index as 1-based, so the
            # highest index would equal total_words and be skipped here --
            # confirm.  The vector size is left unchanged because the
            # network input layer is sized with total_words.
            try:
                layer[word2index[word.lower()]] += 1
            except (KeyError, IndexError):
                print("word is not included")
        batches.append(layer)

    return batches

In [43]:
import torch.nn as nn
import torch

# Sizes derived from the vocabulary: one input feature per word2index entry.
total_words = len(word2index)
input_size = total_words

# Network dimensions handed to the ANN constructor.
hidden_size = 64
num_classes = 4

# Optimisation settings.
# NOTE(review): presumably consumed by the training loop, which is not part
# of this section -- confirm.
learning_rate = 0.1
num_epochs = 1
batch_size = 2000
display_step = 1

class ANN(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Width of both hidden layers.
        num_classes: Number of scores produced per input vector.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # The attribute names are also the state-dict key prefixes, so they
        # have to stay as they are for saved weights to load.
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for ``x``."""
        first_hidden = self.relu(self.layer_1(x))
        second_hidden = self.relu(self.layer_2(first_hidden))
        return self.output_layer(second_hidden)

##### We observed that the loss does not decrease below 0.08. Therefore, we stop training early after 3 epochs.

In [58]:
# Rebuild the network with the same dimensions and restore the saved weights.
model = ANN(input_size, hidden_size, num_classes)
# map_location='cpu' lets the checkpoint load on a host without a GPU even if
# it was saved from one; the model is never moved off the CPU in this file.
model.load_state_dict(torch.load('mode.pth', map_location='cpu'))
# Switch to inference mode (also echoes the architecture as the cell output).
model.eval()

ANN(
  (layer_1): Linear(in_features=75285, out_features=64, bias=True)
  (relu): ReLU()
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (output_layer): Linear(in_features=64, out_features=4, bias=True)
)

## FLASK API - POST REQUESTS

In [106]:
# Predicted class id (as a string) -> human-readable category name.
labels_news = {
    '0': 'business',
    '1': 'entertainment',
    '2': 'health',
    '3': 'science and technology',
}


def predict_news(text):
    """Return the predicted category name for one news headline.

    Args:
        text: The headline to classify.

    Returns:
        One of the values of ``labels_news``.

    The headline is vectorised with ``pad_sequence`` and scored by the
    module-level ``model``; the class with the highest score wins.
    """
    # Stack into one contiguous float32 array first: building a tensor
    # straight from a list of ndarrays takes a much slower conversion path.
    features = np.asarray(pad_sequence([text]), dtype=np.float32)

    # Pure inference: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        output = model(torch.from_numpy(features))

    pred = str(int(output.argmax(dim=1)))

    return labels_news[pred]

In [107]:
from flask import Flask, render_template, request, abort, jsonify

app = Flask(__name__)


@app.route("/api", methods=["POST"])
def get_json():
    """Classify the headline sent in a JSON POST body.

    Expects a JSON object with a string ``text`` field and responds with the
    predicted category name as JSON.

    Error responses:
        415 -- the request does not declare a JSON content type.
        400 -- the body is not a JSON object, has no ``text`` field, or
               ``text`` is not a string.

    Other HTTP methods never reach this function: the route is registered
    for POST only, so Flask rejects them itself.
    """
    if not request.is_json:
        abort(415)

    # silent=True returns None for an unparsable body instead of raising, so
    # every malformed payload goes through the same 400 branch below.  An
    # empty object ({}) is valid JSON and must be reported as a missing
    # field, not as an unsupported media type.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict) or 'text' not in request_json:
        abort(400, 'JSON data missing text field.')

    text = request_json['text']
    if not isinstance(text, str):
        # The tokenizer only handles strings; reject anything else up front
        # instead of failing with a server error.
        abort(400, 'JSON text field must be a string.')

    return jsonify(predict_news(text))
    
if __name__ == '__main__':
    # Serve the app with werkzeug's development server on localhost:9000.
    from werkzeug.serving import run_simple

    run_simple(hostname='localhost', port=9000, application=app)

 * Running on http://localhost:9000/ (Press CTRL+C to quit)
127.0.0.1 - - [28/Nov/2021 19:40:09] "POST /api HTTP/1.1" 200 -


word is not included


127.0.0.1 - - [28/Nov/2021 19:40:12] "POST /api HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2021 19:41:34] "POST /api HTTP/1.1" 200 -


word is not included
