In [2]:
import torch
import pandas as pd
import numpy as np
import sklearn
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Restrict the 20 Newsgroups corpus to the five topics listed below
categories = [
    "comp.graphics",
    "sci.space",
    "rec.sport.baseball",
    "talk.politics.guns",
    "soc.religion.christian",
]

# Fetch the train split first, then the test split, with the same category filter
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

print('total texts in train:', len(newsgroups_train.data))
print('total texts in test:', len(newsgroups_test.data))

total texts in train: 2919
total texts in test: 1942


In [4]:
# Count every lower-cased, single-space-delimited token, walking the training
# texts first and the test texts second so first-seen order is train-then-test.
vocab = Counter(
    word.lower()
    for corpus in (newsgroups_train.data, newsgroups_test.data)
    for text in corpus
    for word in text.split(' ')
)

# Number of distinct tokens across both splits
total_words = len(vocab)

def get_word_2_index(vocab):
    """Map each lower-cased entry of ``vocab`` to its enumeration position.

    ``vocab`` is iterated once; entry number ``i`` is stored under its
    lower-cased form. If two entries share a lower-cased form, the position
    of the later one is kept.
    """
    return {token.lower(): position for position, token in enumerate(vocab)}

# Build the token -> index lookup from the vocabulary counter.
word2index = get_word_2_index(vocab)

In [5]:
# Sanity checks, one per line: size of the lookup table, the index assigned
# to 'the', and the vocabulary size.
for value in (len(word2index), word2index["the"], total_words):
    print(value)

196609
72
196609


In [6]:
def get_batch(df, i, batch_size, vocab_index=None, vocab_size=None):
    """Build the ``i``-th mini-batch of bag-of-words vectors and class labels.

    Parameters
    ----------
    df : object
        Must expose ``data`` (a sliceable sequence of strings) and ``target``
        (a sliceable sequence of integer class ids of the same length).
    i : int
        Zero-based batch number; the batch covers items
        ``[i * batch_size, (i + 1) * batch_size)``.
    batch_size : int
        Maximum number of documents in the batch (the last batch may be shorter).
    vocab_index : dict, optional
        Token -> column index mapping. Defaults to the module-level
        ``word2index``.
    vocab_size : int, optional
        Number of feature columns. Defaults to the module-level ``total_words``
        when ``vocab_index`` is not given, otherwise to one more than the
        largest index in ``vocab_index``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        A float array of shape ``(n_docs, vocab_size)`` holding token counts,
        and an int64 array of shape ``(n_docs,)`` holding the class ids exactly
        as stored in ``df.target``.

    Raises
    ------
    KeyError
        If a token of a document is missing from the vocabulary mapping.
    """
    if vocab_index is None:
        vocab_index = word2index
        if vocab_size is None:
            vocab_size = total_words
    elif vocab_size is None:
        vocab_size = max(vocab_index.values(), default=-1) + 1

    start = i * batch_size
    stop = start + batch_size
    texts = df.data[start:stop]
    targets = df.target[start:stop]

    # Pre-allocating the matrix keeps the result 2-D even for an empty batch.
    features = np.zeros((len(texts), vocab_size), dtype=float)
    for row, text in enumerate(texts):
        # Tokenisation must match the one used to build the vocabulary:
        # split on single spaces, then lower-case.
        for word in text.split(' '):
            features[row, vocab_index[word.lower()]] += 1

    # Labels are passed through unchanged, so any number of classes works and
    # an unexpected label is never silently rewritten to another class.
    labels = np.array(targets, dtype=np.int64)

    return features, labels

In [7]:
# --- Optimisation settings ---
learning_rate = 0.01   # step size handed to the optimizer
num_epochs = 10        # number of passes over the training data
batch_size = 150       # documents per mini-batch
display_step = 1       # not referenced by the cells shown in this notebook

# --- Model dimensions ---
input_size = total_words   # one input feature per vocabulary entry
hidden_size = 100          # width shared by the first and second hidden layers
num_classes = 5            # Categories: "graphics","space","baseball","guns", "christian"

In [8]:
import torch.nn as nn

In [9]:
# Feed-forward classifier: two hidden ReLU layers followed by a linear output layer
class News_20_Net(nn.Module):
    """Three-layer fully connected network.

    ``input_size`` features are projected to ``hidden_size`` units, passed
    through a second ``hidden_size``-wide layer, and finally mapped to
    ``num_classes`` output scores. ReLU follows each of the first two layers;
    the output layer has no activation.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.layer_1 = nn.Linear(input_size, hidden_size, bias=True)
        self.relu = nn.ReLU()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        """Return the output-layer scores for the input batch ``x``."""
        hidden_1 = self.relu(self.layer_1(x))
        hidden_2 = self.relu(self.layer_2(hidden_1))
        return self.output_layer(hidden_2)

In [12]:
news_net = News_20_Net(input_size, hidden_size, num_classes)
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  # applies the softmax itself, so the network outputs raw scores
optimizer = torch.optim.Adam(news_net.parameters(), lr=learning_rate)

# Number of mini-batches per epoch. Ceiling division keeps the trailing partial
# batch, so every training document is used; it is the same for every epoch and
# is therefore computed once, outside the loop.
num_train = len(newsgroups_train.data)
total_batch = (num_train + batch_size - 1) // batch_size

# Train the Model
for epoch in range(num_epochs):
    # Loop over all batches
    for i in range(total_batch):
        batch_x, batch_y = get_batch(newsgroups_train, i, batch_size)
        articles = torch.FloatTensor(batch_x)
        labels = torch.LongTensor(batch_y)

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # clear gradients accumulated by the previous step
        outputs = news_net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the loss every fourth step
        if (i + 1) % 4 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, total_batch, loss.item()))

KeyboardInterrupt: 

In [None]:
# Print the name and current values of every parameter that requires gradients
trainable = (
    (param_name, tensor)
    for param_name, tensor in news_net.named_parameters()
    if tensor.requires_grad
)
for name, param in trainable:
    print("Name--->", name, "\nValues--->", param.data)

In [None]:
# Test the Model
total_test_data = len(newsgroups_test.target)
# Vectorise the whole test set as one batch (batch 0 of size total_test_data)
batch_x_test, batch_y_test = get_batch(newsgroups_test, 0, total_test_data)
articles = torch.FloatTensor(batch_x_test)
labels = torch.LongTensor(batch_y_test)

# Inference only: no_grad avoids building the autograd graph for this large batch
with torch.no_grad():
    outputs = news_net(articles)
# Predicted class = index of the highest score in each row
_, predicted = torch.max(outputs, 1)

total = labels.size(0)
correct = (predicted == labels).sum().item()  # plain int rather than a 0-dim tensor
# Report the actual number of evaluated articles instead of a hard-coded count
print('Accuracy of the network on the %d test articles: %d %%' % (total, 100 * correct // total))

# Assignment <span style="color:red">option Four</span> - News Categorization  using PyTorch
Download the dataset from https://www.kaggle.com/uciml/news-aggregator-dataset and develop a news classification (categorization) model. The dataset contains only the titles of news items and some metadata. Each news item belongs to one of four categories: <span style="color:red">b</span>: business, <span style="color:red">t</span>: science and technology, <span style="color:red">e</span>: entertainment, and <span style="color:red">m</span>: health.

1. Prepare training and test datasets: Split the data into a training set and a test set (80% train, 20% test). Make sure the split is balanced across categories; otherwise, for example, if the training set contains only <span  style="color:red">b</span> items, your model will fail to predict the <span  style="color:red">t</span> items in the test set.
2. Binary classification: produce training data for each pair of categories, such as <span  style="color:red">b</span> and <span  style="color:red">t</span>, <span  style="color:red">b</span> and <span  style="color:red">m</span>, <span  style="color:red">e</span> and <span  style="color:red">t</span>, and so on. Evaluate the performance and report which category pairs are easier for the models.
3. Adapt the Text Categorization PyTorch code (see above) and evaluate the performance of the system on these tasks.
4. Use pre-trained embeddings and compare your results. When you use pre-trained embeddings, you have to average the word embeddings of the tokens in each document to get a single representation of the document: DOC_EMBEDDING = (TOKEN1_EMBEDDING + ... + TOKENn_EMBEDDING) / n, where n is the number of tokens in the document. You can also use some of the <span  style="color:red">spacy/FLAIR </span>document embedding methods.
5. Report the recall, precision, and F1 scores for both binary and multi-class classification.

In [None]:
# Task 1: 80/20 train/test split of the news-aggregator data. Stratifying on
# CATEGORY keeps the class proportions the same in both parts, so no category
# ends up only in the training set or only in the test set.
from sklearn.model_selection import train_test_split

path = './uci-news-aggregator.csv'
df = pd.read_csv(path)

train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,          # fixed seed so the split is reproducible
    stratify=df['CATEGORY'],
)

In [None]:
# Task 2: enumerate every unordered pair of the categories b, t, e, m and, for
# each pair, keep only the train/test rows whose CATEGORY is one of the two.
category_codes = ['b', 't', 'e', 'm']
combinations = [
    (first, second)
    for position, first in enumerate(category_codes)
    for second in category_codes[position + 1:]
]
for c in combinations:
    pair = [c[0], c[1]]
    train_c = train[train['CATEGORY'].isin(pair)]
    test_c = test[test['CATEGORY'].isin(pair)]
