<a href="https://colab.research.google.com/github/OmarAlsaqa/TextClassification/blob/master/Text_Classification_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install torch<=1.2.0
#!pip install torchtext
!pip install torchtext==0.4
%matplotlib inline

/bin/bash: =1.2.0: No such file or directory


In [0]:
import torch
import torchtext
from torchtext.datasets import text_classification
NGRAMS = 2
import os
if not os.path.isdir('./.data'):
	os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

120000lines [00:09, 13165.53lines/s]
120000lines [00:18, 6541.99lines/s]
7600lines [00:01, 6889.99lines/s]


In [0]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [0]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [0]:
def generate_batch(batch):
    label = torch.tensor([entry[0] for entry in batch])
    text = [entry[1] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]

    
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    return text, offsets, label

In [0]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()
    
    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)

In [0]:

import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 28 seconds
	Loss: 0.0262(train)	|	Acc: 84.8%(train)
	Loss: 0.0001(valid)	|	Acc: 89.3%(valid)
Epoch: 2  | time in 0 minutes, 28 seconds
	Loss: 0.0118(train)	|	Acc: 93.7%(train)
	Loss: 0.0000(valid)	|	Acc: 90.8%(valid)
Epoch: 3  | time in 0 minutes, 28 seconds
	Loss: 0.0069(train)	|	Acc: 96.4%(train)
	Loss: 0.0000(valid)	|	Acc: 90.4%(valid)
Epoch: 4  | time in 0 minutes, 28 seconds
	Loss: 0.0038(train)	|	Acc: 98.1%(train)
	Loss: 0.0000(valid)	|	Acc: 90.7%(valid)
Epoch: 5  | time in 0 minutes, 28 seconds
	Loss: 0.0022(train)	|	Acc: 99.0%(train)
	Loss: 0.0000(valid)	|	Acc: 91.0%(valid)


In [0]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0002(test)	|	Acc: 90.7%(test)


In [0]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item() + 1

ex_text_str = "Lift-off went smoothly, with the Atlas V rocket soaring from Florida just before sunrise and heading for the International Space Station, where it is due to arrive tomorrow.But just half an hour into its journey, Boeing reported that the capsule's insertion into orbit was not normal.Flight controllers are understood to be considering their options and insist the capsule is in a stable orbit.Inside, is a test dummy named Rosie, which is sitting in the commander's seat.Rosie has been dressed in a red polka-dot bandana and a royal blue spacesuit. It is named after the bicep-flexing riveter on Second World War posters."
#ex_text_str = "The church, built on the spot where Jesus is believed to have been born, was at imminent risk of collapse six years ago but has now been saved and is safe for generations to come.It is very sensitive and it was great to work here, project manager Afif Tweme says as he guides me around the building."
#ex_text_str = "Arteta left his post as Manchester City assistant manager, working under Pep Guardiola, last week to succeed Unai Emery as Arsen"
#ex_text_str = "Non-political posts went up in the normal way. Was this Conservative censorship? To test that theory, Mr See ran an experiment.Don't vote for Jeremy Corbyn, he posted. Then: Make sure you vote.Neither post was published. Perhaps, he thought, it wasnt the content of the posts that was the problem, but the word vote."
vocab = train_dataset.get_vocab()
model = model.to("cpu")

print("This is a %s news" %ag_news_label[predict(ex_text_str, model, vocab, 2)])

This is a Sci/Tec news
