## Load Data

In [1]:
# Files this notebook expects in the working directory. When running on
# Google Colab, upload them through the dialog opened below:
#   news_small_with_splits.csv
#   vocabulary.py
#   vectorizer.py
#   dataset.py
#   model.py
#   training.py
#   trainer.py

try:
    from google.colab import files
except ImportError:
    # Not running on Colab: the files are expected to already be present
    # locally, so there is nothing to upload.
    files = None

# On Colab this opens the upload dialog; elsewhere it is an empty mapping so
# the rest of the notebook can still run top to bottom.
uploaded_files = files.upload() if files is not None else {}

Saving trainer.py to trainer.py


In [2]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

from dataset import NewsDataset, generate_batches
from training import set_seed_everywhere, handle_dirs, make_train_state, update_train_state, compute_accuracy, load_glove_from_file, make_embedding_matrix
from model import NewsClassifier
from trainer import NewsClassifierTrainer

# Display whether PyTorch can see a CUDA device in this runtime
torch.cuda.is_available()

False

In [3]:
# Central configuration for the whole notebook. Every tunable lives here so a
# run can be reproduced from this single cell.
args = Namespace(
    # Data and path hyperparameters
    news_csv="news_small_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    # Forward slash on purpose: "\d" is an invalid escape sequence, and on
    # Linux/Colab a backslash is a literal filename character rather than a
    # path separator. "/" works on Windows as well.
    save_dir="model_storage/document_classification",
    # Model hyperparameters
    glove_filepath='Glove/glove.6B.100d.txt',
    use_glove=False,
    embedding_size=100,
    hidden_dim=100,
    num_channels=100,
    # Training hyperparameters
    seed=1337,
    learning_rate=0.001,
    dropout_p=0.1,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=5,
    # Runtime options
    # `cuda` / `device` are the *requested* values; the setup cell switches
    # them to CPU when no CUDA device is available.
    cuda=True,
    device='cuda',
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    # When True, vectorizer_file and model_state_file are prefixed with save_dir
    expand_filepaths_to_save_dir=True
)

## Generate Model

In [4]:
# Prefix the artefact filenames with the save directory when requested
if args.expand_filepaths_to_save_dir:
    for path_attr in ("vectorizer_file", "model_state_file"):
        expanded = os.path.join(args.save_dir, getattr(args, path_attr))
        setattr(args, path_attr, expanded)

    print("Expanded filepaths: ")
    for expanded_path in (args.vectorizer_file, args.model_state_file):
        print("\t{}".format(expanded_path))

# Downgrade the CUDA request when no device is available, then resolve the device
if not torch.cuda.is_available():
    args.cuda = False
device_name = "cuda" if args.cuda else "cpu"
args.device = torch.device(device_name)
print("Using CUDA: {}".format(args.cuda))

# Seed the random number generators (helper imported from training.py)
set_seed_everywhere(args.seed, args.cuda)

# Prepare the save directory (helper imported from training.py;
# presumably creates it when missing -- confirm in training.py)
handle_dirs(args.save_dir)

Expanded filepaths: 
	model_storage\document_classification/vectorizer.json
	model_storage\document_classification/model.pth
Using CUDA: False


In [5]:
# Build the dataset. A fresh run fits a new vectorizer and saves it; a resumed
# run reloads the vectorizer that was saved earlier.
if not args.reload_from_files:
    dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv)
    dataset.save_vectorizer(args.vectorizer_file)
else:
    dataset = NewsDataset.load_dataset_and_load_vectorizer(args.news_csv, args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Embedding initialisation: GloVe vectors for the title vocabulary, or None
# so that the classifier receives no pre-trained matrix.
if not args.use_glove:
    print("Not using pre-trained embeddings")
    embeddings = None
else:
    words = vectorizer.title_vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args.glove_filepath, words=words)
    print("Using pre-trained embeddings")

# Collect the constructor arguments in one place before instantiating the model
classifier_kwargs = dict(
    embedding_size=args.embedding_size,
    num_embeddings=len(vectorizer.title_vocab),
    num_channels=args.num_channels,
    hidden_dim=args.hidden_dim,
    num_classes=len(vectorizer.category_vocab),
    dropout_p=args.dropout_p,
    pretrained_embeddings=embeddings,
    padding_idx=0,
)
classifier = NewsClassifier(**classifier_kwargs)

Not using pre-trained embeddings


In [None]:
# Optional: decompress the GloVe archive (uses gzip + shutil, not tarfile)
# import gzip
# import shutil

# open file
# with gzip.open('glove.6B.100d.txt.gz', 'rb') as f_in:
#    with open('./Glove/glove.6B.100d.txt', 'wb') as f_out:
#        shutil.copyfileobj(f_in, f_out)


In [6]:
# Size of the title vocabulary (the value passed as num_embeddings to
# NewsClassifier) -- not the number of classes, which is
# len(vectorizer.category_vocab)
len(vectorizer.title_vocab)
# vectorizer.title_vocab.lookup_index(19)

2419

## Training Loop

In [7]:
# Move the model to the selected device before the optimizer captures its parameters
classifier = classifier.to(args.device)

# Class-weighted cross entropy. The weight tensor is held by the loss module,
# so moving the module puts the weights on the same device as the model
# outputs; without this a GPU run can fail with a device mismatch.
# NOTE(review): if NewsClassifierTrainer already moves the loss, this is a no-op.
loss_func = nn.CrossEntropyLoss(dataset.class_weights).to(args.device)

optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate when the monitored quantity (mode='min') has not
# improved for more than `patience` epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)

# Train, then report the loss and accuracy on the test split
text_classifier = NewsClassifierTrainer(classifier, dataset, loss_func, optimizer, scheduler, args)
text_classifier.train()
text_classifier.evaluate()

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/5 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

Test loss: 1.6849069595336914;
Test Accuracy: 39.0625


## Inference

In [8]:
# Normalise a raw news title before vectorisation
def preprocess_text(text):
    """Lower-case a title and normalise its punctuation and spacing.

    Args:
        text (str): the raw title
    Returns:
        str: the title in lower case, with ``. , ! ?`` surrounded by spaces
            and every run of other non-letter characters collapsed into a
            single space
    """
    lowered = ' '.join(map(str.lower, text.split(" ")))
    # Put spaces around the punctuation marks that are kept as tokens
    spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    # Replace every run of remaining non-letter characters with one space
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)

In [9]:
def predict_category(title, classifier, vectorizer, max_length):
    """Predict a News category for a new title

    The classifier is run in evaluation mode with gradient tracking disabled,
    so layers such as dropout do not perturb the prediction. The classifier's
    previous train/eval mode is restored before returning.

    Args:
        title (str): a raw title string
        classifier (NewsClassifier): an instance of the trained classifier
        vectorizer (NewsVectorizer): the corresponding vectorizer
        max_length (int): the max sequence length
            Note: CNNs are sensitive to the input data tensor size.
                  This ensures to keep it the same size as the training data
    Returns:
        dict: ``{'category': ..., 'probability': ...}`` where ``category`` is
            what ``vectorizer.category_vocab.lookup_index`` returns for the
            highest-scoring class index and ``probability`` is that class's
            score from the classifier called with ``apply_softmax=True``
    """
    title = preprocess_text(title)
    vectorized_title = torch.tensor(vectorizer.vectorize(title, vector_length=max_length))

    # Remember the current mode so the call has no lasting effect on the model
    was_training = classifier.training
    classifier.eval()
    try:
        with torch.no_grad():
            # unsqueeze(0) adds the batch dimension for a single title
            result = classifier(vectorized_title.unsqueeze(0), apply_softmax=True)
    finally:
        classifier.train(was_training)

    probability_values, indices = result.max(dim=1)
    predicted_category = vectorizer.category_vocab.lookup_index(indices.item())

    return {'category': predicted_category, 'probability': probability_values.item()}

In [10]:
def get_samples(n_per_category=5):
    """Collect a few validation titles for every category.

    Reads the notebook-level ``dataset`` and uses its ``val_df`` frame.

    Args:
        n_per_category (int): maximum number of titles kept per category;
            the default of 5 matches the previous hard-coded value
    Returns:
        dict: maps each category (in order of first appearance in
            ``dataset.val_df``) to a list of at most ``n_per_category`` titles
    """
    val_df = dataset.val_df
    return {
        category: val_df["title"][val_df["category"] == category].tolist()[:n_per_category]
        for category in val_df["category"].unique()
    }

# Up to five validation titles per category, consumed by the inference loop below
val_samples = get_samples()

In [11]:
# Interactive alternative: title = input("Enter a news title to classify: ")

# Run the demo on CPU; predict_category builds its input tensor without
# specifying a device.
classifier = classifier.to("cpu")

# Sequence length handed to the vectorizer for every title
max_length = dataset._max_seq_length + 1

for truth, titles in val_samples.items():
    print(f"True Category: {truth}")
    print("="*30)
    for sample_title in titles:
        prediction = predict_category(sample_title, classifier, vectorizer, max_length)
        predicted_label = prediction['category']
        predicted_prob = prediction['probability']
        print("Prediction: {} (p={:0.2f})".format(predicted_label, predicted_prob))
        print("\t + Sample: {}".format(sample_title))
    print("-"*30 + "\n")

True Category: Business
Prediction: Sci/Tech (p=0.67)
	 + Sample: airbus enroute to challenge boeing s e 
Prediction: Sci/Tech (p=0.51)
	 + Sample: dollar continues slide vs euro
Prediction: Sci/Tech (p=0.62)
	 + Sample: lawsuits hold good news for investors
Prediction: Sports (p=0.57)
	 + Sample: microsoft settles disputes
Prediction: Business (p=0.58)
	 + Sample: liberty makes nice with news corp . 
------------------------------

True Category: Sci/Tech
Prediction: World (p=0.50)
	 + Sample: group unveils genetically engineered mice ap 
Prediction: Sci/Tech (p=0.94)
	 + Sample: mount st . helens spews more steam and ash reuters 
Prediction: World (p=0.72)
	 + Sample: apple rolls out ipod photo , rocks with u 
Prediction: World (p=0.46)
	 + Sample: sbc , yahoo , wire , and dish network partner for multimedia entertainment monster
Prediction: Sports (p=0.80)
	 + Sample: mp creator warns tech impasse dooming downloads
------------------------------

True Category: Sports
Prediction: Wo