<a href="https://colab.research.google.com/github/VGODIE/ML_kaggle_competitions/blob/master/news_classification_with_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score

import numpy as np

import matplotlib.pyplot as plt
import collections

import torch
from torch import nn
from torch.nn import functional as F

import re
import copy
import traceback
import datetime

In [0]:
# Load the official "train" and "test" splits of the 20 Newsgroups corpus
# (train first, then test).
train_source, test_source = (
    fetch_20newsgroups(subset=split_name) for split_name in ("train", "test")
)

In [0]:
# A token is a maximal run of word characters (letters, digits, underscore).
TOKEN_RE = re.compile(r'[\w\d]+')

def tokenize_text_simple_regex(txt, min_token_size=4):
  """Lower-case `txt` and split it into regex tokens.

  Args:
    txt: text to tokenize.
    min_token_size: tokens shorter than this many characters are dropped.

  Returns:
    List of tokens in order of appearance.
  """
  kept_tokens = []
  for match in TOKEN_RE.finditer(txt.lower()):
    candidate = match.group()
    if len(candidate) >= min_token_size:
      kept_tokens.append(candidate)
  return kept_tokens

def tokenize_corpus(texts, tokenizer=tokenize_text_simple_regex):
  """Tokenize every document in `texts`.

  Args:
    texts: iterable of documents.
    tokenizer: callable applied to each document; its return value becomes
      that document's entry in the result.

  Returns:
    List with one tokenizer result per document, in input order.
  """
  return list(map(tokenizer, texts))

In [0]:
# Preview: tokens of the first training document, joined back with spaces.
" ".join(tokenize_text_simple_regex(train_source["data"][0]))

In [0]:
# Token lists for every training document (default regex tokenizer).
train_tokenized = tokenize_corpus(texts=train_source["data"])

In [0]:
# Token lists for every test document (default regex tokenizer).
test_tokenized = tokenize_corpus(texts=test_source["data"])

In [0]:
def build_vocabulary(tokenized_texts,
                     max_size=1000000,
                     max_doc_freq=0.8,
                     min_count=5,
                     pad_word=None):
  """Build a word -> id mapping and the per-word document frequencies.

  A word's count is the number of documents it occurs in (repeated
  occurrences inside one document count once).

  Args:
    tokenized_texts: iterable of token lists, one per document.
    max_size: upper bound on the vocabulary size, including the pad word.
    max_doc_freq: words occurring in a larger share of documents are dropped.
    min_count: words occurring in fewer documents are dropped.
    pad_word: optional padding token; when given it always gets id 0 and a
      document frequency of 0.

  Returns:
    (word2id, word2freq): `word2id` maps each kept word to a consecutive id,
    most frequent words first (ties broken alphabetically so the ids are
    reproducible between runs); `word2freq` is a float32 array where
    `word2freq[word2id[w]]` is the share of documents containing `w`.
  """
  doc_counts = collections.Counter()
  doc_n = 0
  for txt in tokenized_texts:
    doc_n += 1
    # set(): count each word at most once per document
    doc_counts.update(set(txt))

  # pad_word is excluded here so it cannot appear twice in the vocabulary,
  # which would make word2id shorter than word2freq.
  kept = [(word, cnt) for word, cnt in doc_counts.items()
          if word != pad_word
          and cnt >= min_count and cnt / doc_n <= max_doc_freq]
  kept.sort(key=lambda pair: (-pair[1], pair[0]))

  if pad_word is not None:
    kept = [(pad_word, 0)] + kept

  # Truncate after the pad word was added so max_size bounds the final size.
  kept = kept[:max_size]

  # max(..., 1) keeps an empty corpus from dividing by zero
  total_docs = max(doc_n, 1)
  word2id = {word: i for i, (word, _) in enumerate(kept)}
  word2freq = np.array([cnt / total_docs for _, cnt in kept], dtype="float32")

  return word2id, word2freq

In [0]:
import scipy

def vectorize_texts(tokenized_texts, word2id,word2freq,  mode="tfidf", scale=True):
  """Turn token lists into a sparse document-by-word weight matrix.

  Each stored weight is
  (occurrences of the word in the document / number of in-vocabulary tokens
  in the document) / document frequency of the word.

  Args:
    tokenized_texts: sequence of token lists, one per document.
    word2id: mapping word -> column index; unknown tokens are skipped.
    word2freq: array of document frequencies indexed by column.
    mode: accepted for interface compatibility; currently ignored, the
      weighting above is always used.
    scale: when true, divide all weights by (largest weight + 1e-6).

  Returns:
    float32 CSR matrix of shape (len(tokenized_texts), len(word2id)).
    Documents without in-vocabulary tokens and words with a document
    frequency of 0 (e.g. a pad word) produce zeros instead of inf.
  """
  n_docs, n_words = len(tokenized_texts), len(word2id)

  # Collect one (row, col) pair per in-vocabulary token occurrence; the COO
  # constructor plus sum_duplicates() turns repeats into counts. This avoids
  # the very slow per-element updates of a dok_matrix.
  row_idx, col_idx = [], []
  for doc_i, text in enumerate(tokenized_texts):
    for token in text:
      col = word2id.get(token)
      if col is not None:
        row_idx.append(doc_i)
        col_idx.append(col)

  counts = scipy.sparse.coo_matrix(
      (np.ones(len(row_idx), dtype='float32'),
       (np.asarray(row_idx, dtype=np.intp), np.asarray(col_idx, dtype=np.intp))),
      shape=(n_docs, n_words), dtype='float32')
  counts.sum_duplicates()

  # Reciprocals with zeros mapped to 0 instead of inf.
  row_sums = np.asarray(counts.sum(axis=1)).ravel().astype('float32')
  inv_row_sums = np.divide(1.0, row_sums,
                           out=np.zeros_like(row_sums), where=row_sums > 0)
  doc_freq = np.asarray(word2freq, dtype='float32')
  inv_doc_freq = np.divide(1.0, doc_freq,
                           out=np.zeros_like(doc_freq), where=doc_freq > 0)

  # Scale every stored entry (i, j) by 1/row_sum[i] and 1/doc_freq[j].
  counts.data *= inv_row_sums[counts.row] * inv_doc_freq[counts.col]
  result = counts.tocsr()

  if scale and result.nnz > 0:
    # All weights are non-negative, so the matrix minimum is 0 whenever at
    # least one entry is absent; min-max scaling reduces to dividing by the
    # maximum and keeps the matrix sparse.
    result.data /= (result.max() + 1e-6)

  return result

In [0]:
# Vocabulary filters: drop words found in more than MAX_DF of the documents
# or in fewer than MIN_COUNT documents.
MAX_DF = 0.8
MIN_COUNT = 5
# MIN_COUNT is passed explicitly so that editing the constant takes effect.
vocabulary, word_doc_freq = build_vocabulary(train_tokenized,
                                             max_doc_freq=MAX_DF,
                                             min_count=MIN_COUNT)

In [0]:
VECTORIZATION_MODE = "tfidf"

# The mode constant is forwarded so the configuration is not silently unused.
# NOTE: with the default scale=True each matrix is divided by its own maximum,
# so the train and test features are scaled by different constants.
train_vectorized = vectorize_texts(train_tokenized, vocabulary, word_doc_freq,
                                   mode=VECTORIZATION_MODE)
test_vectorized = vectorize_texts(test_tokenized, vocabulary, word_doc_freq,
                                  mode=VECTORIZATION_MODE)


In [0]:
# Model dimensions: number of feature columns and number of distinct labels.
UNIQUE_WORDS = train_vectorized.shape[-1]
UNIQUE_LABELS = np.unique(train_source["target"]).size

In [0]:
from torch.utils.data import Dataset

class SparseFeautureDataset(Dataset):
  """Dataset serving rows of a sparse feature matrix as dense tensors.

  `features` must support `.shape` and row indexing that returns an object
  with `.toarray()`; `targets` is indexed with the same position.
  """

  def __init__(self, features, targets):
    self.features, self.targets = features, targets

  def __len__(self):
    n_samples = self.features.shape[0]
    return n_samples

  def __getitem__(self, idx):
    """Return `(features, label)` for sample `idx` as float32 / int64 tensors."""
    # Densify only the requested row; the full matrix stays sparse.
    dense_row = self.features[idx].toarray()[0]
    label = np.asarray(self.targets[idx])
    return (torch.from_numpy(dense_row).to(torch.float32),
            torch.from_numpy(label).to(torch.int64))

In [0]:
# The constructor requires both the feature matrix and the targets; calling it
# without arguments raised a TypeError and broke "Run All".
ds = SparseFeautureDataset(train_vectorized, train_source["target"])

In [0]:
# Wrap each split's sparse features and integer labels into a torch Dataset.
train_dataset, test_dataset = (
    SparseFeautureDataset(split_features, split_source["target"])
    for split_features, split_source in (
        (train_vectorized, train_source),
        (test_vectorized, test_source),
    )
)

In [0]:
from torch.utils.data import DataLoader

def lr_scheduler(optim):
  """Create a ReduceLROnPlateau scheduler for `optim`.

  The scheduler uses a patience of 5 and a reduction factor of 0.5.

  Args:
    optim: the optimizer whose learning rate is scheduled.

  Returns:
    A `torch.optim.lr_scheduler.ReduceLROnPlateau` instance.
  """
  plateau_cls = torch.optim.lr_scheduler.ReduceLROnPlateau
  try:
    return plateau_cls(optim, patience=5, factor=0.5, verbose=True)
  except TypeError:
    # `verbose` is deprecated in recent PyTorch releases; if the installed
    # version rejects it, build the same scheduler without it.
    return plateau_cls(optim, patience=5, factor=0.5)

def train_eval_loop(model, train_dataset, val_dataset, loss_func,
                    lr=1e-4, epochs=10, batch_size=32,
                    max_batches_per_epoch_train = 1000,
                    max_batches_per_epoch_val = 1000,
                    early_stopping_patience=10, l2_reg_alpha = 0,
                    data_loader=DataLoader, optimizer=None, shuffle_train=True,
                    lr_scheduler=None, device="cuda"):
  """Train `model`, validate after every epoch and keep the best snapshot.

  Args:
    model: torch module to train; it is moved to `device` in place.
    train_dataset: dataset yielding `(features, target)` pairs for training.
    val_dataset: dataset yielding `(features, target)` pairs for validation.
    loss_func: callable `loss_func(prediction, target)` returning a scalar loss.
    lr: learning rate of the default Adam optimizer.
    epochs: maximum number of epochs.
    batch_size: batch size of both data loaders.
    max_batches_per_epoch_train: at most this many training batches per epoch.
    max_batches_per_epoch_val: at most this many validation batches per epoch.
    early_stopping_patience: stop once the validation loss has not improved
      for more than this many epochs.
    l2_reg_alpha: weight decay of the default Adam optimizer.
    data_loader: loader factory, called as
      `data_loader(dataset, batch_size=..., shuffle=...)`.
    optimizer: ready-made optimizer; when None, Adam is created from `lr`
      and `l2_reg_alpha`.
    shuffle_train: whether the training loader shuffles.
    lr_scheduler: None, a factory `f(optimizer) -> scheduler`, or a scheduler
      instance. Its `step(mean_val_loss)` is called after every epoch.
    device: target device; a CUDA device falls back to CPU when CUDA is not
      available.

  Returns:
    `(best_val_loss, best_model)`: the lowest mean validation loss and a deep
    copy of the model taken at that epoch.

  A KeyboardInterrupt or any exception raised inside an epoch is reported and
  ends training early; the best snapshot found so far is still returned.
  """
  device = torch.device(device)
  if device.type == "cuda" and not torch.cuda.is_available():
    # The default is "cuda"; degrade gracefully on CPU-only machines.
    print("CUDA is not available, falling back to CPU")
    device = torch.device("cpu")
  model.to(device)

  if optimizer is None:
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=l2_reg_alpha)

  # Scheduler instances are not callable, factories are: accept both.
  scheduler = lr_scheduler(optimizer) if callable(lr_scheduler) else lr_scheduler

  train_dataloader = data_loader(train_dataset, batch_size=batch_size, shuffle=shuffle_train)
  val_dataloader = data_loader(val_dataset, batch_size=batch_size, shuffle=False)

  best_val_loss = float('inf')
  best_epoch_i = 0
  best_model = copy.deepcopy(model)

  for epoch in range(epochs):
    try:
      epoch_start = datetime.datetime.now()
      print('Epoch {}'.format(epoch))

      model.train()
      mean_train_loss = 0
      train_batches = 0
      for step, (batch_x, batch_y) in enumerate(train_dataloader):
        # ">=" so that exactly max_batches_per_epoch_train batches are used
        if step >= max_batches_per_epoch_train:
          break

        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        pred = model(batch_x)
        loss = loss_func(pred, batch_y)

        model.zero_grad()
        loss.backward()
        optimizer.step()

        mean_train_loss += float(loss)
        train_batches += 1

      mean_train_loss /= train_batches
      print("Epoch: {} iterations, {:0.2f} sec".format(train_batches, (datetime.datetime.now()
        - epoch_start).total_seconds()))
      print("Mean train loss =", mean_train_loss)

      model.eval()
      mean_val_loss = 0
      val_batches = 0

      with torch.no_grad():
        for step, (batch_x, batch_y) in enumerate(val_dataloader):
          if step >= max_batches_per_epoch_val:
            break

          batch_x = batch_x.to(device)
          batch_y = batch_y.to(device)

          pred = model(batch_x)
          loss = loss_func(pred, batch_y)

          mean_val_loss += float(loss)
          val_batches += 1

      mean_val_loss /= val_batches
      print("Mean val loss =", mean_val_loss)

      # Early stopping is tracked in epochs, not in batch steps.
      if mean_val_loss < best_val_loss:
        best_epoch_i = epoch
        best_val_loss = mean_val_loss
        best_model = copy.deepcopy(model)
        print("New best model ! PUSHKA")
      elif epoch - best_epoch_i > early_stopping_patience:
        print("Model hasn't evolved within last {} epochs, training stops, wayaaa".format(
            early_stopping_patience
          ))
        break

      if scheduler is not None:
        scheduler.step(mean_val_loss)
      print()
    except KeyboardInterrupt:
      print("Keyboard interruption")
      break
    except Exception as ex:
      print("Failure during training: {}\n{}".format(ex, traceback.format_exc()))
      break
  return best_val_loss, best_model

# NOTE: `lr_scheduler` is already defined above `train_eval_loop` in this cell.
# The byte-identical second definition that used to be here was removed so the
# name has a single definition.

In [0]:
# Linear classifier: one weight per (vocabulary word, class) pair.
model = nn.Linear(in_features=UNIQUE_WORDS, out_features=UNIQUE_LABELS)

# The test split is used as the validation set for model selection.
best_val_loss, best_model = train_eval_loop(
    model, train_dataset, test_dataset, F.cross_entropy,
    lr=1e-1, epochs=10, batch_size=32, l2_reg_alpha=0,
    lr_scheduler=lr_scheduler,
)