In [3]:
#!pip install fasttext
#import fasttext as ft
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 4.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 18.6MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 30.4MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K 

## Text Classification Using Various NLP Architectures

+ This notebook shows how to solve a multi-class text classification problem with some of the most popular architectures in NLP

+ Including text-processing using **spaCy** as well as **Keras tokenizers**

+ The architectures used are a **vanilla RNN and a GRU**, both implemented **from scratch**, the PyTorch version of the **LSTM**, and the fancy **Transformers model** from the Hugging Face library

+ The notebook focuses on the implementation and the approach to solving this NLP task; hyperparameter tuning is not addressed here, although it is rather straightforward (in this case)

+ Following this work should allow one to,
  + Preprocess text data using Keras or spaCy or Transformers tokenizers and convert them into word embeddings
  + Use PyTorch to
      + build custom models,
      + set up an NLP text-classification problem,
      + train, and
      + validate
  + Get introduced to the Hugging Face library


## Mount Google Drive (if running on Google Colab)

In [1]:
from google.colab import drive 
drive.mount('/content/drive/')
data = '/content/drive/My\ Drive/nlp/nlp_colab/'
% cd {data}

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/
/content/drive/My Drive/nlp/nlp_colab


In [13]:
%load_ext tensorboard
%load_ext autoreload
%autoreload 2
import io, random, os
import numpy as np
import pandas as pd
from pathlib import Path

# Input data and train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn import metrics 


# Custom models 
from nlp_models import *
from nlp_train import *

# Tensorboard for PyTorch
from torch.utils.tensorboard import SummaryWriter

# Preprocessing imports
import spacy
from keras.preprocessing import text, sequence

## Transformers-related imports
from transformers import pipeline
from transformers import AdamW
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer
from transformers import Trainer, TrainingArguments

#import fasttext as ft

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Set random seeds for reproducibility

In [5]:
def seed_init(seed=31415):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomisation of interpreters started after this
    # point (e.g. worker subprocesses); the running kernel is unaffected.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs; disable it so the
    # deterministic flag above is not undermined.
    torch.backends.cudnn.benchmark = False


seed_init()

## Fetch the 20 newsgroups data using the scikit-learn API

In [41]:
# Five of the twenty newsgroups are used as target classes.
categories = ['sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian']
newsgroups_all = fetch_20newsgroups(subset='all',
                                    remove=('headers', 'footers', 'quotes'),
                                    categories=categories)

# Explicit seed so the splits do not depend on whatever state the global
# NumPy RNG happens to be in when this cell is (re-)run.
SPLIT_SEED = 31415

# 80 % train; the remaining 20 % is halved into validation and test below.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    newsgroups_all.data, newsgroups_all.target,
    test_size=0.2, stratify=newsgroups_all.target, random_state=SPLIT_SEED)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout,
    test_size=0.5, stratify=y_holdout, random_state=SPLIT_SEED)

## Set Basic Parameters

In [38]:
# --- Model dimensions ---
emb_sz = 300       # length of each embedding vector
hd_sz = 300        # number of hidden units
output_sz = 5      # one output per class

# --- Input length ---
Nwords = 128       # tokens per input
seq_ln = Nwords    # alias for the token count above

# --- Locations ---
data_path = Path('../')  # data folder

> # Tokenize and Create Embedding

In [42]:
## Keras preprocessing
# Tokenize and create the train, valid and test datasets

# NOTE(review): the vocabulary is fitted on train + valid + test together.
tokenizer = text.Tokenizer(num_words=60000)
tokenizer.fit_on_texts(list(X_train) + list(X_valid) + list(X_test))
# NOTE(review): Keras word indices start at 1, so position i in this list
# presumably holds the word with index i + 1 -- confirm against the embedding.
all_words = list(tokenizer.index_word.values())


def _to_padded_ids(texts):
    """Turn raw texts into fixed-length id sequences (post-padded to 200)."""
    # NOTE(review): maxlen is 200 here while Nwords / seq_ln is 128 -- confirm
    # which length the models expect.
    id_seqs = tokenizer.texts_to_sequences(texts)
    return sequence.pad_sequences(id_seqs, padding='post', maxlen=200)


xtrain = _to_padded_ids(X_train)
xvalid = _to_padded_ids(X_valid)
xtest = _to_padded_ids(X_test)

# For RNNs
train_data = mydataset(xtrain, y_train)
valid = mydataset(xvalid, y_valid)
test = mydataset(xtest, y_test)

# For transformer models

## Free Memory
#del X_valid
#del X_test
#del X_train
#del y_train
#del xtrain

In [9]:
# spaCy processing
# Use this cell if you prefer the spaCy tokenizer over the Keras one

#tokenizer = Tokenizer()
#spc = SpacyTokenizer('en')
#wiki_words = pickle.load(open(data_path / 'itos_wt103.pkl','rb'))
#wiki_vocab = Vocab.create([wiki_words], max_vocab=60000, min_freq=1)

#def pipeline(xin, yin):
#    return mydataset([pad_zeros(wiki_vocab.numericalize(i)) for i in tokenizer.process_all(xin)], yin)

#token_train = tokenizer.process_all(X_train)

#def pad_zeros(inp, max_len=Nwords):
#    ''' pad zeros if the len(input) < max_len'''
#    if len(inp)>=max_len:
#        return inp[:max_len]
#    else:
#        return inp+[0]*(max_len-len(inp))
    
#xtrain = [pad_zeros(wiki_vocab.numericalize(i)) for i in token_train]
#word_to_ix = {word: i for i, word in enumerate(wiki_vocab.itos)}

#valid =pipeline(X_valid, y_valid)
#test = pipeline(X_test, y_test)
#train_data = mydataset(xtrain, y_train)

In [10]:
## Create Embedding vectors and dump to save time in the future

def create_emb(vecs, itos, em_sz=300, mult=1., padding_idx=1):
    """Build an ``nn.Embedding`` whose rows are filled from pretrained vectors.

    Row ``i`` of the embedding is set to the pretrained vector of ``itos[i]``
    when that word is listed by ``vecs.get_words()``. Rows for all other
    words keep the values ``nn.Embedding`` initialised them with.

    Args:
        vecs: Object exposing ``get_words()`` and ``get_word_vector(word)``
            (presumably a fastText model -- confirm against the caller).
        itos: Sequence of words; position ``i`` maps to embedding row ``i``.
        em_sz: Embedding dimension; must match the length of the vectors.
        mult: Unused; kept so existing callers keep working.
        padding_idx: Forwarded to ``nn.Embedding``. Defaults to 1, the value
            that was previously hard-coded.
            NOTE(review): Keras ``pad_sequences`` pads with 0 -- confirm that
            1 is the intended padding row for the Keras vocabulary.

    Returns:
        The initialised ``nn.Embedding``.

    Raises:
        RuntimeError: If a pretrained vector cannot be copied into a row,
            e.g. because its length differs from ``em_sz``. This used to be
            swallowed silently by a bare ``except``.
    """
    emb = nn.Embedding(len(itos), em_sz, padding_idx=padding_idx)
    wgts = emb.weight.data
    # Membership set instead of a dict holding every vector: only vectors for
    # words actually present in `itos` are fetched.
    known_words = set(vecs.get_words())
    for i, w in enumerate(itos):
        if w in known_words:
            wgts[i] = tensor(vecs.get_word_vector(w))
    return emb

#en_vecs = ft.load_model(str(('../cc.en.300.bin')))
#vocab_sz = len(wiki_vocab.itos)
#emb_enc = create_emb(en_vecs, wiki_vocab.itos)
#emb_enc = torch.load(data_path/'en_emb.pth')

#emb_enc = create_emb(en_vecs, all_words)
#torch.save(emb_enc, data_path/'en_emb_keras.pth')

# Load the embedding saved at data_path/'en_emb_keras.pth' instead of
# rebuilding it from the pretrained vectors.
# NOTE(review): torch.load unpickles the file -- only load files you created.
emb_file = data_path / 'en_emb_keras.pth'
emb_enc = torch.load(emb_file)
vocab_sz = len(all_words)

# Train

In [11]:
# Settings passed to SimpleTrain.train for the RNN, GRU and LSTM runs.
# NOTE(review): presumably lr = learning rate, bs = batch size,
# wd = weight decay -- confirm in nlp_train.
default_config = {
    'lr': 0.01,
    'epochs': 8,
    'bs': 64,
    'wd': 0.001,
}

## Vanilla RNN
----

In [14]:
# Two-stack vanilla RNN on top of the pretrained embedding.
# NOTE(review): the positional 64 is presumably the hidden size (hd_sz is
# defined as 300 but not used here) -- confirm against myArch.
rnn = myArch(emb_enc, seq_ln, 64, stacks=2, CHOICE='Vanilla RNN')

# Named per model rather than `train`, which is later rebound by
# `def train(epoch)` in the Transformer section.
rnn_trainer = SimpleTrain(rnn, nn.CrossEntropyLoss())
rnn_trainer.train(train_data, valid, default_config, log=True, metrics=['acc'], sch='cos')

----------------------------------------------------
Epoch:	 Train_loss 	 Valid_loss	acc	
0	 1.7284 	 1.5644 	 0.2707	0.3445	
---------------------------------------------
1	 1.4494 	 1.4898 	 0.3717	0.4438	
---------------------------------------------
2	 1.3411 	 1.3666 	 0.4545	0.5534	
---------------------------------------------
3	 1.3359 	 1.2667 	 0.5071	0.6380	
---------------------------------------------
4	 1.1551 	 1.2172 	 0.5111	0.6550	
---------------------------------------------
5	 1.0894 	 1.1843 	 0.5253	0.6865	
---------------------------------------------
6	 1.1317 	 1.1701 	 0.5333	0.6923	
---------------------------------------------
7	 1.1569 	 1.1662 	 0.5313	0.6926	
---------------------------------------------


## Custom GRU

In [15]:
# Two-stack custom GRU on top of the pretrained embedding.
# NOTE(review): the positional 64 is presumably the hidden size -- confirm
# against myArch.
gru = myArch(emb_enc, seq_ln, 64, stacks=2, CHOICE='GRU')

# Named per model rather than `train`, which is later rebound by
# `def train(epoch)` in the Transformer section.
gru_trainer = SimpleTrain(gru, nn.CrossEntropyLoss())
gru_trainer.train(train_data, valid, default_config, log=True, metrics=['acc'], sch='cos')

----------------------------------------------------
Epoch:	 Train_loss 	 Valid_loss	acc	
0	 1.2146 	 1.2913 	 0.5010	0.5845	
---------------------------------------------
1	 1.1247 	 0.9621 	 0.6323	0.7259	
---------------------------------------------
2	 0.7957 	 0.8037 	 0.6970	0.8300	
---------------------------------------------
3	 0.6630 	 0.7436 	 0.7192	0.8689	
---------------------------------------------
4	 0.8300 	 0.7169 	 0.7030	0.9000	
---------------------------------------------
5	 0.5184 	 0.6966 	 0.7131	0.9177	
---------------------------------------------
6	 0.4324 	 0.6769 	 0.7313	0.9315	
---------------------------------------------
7	 0.5519 	 0.6759 	 0.7354	0.9356	
---------------------------------------------


## PyTorch LSTM

In [16]:
# Two-stack LSTM on top of the pretrained embedding.
# NOTE(review): the positional 64 is presumably the hidden size -- confirm
# against myArch.
lstm = myArch(emb_enc, seq_ln, 64, stacks=2, CHOICE='LSTM')

# Named per model rather than `train`, which is later rebound by
# `def train(epoch)` in the Transformer section.
lstm_trainer = SimpleTrain(lstm, nn.CrossEntropyLoss())
lstm_trainer.train(train_data, valid, default_config, log=True, metrics=['acc'], sch='cos')

----------------------------------------------------
Epoch:	 Train_loss 	 Valid_loss	acc	
0	 1.3185 	 1.3345 	 0.4020	0.4153	
---------------------------------------------
1	 0.8590 	 1.0731 	 0.5354	0.5671	
---------------------------------------------
2	 0.4810 	 0.9017 	 0.6505	0.7479	
---------------------------------------------
3	 0.6107 	 0.9655 	 0.5919	0.7035	
---------------------------------------------
4	 0.3301 	 0.8417 	 0.6606	0.7987	
---------------------------------------------
5	 0.4272 	 0.8383 	 0.6727	0.8138	
---------------------------------------------
6	 0.2172 	 0.8587 	 0.6687	0.8184	
---------------------------------------------
7	 0.2866 	 0.9046 	 0.6586	0.8088	
---------------------------------------------


## Transformer Model

In [34]:
## Hyperparameters for the Transformer (BERT) run
# The tokenizer and model themselves are created in the following cells.
MAX_LEN = 128            # tokens per example
TRAIN_BATCH_SIZE = 8     # batch size of the training loader
VALID_BATCH_SIZE = 4     # batch size of the evaluation loader
EPOCHS = 2               # number of training epochs
LEARNING_RATE = 1e-5     # optimizer step size

In [43]:
## Build Dataset compatible with Huggingface API
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not benefit from shuffling; keep the order stable so
# predictions line up with the dataset order.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }
model_name = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(model_name)

# MAX_LEN replaces the literal 128 that duplicated the constant above.
# NOTE(review): the fourth argument is presumably the maximum token length --
# confirm against CustomDataset.
training_set = CustomDataset(X_train, y_train, bert_tokenizer, MAX_LEN)
testing_set = CustomDataset(X_test, y_test, bert_tokenizer, MAX_LEN)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [28]:
## Instantiate the BERT classifier and move it to the GPU
# NOTE(review): BertModel is called without arguments, so this is presumably
# the project's own class from nlp_models, not transformers.BertModel -- confirm.
model = BertModel()
# The device is hard-coded; this cell needs a CUDA runtime.
model.to('cuda')
print('Model Initiated')

Model Initiated


In [44]:
# NOTE(review): leftover inspection -- this only displays the bound method;
# nothing is called or assigned, so the cell has no effect on later cells.
bert_tokenizer.encode_plus

<bound method PreTrainedTokenizerBase.encode_plus of <transformers.tokenization_bert.BertTokenizer object at 0x7fd5a4b44cc0>>

## Train

In [46]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# Built once instead of on every batch.
loss_fn = nn.CrossEntropyLoss()


def train(epoch):
    """Run one pass over ``training_loader`` and update ``model`` in place.

    Prints the batch loss every 100 steps.

    Args:
        epoch: Epoch number, used only in the progress message.
    """
    # Set training mode here rather than once at cell level, so that calling
    # train() again after validation (which switches to eval mode) still
    # trains with dropout etc. enabled.
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to('cuda', dtype=torch.long)
        mask = data['mask'].to('cuda', dtype=torch.long)
        token_type_ids = data['token_type_ids'].to('cuda', dtype=torch.long)
        targets = data['targets'].to('cuda', dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 100 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss: 1.6964058876037598
Epoch: 0, Loss: 1.0832419395446777
Epoch: 0, Loss: 0.6188214421272278
Epoch: 0, Loss: 0.467561811208725
Epoch: 0, Loss: 0.5233833193778992
Epoch: 1, Loss: 0.26846471428871155
Epoch: 1, Loss: 0.759685218334198
Epoch: 1, Loss: 0.06976211071014404
Epoch: 1, Loss: 0.2792321741580963
Epoch: 1, Loss: 0.07832983136177063


## Validate the Model

In [47]:
device = 'cuda'


def validation(epoch):
    """Run ``model`` over ``testing_loader`` without gradient tracking.

    Args:
        epoch: Unused; kept so existing callers keep working.

    Returns:
        A ``(fin_outputs, fin_targets)`` tuple of Python lists:
        ``fin_outputs`` holds one list of per-class probabilities per example,
        ``fin_targets`` the matching integer class labels.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            # The loader already yields a tensor; wrapping it in torch.tensor()
            # copies it and emits a UserWarning. Class labels stay integers.
            targets = batch['targets'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().numpy().tolist())
            # Single-label multi-class (trained with CrossEntropyLoss): softmax
            # over the class axis gives proper probabilities. The argmax is the
            # same as with the previous element-wise sigmoid.
            fin_outputs.extend(torch.softmax(outputs, dim=1).cpu().numpy().tolist())
    return fin_outputs, fin_targets


outputs, targets = validation(0)
print(len(outputs), len(targets))

arg_outs = np.argmax(outputs, axis=1)
accuracy = metrics.accuracy_score(targets, arg_outs)

f1_score_micro = metrics.f1_score(targets, arg_outs, average='micro')
f1_score_macro = metrics.f1_score(targets, arg_outs, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")


  # This is added back by InteractiveShellApp.init_path()


495 495
Accuracy Score = 0.8747474747474747
F1 Score (Micro) = 0.8747474747474747
F1 Score (Macro) = 0.8756885392819788
