In [1]:
%matplotlib inline
from fastai import *
from fastai.text import *
from fastai.callbacks import *

In [2]:
import numpy as np
from pathlib import Path
import os
import torch.optim as optim
import random

# transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

# our models of choice
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

In [3]:
import fastai
import transformers
print('fastai version :', fastai.__version__)
print('transformers version :', transformers.__version__)

fastai version : 1.0.59
transformers version : 2.2.2


### Transformers model Zoo
Create a dictionary of parameters required for creating different model classes

In [4]:
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)
}

### Make model selection here

In [5]:
# Parameters
seed = 42
use_fp16 = False
bs = 16

model_type = 'roberta'
pretrained_model_name = 'roberta-base'

# model_type = 'bert'
# pretrained_model_name='bert-base-uncased'

# model_type = 'distilbert'
# pretrained_model_name = 'distilbert-base-uncased'

#model_type = 'xlm'
#pretrained_model_name = 'xlm-clm-enfr-1024'

#model_type = 'xlnet'
#pretrained_model_name = 'xlnet-base-cased'

In [6]:
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [7]:
model_class.pretrained_model_archive_map.keys()

dict_keys(['roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', 'roberta-large-openai-detector'])

### Use the same randomization seed so as to compare different models more easily

In [8]:
def seed_all(seed_value):
    random.seed(seed_value) # Python
    np.random.seed(seed_value) # cpu vars
    torch.manual_seed(seed_value) # cpu  vars
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False
        
seed_all(seed)

### Data pre-processing
The important thing here is that `FastAI` uses **processors** to perform repeatetive tasks when creating `DataBunch`. A set of default **processors** are performed for fastai.textlearners. For example:
  

In [None]:
# FastAI use various processors to perform repeatative tasks in data pipeline

def _get_processor(tokenizer:Tokenizer=None, vocab:Vocab=None, chunksize:int=10000, max_vocab:int=60000,
                   min_freq:int=2, mark_fields:bool=False, include_bos:bool=True, include_eos:bool=False):
    return [TokenizeProcessor(tokenizer=tokenizer, chunksize=chunksize, 
                              mark_fields=mark_fields, include_bos=include_bos, include_eos=include_eos),
            NumericalizeProcessor(vocab=vocab, max_vocab=max_vocab, min_freq=min_freq)]

class NumericalizeProcessor(PreProcessor):
    "`PreProcessor` that numericalizes the tokens in `ds`."
    def __init__(self, ds:ItemList=None, vocab:Vocab=None, max_vocab:int=60000, min_freq:int=3):
        vocab = ifnone(vocab, ds.vocab if ds is not None else None)
        self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq

    def process_one(self,item): return np.array(self.vocab.numericalize(item), dtype=np.int64)
    def process(self, ds):
        if self.vocab is None: self.vocab = Vocab.create(ds.items, self.max_vocab, self.min_freq)
        ds.vocab = self.vocab
        super().process(ds)

#### We need to overwrite the "tokenizer" and "numericalizer" in order to tailor the databunch creating process to the `Transformers`
Later when creating `DataBunch`, we are going to pass in our customized processors

## Tokenizer
Step 1: create a new tokenizer that inherit from `fastai`'s `BaseTokenizer`

In [9]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.max_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs): 
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and add the spesial tokens"""
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        if self.model_type in ['roberta']:
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:self.max_seq_len - 2]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
        return [CLS] + tokens + [SEP]

Step 2: Get the 'pre-trained tokenizer' from `transformer` library.(this is used for initialization of the class we created above)

In [10]:
transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)

Step 3: Create the actual **tokenizer** for our transformer of choice and put a `FastAI` wrapper on

In [11]:
Tokenizer

fastai.text.transform.Tokenizer

In [12]:
model_type

'roberta'

In [13]:
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

In this implementation, be carefull about 3 things :

1. As we are not using RNN, we have to limit the sequence length to the model input size.
2. Most of the models require special tokens placed at the beginning and end of the sequences.
3. Some models like RoBERTa require a space to start the input string. For those models, the encoding methods should be called with add_prefix_space set to True.

Below, you can find the resume of each pre-process requirement for the 5 model types used in this tutorial. You can also find this information on the [HuggingFace documentation](https://huggingface.co/transformers/) in each model section.<br>
`bert:       [CLS] + tokens + [SEP] + padding`<br>
`roberta:    [CLS] + prefix_space + tokens + [SEP] + padding`<br>
`distilbert: [CLS] + tokens + [SEP] + padding`<br>
`xlm:        [CLS] + tokens + [SEP] + padding`<br>
`xlnet:      padding + [CLS] + tokens + [SEP]`

It is worth noting that we don't add padding in this part of the implementation.  As we will see later, fastai manage it automatically during the creation of the `DataBunch`.

## Numericalizer
In `fastai` library, `NumericalizeProcessor` object takes as `bocab` argument a `Vocab` object. Here we will create a new class `TransformerVocab` that inherits from `Vocab` and overwrite `numericalize` and `textify` functions.

In [14]:
class TransformersVocab(Vocab):
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super(TransformersVocab, self).__init__(itos = [])
        self.tokenizer = tokenizer
    
    def numericalize(self, t:Collection[str]) -> List[int]:
        "Convert a list of tokens `t` to their ids."
        return self.tokenizer.convert_tokens_to_ids(t)
        #return self.tokenizer.encode(t)

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Convert a list of `nums` to their tokens."
        nums = np.array(nums).tolist()
        return sep.join(self.tokenizer.convert_ids_to_tokens(nums)) if sep is not None else self.tokenizer.convert_ids_to_tokens(nums)

### Customize numericalize processor
Notice taht we need to pass the `include_bs=False` and `include_eos=False` options. This is because `fastiai` adds its own special tokens by default which interferes with the `[CLS]` and `[SEP]` tokens that are required for RoBERTa

In [15]:
NumericalizeProcessor

fastai.text.data.NumericalizeProcessor

In [16]:
# use RoBERTa tokenizer to initialize the `TransformersVocab
transformer_vocab = TransformersVocab(tokenizer=transformer_tokenizer)

# create a customized numericalizor using vocab just created
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)


## Put together our customized processor

In [17]:
TokenizeProcessor

fastai.text.data.TokenizeProcessor

In [18]:
# wrapper function over the tokenizer we created above
tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)


In [43]:
# finally put the two processors in a list for later use
transformer_processor = [OpenFileProcessor(),tokenize_processor, numericalize_processor]

# Create DataBunch
For the DataBunch creation, you have to pay attention to set the processor argument to our new custom processor `transformer_processor` and manage correctly the padding.

As mentioned in the [HuggingFace documentation](https://huggingface.co/transformers/), BERT, RoBERTa, XLM and DistilBERT are models with absolute position embeddings, so it's usually advised to pad the inputs on the right rather than the left. Regarding XLNET, it is a model with relative position embeddings, therefore, you can either pad the inputs on the right or on the left.

In [44]:
# If we are using Google XLNet
pad_first = bool(model_type in ['xlnet'])
pad_idx = transformer_tokenizer.pad_token_id

In [45]:
tokens = transformer_tokenizer.tokenize('the cat didnt jump onto the table, because its tired')
print(tokens)
ids = transformer_tokenizer.convert_tokens_to_ids(tokens)
print(ids)
transformer_tokenizer.convert_ids_to_tokens(ids)

['the', 'Ġcat', 'Ġdidnt', 'Ġjump', 'Ġonto', 'Ġthe', 'Ġtable', ',', 'Ġbecause', 'Ġits', 'Ġtired']
[627, 4758, 46405, 3704, 2500, 5, 2103, 6, 142, 63, 7428]


['the',
 'Ġcat',
 'Ġdidnt',
 'Ġjump',
 'Ġonto',
 'Ġthe',
 'Ġtable',
 ',',
 'Ġbecause',
 'Ġits',
 'Ġtired']

## Prepare the full IMDB dataset and inspect the files under that path

In [46]:
path = untar_data(URLs.IMDB)
path

PosixPath('/home/projectx/.fastai/data/imdb')

In [47]:
path.ls()

[PosixPath('/home/projectx/.fastai/data/imdb/imdb.vocab'),
 PosixPath('/home/projectx/.fastai/data/imdb/models'),
 PosixPath('/home/projectx/.fastai/data/imdb/lm_databunch_RoBERTa'),
 PosixPath('/home/projectx/.fastai/data/imdb/test'),
 PosixPath('/home/projectx/.fastai/data/imdb/imdb_textlist_classifier'),
 PosixPath('/home/projectx/.fastai/data/imdb/tmp_lm'),
 PosixPath('/home/projectx/.fastai/data/imdb/train'),
 PosixPath('/home/projectx/.fastai/data/imdb/lm_databunch'),
 PosixPath('/home/projectx/.fastai/data/imdb/tmp_clas'),
 PosixPath('/home/projectx/.fastai/data/imdb/README'),
 PosixPath('/home/projectx/.fastai/data/imdb/unsup')]

In [48]:
(path/'train').ls()

[PosixPath('/home/projectx/.fastai/data/imdb/train/unsupBow.feat'),
 PosixPath('/home/projectx/.fastai/data/imdb/train/pos'),
 PosixPath('/home/projectx/.fastai/data/imdb/train/labeledBow.feat'),
 PosixPath('/home/projectx/.fastai/data/imdb/train/neg')]

In [49]:
(path/'test').ls()

[PosixPath('/home/projectx/.fastai/data/imdb/test/pos'),
 PosixPath('/home/projectx/.fastai/data/imdb/test/labeledBow.feat'),
 PosixPath('/home/projectx/.fastai/data/imdb/test/neg')]

# Create DataBunch for classification

In [50]:
transformer_vocab

<__main__.TransformersVocab at 0x7f16291422e8>

In [51]:
# we can play around with the batch_size as long as the GPU can take it
bs = 48
data_clas = (TextList.from_folder(path, vocab=transformer_vocab, processor=transformer_processor)                         # specify the path
           .filter_by_folder(include=['train','test'])# exclude other folders
           .split_by_folder(valid='test')                 #split by train and valid folder (that only keeps 'train' and 'test' so no need to filter)
           .label_from_folder(classes=['neg', 'pos'])                    #label them all with their folders
           .databunch(bs=bs))                                 # convert to databunch for the learner later

In [54]:
print('[CLS] token :', transformer_tokenizer.cls_token)
print('[SEP] token :', transformer_tokenizer.sep_token)
print('[PAD] token :', transformer_tokenizer.pad_token)
data_clas.show_batch()

[CLS] token : <s>
[SEP] token : </s>
[PAD] token : <pad>


text,target
"<s> ĠA Ġgroup Ġof Ġmodel - caliber ĠSan ĠFrancisco Ġwomen Ġwho Ġhave Ġbeen Ġfriends Ġsince Ġelementary Ġschool Ġare Ġsuddenly Ġbeing Ġthreatened Ġand Ġattacked Ġby Ġsomeone Ġsending Ġthem Ġbizarre ĠValentine 's ĠDay Ġcards . ĠWho Ġis Ġthe Ġkiller Ġand Ġwhy Ġis Ġthe Ġkiller Ġafter Ġthem ? Ġ< br Ġ/ >< br Ġ/> My Ġrating Ġwill Ġoften Ġchange Ġon Ġsubsequent Ġview ings Ġof Ġa Ġfilm -- sometimes Ġslightly Ġup , Ġsometimes Ġslightly",pos
"<s> ĠPerspective Ġis Ġa Ġgood Ġthing . ĠSince Ġthe Ġrelease Ġof Ġ"" Star ĠWars ĠEpisode ĠI : ĠThe ĠPhantom ĠMen ace "", Ġclaims Ġand Ġcounter - claim s Ġof Ġjust Ġhow ĠEpisode 's ĠII Ġand ĠIII Ġwill Ġevent uate Ġhas Ġtaken Ġthe Ġspotlight Ġoff Ġthe Ġ' original ' ĠStar ĠWars Ġfilms , Ġmaking Ġthem Ġpart Ġof Ġa Ġcohesive Ġwhole , Ġrather Ġthan Ġsegreg ating Ġthe Ġolder Ġand Ġnew Ġfilms Ġinto",pos
"<s> ĠBo ĠDerek Ġmight Ġhave Ġhad Ġa Ġcareer Ġhad Ġshe Ġnot Ġlet Ġher Ġlate Ġhusband , ĠJohn , Ġtake Ġover Ġas Ġher Ġdirector . ĠIt 's Ġa Ġreal Ġshame , Ġno Ġreally , Ġwith Ġthe Ġright Ġdirection Ġand Ġthe Ġright Ġpart Ġ( see Ġ"" 10 ""), ĠBo Ġwas Ġokay . ĠShe Ġwouldn 't Ġwin Ġany Ġawards Ġeven Ġat Ġher Ġbest , Ġbut Ġshe Ġis Ġno Ġworse Ġthan Ġmany Ġan Ġactress",neg
"<s> Ġ"" Fox es "" Ġis Ġa Ġserious Ġlook Ġat Ġthe Ġconsequences Ġof Ġgrowing Ġup Ġtoo Ġfast Ġin Ġthe Ġ1980 s . ĠAnd Ġunlike Ġthe Ġteen Ġsex Ġcomed ies Ġthat Ġovershadowed Ġit Ġ( P ork y 's , ĠFast ĠTimes Ġat ĠRid gement ĠHigh ), Ġthe Ġmovie Ġholds Ġup Ġwell Ġagainst Ġtime .< br Ġ/ >< br Ġ/> Its Ġtheme Ġof Ġteen Ġangst Ġis Ġas Ġrelevant Ġtoday Ġas Ġit Ġwas",pos
"<s> ĠSp oil ers ? ĠMaybe Ġa Ġfew Ġdetails , Ġbut Ġnothing Ġtoo Ġplot Ġrelated . ĠNot Ġlike Ġit Ġwould Ġmatter Ġwith Ġthis Ġmovie . Ġ< br Ġ/ >< br Ġ/> Air ĠRage Ġblatantly Ġr ips Ġoff Ġthe Ġmid - air Ġinfiltration Ġpremise Ġof ĠExecutive ĠDecision . ĠIce - T Ġleads Ġa Ġteam Ġof Ġfour Ġ"" el ite "" Ġcommand os Ġwho Ġwear Ġbag gy Ġblack Ġshirts Ġthat Ġwe Ġcan",neg


Check numericalization

In [53]:
print('[CLS] id :', transformer_tokenizer.cls_token_id)
print('[SEP] id :', transformer_tokenizer.sep_token_id)
print('[PAD] id :', pad_idx)
test_one_batch = data_clas.one_batch()[0]
print('Batch shape : ',test_one_batch.shape)
print(test_one_batch)

[CLS] id : 0
[SEP] id : 2
[PAD] id : 1
Batch shape :  torch.Size([48, 512])
tensor([[    0,    38,   218,  ...,    18,  1086,     2],
        [    0,   152,    16,  ...,  4275,    31,     2],
        [    0, 12897,   763,  ...,   463,    14,     2],
        ...,
        [    0, 16774,    35,  ...,    39,   865,     2],
        [    0,  6918, 29273,  ...,    42,  1569,     2],
        [    0,   590,     5,  ...,     4,   407,     2]])


In [55]:
# defining our model architecture 
class CustomTransformerModel(nn.Module):
    def __init__(self, transformer_model: PreTrainedModel):
        super(CustomTransformerModel,self).__init__()
        self.transformer = transformer_model
        
    def forward(self, input_ids, attention_mask=None):
        
        #attention_mask = (input_ids!=1).type(input_ids.type()) # Test attention_mask for RoBERTa
        
        logits = self.transformer(input_ids,
                                attention_mask = attention_mask)[0]   
        return logits

In [56]:
config = config_class.from_pretrained(pretrained_model_name)
config.num_labels = 2
config.use_bfloat16 = use_fp16
print(config)

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "use_bfloat16": false,
  "vocab_size": 50265
}



In [57]:
transformer_model = model_class.from_pretrained(pretrained_model_name, config = config)
# transformer_model = model_class.from_pretrained(pretrained_model_name, num_labels = 5)

custom_transformer_model = CustomTransformerModel(transformer_model = transformer_model)

## Load the encoder's pre-trained weights from the language model training

In [61]:
name = 'learn_lm_encoder_IMDB'
custom_transformer_model.transformer.roberta.load_state_dict(torch.load(path/'models'/f'{name}.pth', map_location=None))

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location='cpu' to map your storages to the CPU.

In [None]:
from fastai.callbacks import *
from transformers import AdamW
from functools import partial

CustomAdamW = partial(AdamW, correct_bias=False)

learner = Learner(data_clas, 
                  custom_transformer_model, 
                  opt_func = CustomAdamW, 
                  metrics=[accuracy, error_rate])

# Show graph of learner stats and metrics after each epoch.
learner.callbacks.append(ShowGraph(learner))

# Put learn in FP16 precision mode. --> Seems to not working
if use_fp16: learner = learner.to_fp16()

In [None]:
print(learner.model)

In [None]:
# For roberta-base
list_layers = [learner.model.transformer.roberta.embeddings,
              learner.model.transformer.roberta.encoder.layer[0],
              learner.model.transformer.roberta.encoder.layer[1],
              learner.model.transformer.roberta.encoder.layer[2],
              learner.model.transformer.roberta.encoder.layer[3],
              learner.model.transformer.roberta.encoder.layer[4],
              learner.model.transformer.roberta.encoder.layer[5],
              learner.model.transformer.roberta.encoder.layer[6],
              learner.model.transformer.roberta.encoder.layer[7],
              learner.model.transformer.roberta.encoder.layer[8],
              learner.model.transformer.roberta.encoder.layer[9],
              learner.model.transformer.roberta.encoder.layer[10],
              learner.model.transformer.roberta.encoder.layer[11],
              learner.model.transformer.roberta.pooler]

In [None]:
learner.split(list_layers)
num_groups = len(learner.layer_groups)
print('Learner split in',num_groups,'groups')
print(learner.layer_groups)

In [None]:
learner.save('untrained_classifier')

In [None]:
# learner.load('untrained_classifier')

In [None]:
learner.freeze_to(-1)
learner.lr_find()

In [None]:
learner.recorder.plot(skip_end = 10, suggestion=True)

In [None]:
learner.fit_one_cycle(5, 5e-4,moms=(0.8,0.7))

## Data for LM training
The reviews are in a training and test folder. In addition, we have a 'unsup' folder that contains many reviews are 
are **not labelled**.<br>
In the second training phase of ULMFiT, as mentioned in the [README](https://github.com/Sylar257/Sentiment-Analysis/blob/master/README.md)
, we will be fine-tuning the language model to the domain-specific corpus.<br>
Thus, training could take advantage of using 'train', 'test', and 'unsup' folder.

### Create a DataBunch for language model
What does that mean?<br>
Basically we are creating a databunch for our learner later in a way that we ignore the true label('negataive' or  'positive'). Hence, the task we constructed is as simple as to just predict the **next word** without worrying about what the sentiment.

In [None]:
# we can play around with the batch_size as long as the GPU can take it
bs = 48
data_lm = (TextList.from_folder(path, processor=transformer_processor)                         # specify the path
           .filter_by_folder(include=['train','test','unsup'])# exclude other folders
           .split_by_rand_pct(0.1, seed=seed)                 # randomly split and keep 10% for validation set
           .label_for_lm()                                    # label as to "predict the next word token"
           .databunch(bs=bs))                                 # convert to databunch for the learner later

In [None]:
len(data_lm.train_ds), len(data_lm.valid_ds)

In [None]:
# take a peek at a small sample of data
data_lm.show_batch()

In [None]:
# Let's save our databunch so that we don't have to re-run this the next time
data_lm.save('lm_databunch_RoBERTa')

In [None]:
# # load the databunch we created ealier
# bs = 48
# data_lm = load_data(path, 'lm_databunch_RoBERTa', bs=bs)

In [None]:
# a sanity check for <s>; </s>; <pad>
print('[CLS] token :', transformer_tokenizer.cls_token)
print('[SEP] token :', transformer_tokenizer.sep_token)
print('[PAD] token :', transformer_tokenizer.pad_token)
data_lm.show_batch()

In [None]:
print('[CLS] id :', transformer_tokenizer.cls_token_id)
print('[SEP] id :', transformer_tokenizer.sep_token_id)
print('[PAD] id :', pad_idx)
test_one_batch = data_lm.one_batch()[0]
print('Batch shape : ',test_one_batch.shape)
print(test_one_batch)

## Customize model
As mentioned [here](https://github.com/huggingface/transformers#models-always-output-tuples), every model's forward method always outputs a tuple with various elements depending on the model and the configuration parameters. In our case, we are interested to access only to the logits.  One way to access them is to create a custom model.

In [None]:
# defining our model architecture 
class CustomTransformerModel_Encoder(nn.Module):
    def __init__(self, transformer_model: PreTrainedModel):
        super(CustomTransformerModel_Encoder,self).__init__()
        self.transformer = transformer_model.roberta
        
    def forward(self, input_ids, attention_mask=None):
        
        #attention_mask = (input_ids!=1).type(input_ids.type()) # Test attention_mask for RoBERTa
        
        logits = self.transformer(input_ids,
                                attention_mask = attention_mask)[0]   
        return ([logits],[logits])
    # this function is added because fastai `learner.lr_find()` will call it
    def reset(self):
        pass

Download the pretrained weights for RoBERTa

In [None]:
transformer_tokenizer.vocab_size

In [None]:
# this is the configuration of language model
config = config_class.from_pretrained(pretrained_model_name)
config.num_labels = transformer_tokenizer.vocab_size
config.use_bfloat16 = use_fp16
print(config)

In [None]:
transformer_model = model_class.from_pretrained(pretrained_model_name, config = config)
# transformer_model = model_class.from_pretrained(pretrained_model_name, num_labels = 5)

custom_transformer_model = CustomTransformerModel_Encoder(transformer_model = transformer_model)

In [None]:
from fastai.callbacks import *
from transformers import AdamW
from functools import partial

CustomAdamW = partial(AdamW, correct_bias=False)

In [None]:
custom_transformer_model

In [None]:
def split(model, split_on:SplitFuncOrIdxList)->None:
    "Split the model at `split_on`."
    if isinstance(split_on,Callable): split_on = split_on(self.model)
    layer_groups = split_model(model, split_on)
    return layer_groups

In [None]:
# For roberta-base
list_layers = [custom_transformer_model.transformer.embeddings]

In [None]:
encoder = split(custom_transformer_model,list_layers)

In [None]:
encoder

In [None]:
enc = encoder[0][0]
enc

In [None]:
# Re-construct decoder using FastAI LinearDecoder

decoder = LinearDecoder(transformer_tokenizer.vocab_size, 
                        n_hid=768, 
                        output_p=0.1,
                        tie_encoder=enc,
                        bias=True
                       )

In [None]:
decoder

In [None]:
model = SequentialRNN(custom_transformer_model, decoder)
model

In [None]:
doc(split)

In [None]:
learn_lm = LanguageLearner(data_lm, 
                           model,
#                            split_func=tfmer_lm_split,
                           opt_func = CustomAdamW)
learn_lm.callbacks.append(ShowGraph(learn_lm))

# Put learn in FP16 precision mode. --> Seems to not working
# if use_fp16: learn_lm = learn_lm.to_fp16()

In [None]:
learn_lm.model[1]

In [None]:
# For roberta-language-model
list_layers = [learn_lm.model[0],
              learn_lm.model[1]]

In [None]:
learn_lm.split(list_layers)
num_groups = len(learn_lm.layer_groups)
print('Learner split in',num_groups,'groups')
print(learn_lm.layer_groups)

### Now the model is split into two part Encoder and classifier which enables us to save just the encoder weights later

In [None]:
data_lm.one_batch()

In [None]:
torch.cuda.empty_cache()

In [None]:
# learn_lm.freeze()
learn_lm.lr_find()
learn_lm.recorder.plot(skip_end=10,suggestion=True)

In [None]:
lr = 1e-5
learn_lm.fit_one_cycle(5, lr, moms=(0.8, 0.7))

In [None]:
learn_lm.model

## Get the encoder of the language model and save it's encoder

In [None]:
encoder = get_model(learn_lm.model)[0].transformer

In [None]:
name = 'learn_lm_encoder_IMDB'
torch.save(encoder.state_dict(), learn_lm.path/learn_lm.model_dir/f'{name}.pth')

In [None]:
(path/'models').ls()

# Old ULMFiT code for reference

In [None]:
# the drop_mult decide the percentage of dropout to use in relation to the combination used in the original paper
learn_lm_AWD = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)

### Vocabulary
As we can see below, when we create `TextDataBunch` a  `vocab` property is created simultaneously. <br>
It's worth pointing out that, for `index-to-strings` and `strings-to-index` we have a different length. This is primarily because that there are some words that appear infrequently so that it's not effcient for them to occupy one token. What we do is to map all of these low frequency words to `xxunk`. We can ditermine the `min_freq` in the following function

In [None]:
vocab = data_lm.vocab

In [None]:
vocab.stoi["stingray"]

In [None]:
vocab.stoi["happy"]

In [None]:
vocab.itos[vocab.stoi['mamamia']]

In [None]:
len(learn_lm.model)

In [None]:
# ourmodel has two parts, the AWD_LSTM base architecture and the linear classifier
learn_lm.model

#### Comments on `drop_mult`
From the original [AWD_LSTM paper](https://arxiv.org/pdf/1708.02182.pdf) the authors applied several effective techniques.<br>

**DropConnect** was implemented in the architecture in that weight matrices are *dropped* before the *forward* and *backward* pass.<br>

**Variational dropout**. In standard dropout, a new binary dropout mask is sampled each and every time the dropout function is called. **Variational dropout** only samples a *dropout mask* upon the first call and then will repeatedly use that *locked dropout* mask for all repeated connections within the forward and backward pass.<br>

The values used for *dropout on the word vectors*, the *output between LSTM layers*, the *output of the final LSTM layer*, and *embedding dropout* were (0.4, 0.3, 0.4, 0.1), respectively.

### Encoder
Encoder is the first layer of the AWD_LSTM, which is also known as the `embedding layer`

In [None]:
enc = learn_lm.model[0].encoder
enc.weight.size()  # 400 is the embedding size

In [None]:
# recall that 60000 is the vocab size of our language model vocabulary
len(data_lm.vocab.itos)

## Before training
Let's try to use this model(only pre-trained with wiki-text) to generate fake movie reviews

In [None]:
TEXT = 'The color of the sky is'
N_WORDS = 40
N_SENTENCES = 2

In [None]:
# Note that the 'temperature' denote the randomness we adopt when pick next words
print("\n\n".join(learn_lm.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))

In [None]:
# If 'temperature'  set to super low, there will be almost no randomness
N_SENTENCES = 3
print("\n\n".join(learn_lm.predict(TEXT, N_WORDS, temperature=0.05) for _ in range(N_SENTENCES)))

In [None]:
# Change TEXT
TEXT = 'I hate this movie so much'
print("\n\n".join(learn_lm.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))

In [None]:
doc(LanguageLearner.predict)

### Fine-tuning a language model
We can use the `data_lm` object we created earlier to fine-tune a pretrained language model. 
Here we will be using a pre-trained `AWD-LSTM` architecture theat is available in FastAI.

In [None]:
learn_lm.lr_find()

In [None]:
# The plot function attached to lr_finder
# Note that we get a smoothened version where `skip_start` and `skip_end` decide howmuch to trim-off from start or end

def plot(self, skip_start:int=10, skip_end:int=5, suggestion:bool=False, return_fig:bool=None,
             **kwargs)->Optional[plt.Figure]:
        "Plot learning rate and losses, trimmed between `skip_start` and `skip_end`. Optionally plot and return min gradient"
        lrs = self._split_list(self.lrs, skip_start, skip_end)
        losses = self._split_list(self.losses, skip_start, skip_end)
        losses = [x.item() for x in losses]
        if 'k' in kwargs: losses = self.smoothen_by_spline(lrs, losses, **kwargs)
        fig, ax = plt.subplots(1,1)
        ax.plot(lrs, losses)
        ax.set_ylabel("Loss")
        ax.set_xlabel("Learning Rate")
        ax.set_xscale('log')
        ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e'))
        if suggestion:
            try: mg = (np.gradient(np.array(losses))).argmin()
            except:
                print("Failed to compute the gradients, there might not be enough points.")
                return
            print(f"Min numerical gradient: {lrs[mg]:.2E}")
            ax.plot(lrs[mg],losses[mg],markersize=10,marker='o',color='red')
            self.min_grad_lr = lrs[mg]
            ml = np.argmin(losses)
            print(f"Min loss divided by 10: {lrs[ml]/10:.2E}")
        if ifnone(return_fig, defaults.return_fig): return fig
        if not IN_NOTEBOOK: plot_sixel(fig)

In [None]:
learn_lm.recorder.plot()

In [None]:
# Trim the last 15 datapoints so that we have clearer view
learn_lm.recorder.plot(skip_end=15)

We want to select a `learning_rate` where the slope is steep and not going upwards.<br>
`lr=5e-3` seems to be a sensible choice

In [None]:
lr=5e-3
learn_lm.to_fp16()
learn_lm.fit_one_cycle(5, lr, moms=(0.8,0.7))

In [None]:
learn_lm.save('fit_freezed_5_epochs')

In [None]:
learn_lm.load('fit_freezed_5_epochs')

In [None]:
# now we see that the aacuracy is positively improving
# we can unfreeze and train further
learn_lm.unfreeze()
learn_lm.fit_one_cycle(10,lr/5, moms=(0.8, 0.7))

In [None]:
learn_lm.save('fine_tuned_lm')
learn_lm.save_encoder('fine_tuned_lm_enc')

In [None]:
TEXT = "i liked this movie because"
N_WORDS = 40
N_SENTENCES = 2

In [None]:
print("\n\n".join(learn_lm.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))

In [None]:
TEXT = "This movie was"
N_WORDS = 30
N_SENTENCES = 2
print("\n\n".join(learn_lm.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))

### Now the language model is much more "*IMDB like*"

In [None]:
bs = 48
data_clas = (TextList.from_folder(path, vocab=data_lm.vocab)
             #grab all the text files in path
             .split_by_folder(valid='test')
             #split by train and valid folder (that only keeps 'train' and 'test' so no need to filter)
             .label_from_folder(classes=['neg', 'pos'])
             #label them all with their folders
             .databunch(bs=bs, num_workers=1))

In [None]:
data_clas.save('imdb_textlist_classifier')

In [None]:
data_clas.show_batch()

## Creating ultimate-model: IMDB review classifier
Three simple steps to create our review classifier after fine tuning our language model with IMDb dataset:
    1. Create a `text_classifier_learner` with `AWD_LSTM` as base archietecture(note that we can use any model here as base architecture, but it has to be the same with the one we just trained as language model. So if we want a transformer here, we would be training a transformer language model in the previous section)
    2. Load the `fine-tuned-language-model-encoder`. We have save both the encoder as well as the entire model. We would only be needing the encoder's weights here as we will be replacing the head as a freshly initialize classifier and train it later.
    3. Freeze the body(base architecture) of our model, and train just the head first. We can make use of our usual trategy be looking for the optimal learning rate.

In [None]:
learn_clas = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.3)
learn_clas.load_encoder('fine_tuned_lm_enc')
learn_clas.freeze()

In [None]:
learn_clas.lr_find()

In [None]:
learn_clas.recorder.plot()

### Here we will be turning our model to mixed-precision in order to speed up the training

In [None]:
learn_clas.to_fp16();

In [None]:
learn_clas.fit_one_cycle(10, 2e-2, moms=(0.8, 0.7))

In [None]:
learn_clas.save('learn_clas_freezed')

### Gradual unfreezing during fine-tuning our classifier
Fine-tuning the target classifier is the most critical part of the *transfer learning method*. Overly aggressive fine-tuning will cause catastrophic forgetting, eliminating the benefit of the information captured through language modeling; too cautious fine-tuning will lead to slow convergence (and resultant overfiting). Besides discriminative fine-tuning and triangular learning rates, **gradual unfreezing** is proposed.

Jeremy et al. found that for RNN-base NLP models, by gradually unfreeze the layers from head to bottom we minimize the forgetting incurred for each *transfer learning* thus maximize our effort done in the previous training section.

We first unfreeze the **last layer** and fine-tune all un-frozen layers for one epoch.

Then unfreeze the **next lower frozen layer** and repeat.

Until all unfrozen layers converges.

In [None]:
learn_clas.freeze_to(-2)
learn_clas.fit_one_cycle(5, slice(1e-2/(2.6**4),1e-2), moms=(0.8, 0.7))

In [None]:
learn_clas.save('learn_clas_unfreed_second')

In [None]:
learn_clas.freeze_to(-3)
learn_clas.fit_one_cycle(5, slice(5e-3/(2.6**4), 5e-3), moms=(0.8, 0.7))

In [None]:
learn_clas.save('learn_clas_unfreezed_third')

In [None]:
learn_clas.load('learn_clas_unfreezed_third');

In [None]:
learn_clas.unfreeze()
learn_clas.fit_one_cycle(15, slice(4e-4/(2.6**4),5e-4), moms=(0.8,0.7))

In [None]:
learn_clas.save('learn_clas_unfreezed_final')

The state-of-the-art result for IMDB sentimant classification result in 2017 is **94.1%**
What we can do even better, is to build a **reversed model** as well and training a meta-learner on top of that.
For this technique, we will be experimenting with more detail in my [sentiment analysis with non-English language]() repo.