# Training a text classifier model with fastai
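- This notebook assumes you have already run the text_model_training.ipynb notebook, which trains the language model and saves the encoder that is loaded below
- In this notebook, the IMDB dataset is ingested and a sentiment classifier is trained on it
- The first section installs fastbook and imports the fastai text libraries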

In [1]:
#hide
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [2]:
#hide
from fastbook import *
from fastai.text.all import *

In [3]:
# Run identifier: selects the saved encoder file ('ft_' + modifier) loaded below and is
# embedded in the file name of the classifier saved at the end of the notebook.
modifier = 'mar3'

# Ingest the dataset
- Define the path for the dataset
- Create a language-model TextDataLoaders object, whose vocabulary is reused by the classifier's dataloaders

In [4]:
%%time
# create dataloaders object
'''dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=16)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)'''
# LSTM have multiple dropout probabilities for different things. Once you set them, this drop_mult property scales all of them. So you can change all dropout probabilities simultaneously using this, keeping their relative size
path = untar_data(URLs.IMDB)
path.ls()

CPU times: user 3.99 ms, sys: 3 µs, total: 3.99 ms
Wall time: 4.91 ms


(#7) [Path('/storage/data/imdb/README'),Path('/storage/data/imdb/tmp_lm'),Path('/storage/data/imdb/imdb.vocab'),Path('/storage/data/imdb/tmp_clas'),Path('/storage/data/imdb/test'),Path('/storage/data/imdb/train'),Path('/storage/data/imdb/unsup')]

In [5]:
%%time
# dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=8)
# get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])
dls = TextDataLoaders.from_folder(path, valid = 'test', is_lm=True, bs=16)

CPU times: user 4.26 s, sys: 1.77 s, total: 6.03 s
Wall time: 6.06 s


# Define classifier

In [6]:
# Classification dataloaders (DataBlock recipe adapted from fastbook chapter 10).
dls_clas = DataBlock(
    # Text input built with the vocabulary of `dls`, paired with a category target.
    blocks=(TextBlock.from_folder(path, vocab=dls.vocab), CategoryBlock),
    # Only the train/ and test/ folders are read; unsup/ is left out.
    get_items=partial(get_text_files, folders=['train', 'test']),
    # The target is derived from each file's parent folder.
    get_y=parent_label,
    # Files under test/ make up the validation set.
    splitter=GrandparentSplitter(valid_name='test'),
).dataloaders(path, path=path, bs=128, seq_len=72)

In [7]:
# Display the dataset root directory returned by untar_data.
path

Path('/storage/data/imdb')

In [8]:
%%time
# define a text_classifier_learner object
learn_clas = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, 
                                metrics=accuracy).to_fp16()

CPU times: user 6.09 s, sys: 1.07 s, total: 7.16 s
Wall time: 1.87 s


In [9]:
%%time
# Point the learner's base directory at /notebooks/temp. Model files are resolved
# relative to it: the classifier saved at the end of this notebook lands in its
# models/ subfolder, and the encoder loaded in the next cell is presumably read
# from the same place — confirm the encoder file exists there.
# NOTE(review): absolute, environment-specific path; adjust when running elsewhere.
learn_clas.path = Path('/notebooks/temp')

CPU times: user 114 µs, sys: 23 µs, total: 137 µs
Wall time: 42.9 µs


In [10]:
# Load the encoder that was saved when the language model was trained
# (file name 'ft_' + modifier, looked up relative to learn_clas.path).
# NOTE(review): the encoder weights must match the vocabulary passed to the
# classifier DataBlock (dls.vocab) — confirm the language model was trained
# with the same vocabulary.
learn_clas = learn_clas.load_encoder('ft_'+modifier)

In [11]:
%%time
# First training pass: one epoch with the one-cycle policy, using 2e-2 as the learning rate.
# NOTE(review): presumably most of the model is still frozen here, since the next
# cell calls freeze_to(-2) before training again — confirm.
learn_clas.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.412126,0.27517,0.88624,03:04


CPU times: user 2min 7s, sys: 56.9 s, total: 3min 4s
Wall time: 3min 4s


In [12]:
%%time
# Second training pass: change which parameter groups are frozen, then train one
# more epoch with the same learning rate as the first pass.
# NOTE(review): freeze_to(-2) presumably leaves only the last two parameter groups
# trainable (gradual unfreezing) — confirm against the fastai docs.
learn_clas.freeze_to(-2)
learn_clas.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.252515,0.200414,0.92344,03:30


CPU times: user 2min 25s, sys: 1min 4s, total: 3min 29s
Wall time: 3min 30s


In [13]:
# Sanity check: run the trained classifier on a clearly negative review.
preds = learn_clas.predict("this film shows incredibly bad writing and is a complete disaster")

In [14]:
# Display the prediction: a tuple of (predicted label, class index, per-class scores).
preds

('neg', TensorText(0), TensorText([9.9997e-01, 2.6378e-05]))

In [15]:
# Sanity check: run the trained classifier on a clearly positive review.
# Note that this overwrites the previous value of `preds`.
preds = learn_clas.predict("what a wonderful film")

In [16]:
# Display the prediction: a tuple of (predicted label, class index, per-class scores).
preds

('pos', TensorText(1), TensorText([0.0054, 0.9946]))

In [17]:
# Save the trained classifier; the file is written under learn_clas.path/models
# and its name embeds `modifier` to identify this run.
# NOTE(review): the name says "single_epoch" although two one-epoch passes were
# run above — confirm the name still describes what is being saved.
learn_clas.save('classifier_single_epoch_'+modifier+'b')

Path('/notebooks/temp/models/classifier_single_epoch_mar3b.pth')