# Training a text classifier model with fastai
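- This notebook assumes you have already run the text_model_training.ipynb notebook, which saves the language model encoder that is loaded here
- In this notebook, the IMDB dataset is ingested
- The sections that follow define the text classifier, fine-tune it using the saved encoder, and exercise it on sample text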

In [1]:
#hide
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [2]:
#hide
from fastbook import *
from fastai.text.all import *

In [3]:
# switch to control whether direct TDL or DataBlocks definition used
# NOTE(review): `tdl` is not read by any cell visible in this notebook - the
# dataloaders are always built with TextDataLoaders.from_folder; confirm
# whether this switch is still needed
tdl = True

In [4]:
# ensure the modifier value matches the value set for modifier in text_model_training notebook
# the value is embedded in file names used later in this notebook:
#   - the encoder that is loaded:    'ft_' + modifier
#   - the classifier that is saved:  'classifier_single_epoch_' + modifier + 'd'
modifier = "july4_2021"

# Ingest the dataset
- define the path for the dataset
- create a TextDataLoaders object

In [5]:
%%time
# get the local path of the IMDB dataset from untar_data and list its
# top-level contents (no dataloaders object is created in this cell)
path = untar_data(URLs.IMDB)
path.ls()

CPU times: user 2.86 ms, sys: 822 µs, total: 3.68 ms
Wall time: 11.8 ms


(#7) [Path('/storage/data/imdb/README'),Path('/storage/data/imdb/tmp_lm'),Path('/storage/data/imdb/imdb.vocab'),Path('/storage/data/imdb/tmp_clas'),Path('/storage/data/imdb/test'),Path('/storage/data/imdb/train'),Path('/storage/data/imdb/unsup')]

# Define the text classifier

In [6]:
%%time
# define TextDataLoaders object from the folders under path;
# valid='test' names the test folder as the validation set
dls_clas = TextDataLoaders.from_folder(path, valid='test')


CPU times: user 7.08 s, sys: 6.01 s, total: 13.1 s
Wall time: 43.7 s


In [7]:
# directory structure of the IMDB curated dataset
'''
├── test
│   ├── neg
│   └── pos
├── tmp_clas
├── tmp_lm
├── train
│   ├── neg
│   └── pos
└── unsup
'''

'\n├── test\n│   ├── neg\n│   └── pos\n├── tmp_clas\n├── tmp_lm\n├── train\n│   ├── neg\n│   └── pos\n└── unsup\n'

In [8]:
dls_clas.path

Path('/storage/data/imdb')

In [9]:
dls_clas.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero",pos
1,"xxbos * ! ! - xxup spoilers - ! ! * \n\n xxmaj before i begin this , let me say that i have had both the advantages of seeing this movie on the big screen and of having seen the "" authorized xxmaj version "" of this movie , remade by xxmaj stephen xxmaj king , himself , in 1997 . \n\n xxmaj both advantages made me appreciate this version of "" the xxmaj shining , "" all the more . \n\n xxmaj also , let me say that xxmaj i 've read xxmaj mr . xxmaj king 's book , "" the xxmaj shining "" on many occasions over the years , and while i love the book and am a huge fan of his work , xxmaj stanley xxmaj kubrick 's retelling of this story is far more compelling … and xxup scary . \n\n xxmaj kubrick",pos
2,"xxbos xxmaj heavy - handed moralism . xxmaj writers using characters as mouthpieces to speak for themselves . xxmaj predictable , plodding plot points ( say that five times fast ) . a child 's imitation of xxmaj britney xxmaj spears . xxmaj this film has all the earmarks of a xxmaj lifetime xxmaj special reject . \n\n i honestly believe that xxmaj jesus xxmaj xxunk and xxmaj julia xxmaj xxunk set out to create a thought - provoking , emotional film on a tough subject , exploring the idea that things are not always black and white , that one who is a criminal by definition is not necessarily a bad human being , and that there can be extenuating circumstances , especially when one puts the well - being of a child first . xxmaj however , their earnestness ends up being channeled into preachy dialogue and trite",neg


In [10]:
# remember the dataset path so that learn_clas.path can be set back to it later
keep_path = path
# the two spaces after the colon reproduce print()'s default separator
print(f"keep_path is:  {keep_path}")

keep_path is:  /storage/data/imdb


In [11]:
%%time
# define a text_classifier_learner object
learn_clas = text_classifier_learner(dls_clas, AWD_LSTM, 
                                metrics=accuracy).to_fp16()

CPU times: user 6.89 s, sys: 1.16 s, total: 8.05 s
Wall time: 5.06 s


# Fine-tune the text classifier
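Use the encoder created as part of training the language model to fine-tune the text classifier. The learner's path is temporarily pointed at the directory that holds the saved encoder, then set back to the dataset path.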

In [11]:
# Path('/storage/data/imdb')
learn_clas.path

Path('/storage/data/imdb')

In [12]:
%%time
# set the path to the location of the encoder
# NOTE(review): hard-coded absolute path; presumably the directory that the
# text_model_training notebook saved the encoder under - confirm before
# running in a different environment
learn_clas.path = Path('/notebooks/temp')

CPU times: user 339 µs, sys: 80 µs, total: 419 µs
Wall time: 54.4 µs


In [13]:
# load the encoder that was saved when the language model was trained
# the file name is 'ft_' + modifier, so modifier must match the value used
# when the encoder was saved
learn_clas = learn_clas.load_encoder('ft_'+modifier)

In [14]:
learn_clas.path

Path('/notebooks/temp')

In [15]:
# set the path back to the original path
learn_clas.path = keep_path

In [16]:
# ch 10 style Path('/storage/data/imdb')
learn_clas.path

Path('/storage/data/imdb')

In [None]:
%%time
# fine tune the model
# NOTE(review): 5 epochs are requested, but the output recorded with this
# cell lists only epochs 0-3 - the saved run looks incomplete; re-run to
# completion before relying on the reported accuracy
learn_clas.fit_one_cycle(5, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.431991,0.294341,0.8766,08:29
1,0.4096,0.281576,0.8838,04:39
2,0.396112,0.270241,0.88888,04:10
3,0.411736,0.263737,0.891,05:10


In [None]:
x, y = first(dls_clas.train)
x.shape, y.shape, len(dls_clas.train)

In [None]:
learn_clas.summary()

# Exercise the text classifier
Apply the fine-tuned text classifier on some text samples.

In [None]:
preds = learn_clas.predict("this film shows incredibly bad writing and is a complete disaster")

In [None]:
preds

In [None]:
preds = learn_clas.predict("this film shows incredible talent and is a complete triumph")

In [None]:
preds

In [None]:
# save the classifier model, with the learner's path pointed at /notebooks/temp
# NOTE(review): the file name says "single_epoch" although fit_one_cycle is
# called with 5 epochs in this notebook - confirm the name is still intended
learn_clas.path = Path('/notebooks/temp')
learn_clas.save(f'classifier_single_epoch_{modifier}d')