# Training a text classifier model with fastai
- this notebook assumes you have already run text_model_training.ipynb notebook
- In this notebook, the IMDB dataset is ingested
- the first section

In [1]:
#hide
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [2]:
#hide
from fastbook import *
from fastai.text.all import *

In [3]:
modifier = 'mar3'

# Ingest the dataset
- define the path for the dataset
- create a TextDataLoaders object

In [4]:
%%time
# create dataloaders object
'''dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=16)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)'''
# LSTM have multiple dropout probabilities for different things. Once you set them, this drop_mult property scales all of them. So you can change all dropout probabilities simultaneously using this, keeping their relative size
path = untar_data(URLs.IMDB)
path.ls()

CPU times: user 6.47 ms, sys: 593 µs, total: 7.06 ms
Wall time: 15.1 ms


(#7) [Path('/storage/data/imdb/README'),Path('/storage/data/imdb/tmp_lm'),Path('/storage/data/imdb/imdb.vocab'),Path('/storage/data/imdb/tmp_clas'),Path('/storage/data/imdb/test'),Path('/storage/data/imdb/train'),Path('/storage/data/imdb/unsup')]

In [None]:
%%time
# dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=8)
# get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])
dls = TextDataLoaders.from_folder(path, valid = 'test', is_lm=True, bs=16)

In [None]:
dls.show_batch(max_n=4)

# Define classifier

In [113]:
# dls definition cribbed from chapter 10
# this works
'''
dls_clas = DataBlock(
    blocks=(TextBlock.from_folder(path, vocab=dls.vocab),CategoryBlock),
    get_y = parent_label,
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test')
).dataloaders(path, path=path, bs=128, seq_len=72)

Partial functions allow us to fix a certain number of arguments of a function and generate a new function

The independent variable is often referred to as x, and the dependent variable is often referred to as y.
Here, we are telling fastai what function to call to create the labels in our dataset: 
get_y=parent_label parent_label is a function provided by fastai that simply gets the name of the 
folder a file is in. Because we put each of our bear images into folders based on the type of bear, 
this is going to give us the labels that we need.

get_items: get_items is completely decoupled from get_x and get_y: it is there to return all your items from 
the source. You can pass get_x and get_y (or a list of getters) to explain how to get your x and y from the result of 
get_items and they both default to noop (which is why when get_items return filenames, we don’t pass a get_x)

GrandparentSplitter - Split items from the grand parent folder names (train_name and valid_name).

seq_len: The LMDataLoader will concatenate all texts (maybe shuffled) in one big stream, 
split it in bs contiguous sentences, then go through those seq_len at a time.

'''

"\ndls_clas = DataBlock(\n    blocks=(TextBlock.from_folder(path, vocab=dls.vocab),CategoryBlock),\n    get_y = parent_label,\n    get_items=partial(get_text_files, folders=['train', 'test']),\n    splitter=GrandparentSplitter(valid_name='test')\n).dataloaders(path, path=path, bs=128, seq_len=72)\n\n\n\nThe independent variable is often referred to as x, and the dependent variable is often referred to as y.\nHere, we are telling fastai what function to call to create the labels in our dataset: \nget_y=parent_label parent_label is a function provided by fastai that simply gets the name of the \nfolder a file is in. Because we put each of our bear images into folders based on the type of bear, \nthis is going to give us the labels that we need.\n\n"

In [None]:
'''
# details on sequence length meaining

bs,sl = 4,3
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).map(tensor)

dl = LMDataLoader(ints, bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])
'''

In [None]:
'''
DEFINITION OF DATABLOCK:

Generic container to quickly build Datasets and DataLoaders

To build a DataBlock you need to give the library four things: the types of your input/labels, 
and at least two functions: get_items and splitter. You may also need to include get_x and get_y 
or a more generic list of getters that are applied to the results of get_items.

Once those are provided, you automatically get a Datasets or a DataLoaders
'''

In [115]:
# refactored definition of dataloader object
'''dls = TextDataLoaders.from_df(
    df_tok, path=path, 
    vocab = make_vocab(count),
    text_col = 'text',label_col='label')
'''

dls_clas = TextDataLoaders.from_folder(path=path,
    blocks=(TextBlock.from_folder(path, vocab=dls.vocab),CategoryBlock),
#path, vocab=dls.vocab,     
#    text_col= 'text', 
#   label_col='label',
    get_y = parent_label,
    get_items=partial(get_text_files, folders=['train', 'test']),
    splitter=GrandparentSplitter(valid_name='test'),
#    get_items=partial(get_text_files, folders=['train', 'test']),
#    splitter=GrandparentSplitter(valid_name='test'), 
    bs=128, seq_len=72)

In [116]:
dls_clas.path

Path('/storage/data/imdb')

In [117]:
keep_path = path

In [118]:
# ch 10 style Path('/storage/data/imdb')
path

Path('/storage/data/imdb')

In [119]:
%%time
# define a text_classifier_learner object
learn_clas = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, 
                                metrics=accuracy).to_fp16()

CPU times: user 2.73 s, sys: 4.07 s, total: 6.8 s
Wall time: 1.81 s


In [120]:
# Path('/storage/data/imdb')
learn_clas.path

Path('/storage/data/imdb')

In [121]:
%%time
# set the path to the location of the encoder
learn_clas.path = Path('/notebooks/temp')

CPU times: user 24 µs, sys: 11 µs, total: 35 µs
Wall time: 39.3 µs


In [122]:
# load the encoder that was saved when the language model was trained
learn_clas = learn_clas.load_encoder('ft_'+modifier)

In [123]:
learn_clas.path = path

In [124]:
# ch 10 style Path('/storage/data/imdb')
learn_clas.path

Path('/storage/data/imdb')

In [125]:
%%time
learn_clas.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.416182,,,01:37


  warn("Your generator is empty.")


CPU times: user 1min 7s, sys: 30.5 s, total: 1min 37s
Wall time: 1min 37s


In [None]:
%%time
learn_clas.freeze_to(-2)
learn_clas.fit_one_cycle(1, 2e-2)

In [None]:
preds = learn_clas.predict("this film shows incredibly bad writing and is a complete disaster")

In [86]:
preds

('neg', TensorText(0), TensorText([9.9998e-01, 2.0704e-05]))

In [33]:
preds = learn_clas.predict("what a terrible film")

In [23]:
preds

('neg', TensorText(0), TensorText([9.9956e-01, 4.4422e-04]))

In [17]:
learn_clas.save('classifier_single_epoch_'+modifier+'b')

Path('/notebooks/temp/models/classifier_single_epoch_mar3b.pth')