# Training text models with fastai
- this notebook contains the steps to train a deep learning model on a text dataset
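- the first section fine-tunes a pretrained language model on the IMDB reviews; the second section trains a text classifier that reuses the fine-tuned encoder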

In [1]:
#hide
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [2]:
#hide
from fastbook import *
from fastai.text.all import *

In [None]:
# Suffix appended to the model / encoder file names that are written and read
# later in this notebook (export, save, save_encoder, load_learner, load_encoder).
modifier = "mar3"

# Training a language model
- take a pretrained AWD_LSTM language model and fine-tune it further on the IMDB dataset

In [3]:
# Reference copy of the chapter 10 language model definition.
# It is wrapped in a string literal, so none of it executes; because the string
# is the last expression in the cell, the notebook echoes it as the cell output.
'''
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])

dls_lm = DataBlock(
    blocks=TextBlock.from_folder(path, is_lm=True),
    get_items=get_imdb, splitter=RandomSplitter(0.1)
).dataloaders(path, path=path, bs=128, seq_len=80)

learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3, 
    metrics=[accuracy, Perplexity()]).to_fp16()
    
learn.fit_one_cycle(1, 2e-2)

learn.unfreeze()
learn.fit_one_cycle(10, 2e-3)

learn.save_encoder('finetuned')

'''

"\nget_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])\n\ndls_lm = DataBlock(\n    blocks=TextBlock.from_folder(path, is_lm=True),\n    get_items=get_imdb, splitter=RandomSplitter(0.1)\n).dataloaders(path, path=path, bs=128, seq_len=80)\n\nlearn = language_model_learner(\n    dls_lm, AWD_LSTM, drop_mult=0.3, \n    metrics=[accuracy, Perplexity()]).to_fp16()\n    \nlearn.fit_one_cycle(1, 2e-2)\n\nlearn.unfreeze()\nlearn.fit_one_cycle(10, 2e-3)\n\nlearn.save_encoder('finetuned')\n\n"

In [4]:
%%time
# Resolve the IMDB dataset path and list its contents.
# The string literal below is an earlier classifier attempt kept for reference;
# it is not the last expression of the cell, so it has no effect.
'''dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=16)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)'''
# Note on drop_mult: the LSTM has several dropout probabilities for different
# things. Once they are set, drop_mult scales all of them, so every dropout
# probability can be changed at once while keeping their relative sizes.
path = untar_data(URLs.IMDB)
path.ls()

CPU times: user 3.33 ms, sys: 871 µs, total: 4.2 ms
Wall time: 23.8 ms


(#7) [Path('/storage/data/imdb/README'),Path('/storage/data/imdb/tmp_lm'),Path('/storage/data/imdb/imdb.vocab'),Path('/storage/data/imdb/tmp_clas'),Path('/storage/data/imdb/test'),Path('/storage/data/imdb/train'),Path('/storage/data/imdb/unsup')]

In [5]:
%%time
# dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=8)
get_imdb = partial(get_text_files, folders=['train', 'test', 'unsup'])
dls = TextDataLoaders.from_folder(path, valid = 'test', is_lm=True, bs=16)

In [6]:
# Sanity check: display at most 4 samples from a batch of the language-model dataloaders.
dls.show_batch(max_n=4)

Unnamed: 0,text,text_
0,"xxbos 50 years old , this musical comedy fantasy might look its age , but it wears it with dignity . \n\n xxmaj this film is still great fun . xxmaj crosby was never really romantic lead material , but he delivers the material with the lightly humorous edge it needs . xxmaj bendix plays broad and is huge fun in a part which calls upon his strengths . xxmaj hardwicke -","50 years old , this musical comedy fantasy might look its age , but it wears it with dignity . \n\n xxmaj this film is still great fun . xxmaj crosby was never really romantic lead material , but he delivers the material with the lightly humorous edge it needs . xxmaj bendix plays broad and is huge fun in a part which calls upon his strengths . xxmaj hardwicke - how"
1,"something despite stating that he wants to escape so he can see his daughter again , and xxmaj kate becomes emotionally tough seconds after going to pieces over someone that ripped her off for a xxunk . xxmaj yeah . \n\n xxmaj after starting out as a "" this could happen to anyone "" movie , it quickly falls apart as it introduces ideas that make it more and more unrealistic .","despite stating that he wants to escape so he can see his daughter again , and xxmaj kate becomes emotionally tough seconds after going to pieces over someone that ripped her off for a xxunk . xxmaj yeah . \n\n xxmaj after starting out as a "" this could happen to anyone "" movie , it quickly falls apart as it introduces ideas that make it more and more unrealistic . a"
2,"--polarisdib xxbos xxmaj this film takes you on one family 's impossible journey , and makes you feel every step of their odyssey . xxmaj beautifully acted and photographed , heartbreakingly real . xxmaj its last line , with its wistful hope , is one of the more powerful in memory . xxbos xxup ok , this has got 2 be one of the worst excuses 4 a movie that i have","xxbos xxmaj this film takes you on one family 's impossible journey , and makes you feel every step of their odyssey . xxmaj beautifully acted and photographed , heartbreakingly real . xxmaj its last line , with its wistful hope , is one of the more powerful in memory . xxbos xxup ok , this has got 2 be one of the worst excuses 4 a movie that i have ever"
3,"xxmaj really bad and redundant special effects , zombies that look like normal people ( except for a white additional skin pulled over their faces ) , xxup way xxup to xxup much fake blood ( i like realism a lot , the combination of realism and xxmaj zombie films being debatable , but the presented gore is just plain silly ) . xxmaj the camera stays quite long with xxunk scenes","really bad and redundant special effects , zombies that look like normal people ( except for a white additional skin pulled over their faces ) , xxup way xxup to xxup much fake blood ( i like realism a lot , the combination of realism and xxmaj zombie films being debatable , but the presented gore is just plain silly ) . xxmaj the camera stays quite long with xxunk scenes ,"


In [None]:
%%time
# define and train model
learn = language_model_learner(dls,AWD_LSTM,drop_mult=0.5,metrics=accuracy).to_fp16()
learn.fine_tune(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,4.51207,4.297565,0.271312,31:19


epoch,train_loss,valid_loss,accuracy,time


In [None]:
# Generate a 20-word continuation of a prompt with the language model trained above.
# Alternative prompt kept for reference:
# preds = learn.predict('The star is', n_words=20)
learn.predict("what comes next", n_words=20)

In [None]:
# Export the whole language-model learner to an absolute, writeable location.
# This cell runs before any learn.save(...) call has had a chance to create the
# models folder, and export() writes straight to the given file, so make sure
# the target directory exists first instead of failing on a fresh machine.
Path('/notebooks/temp/models').mkdir(parents=True, exist_ok=True)
learn.export('/notebooks/temp/models/lm_model_'+modifier)

In [None]:
# Remember the learner's original path so it can be restored once saving is done.
keep_path = learn.path

In [None]:
# Workaround: point the learner at a writeable directory so the save calls
# that follow write under /notebooks/temp.
learn.path = Path('/notebooks/temp')

In [None]:
# Display the learner's current path to confirm the override above took effect.
learn.path

In [None]:
# Display the learner's model directory setting
# (presumably the sub-folder of learn.path used by save/save_encoder -- confirm).
learn.model_dir

In [None]:
# Save the language-model learner's state under the name 'lm_' + modifier.
learn.save('lm_'+modifier)

In [None]:
# Save only the encoder under the name 'ft_' + modifier; the classifier section
# later loads it by the same name with load_encoder.
learn.save_encoder('ft_'+modifier)

In [None]:
# Undo the path workaround: restore the learner's original path saved in keep_path.
learn.path = keep_path

In [None]:
# Reload the exported language-model learner so it can be trained further.
# - cpu=False: load_learner loads onto the CPU by default; loading onto the
#   default device instead lets the follow-up training use the GPU when present.
# - An exported learner carries no data items (export is aimed at inference),
#   so reattach the original language-model dataloaders before fitting.
learn2 = load_learner('/notebooks/temp/models/lm_model_'+modifier, cpu=False)
learn2.dls = dls

In [None]:
# Continue training the reloaded language model: fit_one_cycle for 10 epochs at 2e-3.
learn2.fit_one_cycle(10, 2e-3)

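# Classifier experiment
- train a text classifier on the IMDB reviews, reusing the encoder saved in the language model section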

In [None]:
# Earlier classifier attempt kept for reference only: the whole cell is a
# string literal, so nothing inside it (including the %%time line) executes.
'''
%%time


dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=16)
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(1, 1e-2)
'''

In [None]:
# Classifier dataloaders, adapted from the chapter 10 definition.
# The text block is given the language model's vocabulary (dls.vocab),
# presumably so token ids line up with the fine-tuned encoder.
# Items come from the train and test folders, labels from the parent folder
# name, and the 'test' folder is the validation split.
dls_clas = DataBlock(
    blocks=(
        TextBlock.from_folder(path, vocab=dls.vocab),
        CategoryBlock,
    ),
    get_items=partial(get_text_files, folders=['train', 'test']),
    get_y=parent_label,
    splitter=GrandparentSplitter(valid_name='test'),
).dataloaders(path, path=path, bs=128, seq_len=72)

In [None]:
# Display the dataset root path used for the classifier dataloaders.
path

In [None]:
# AWD_LSTM text classifier built on the classifier dataloaders, tracking
# accuracy and switched to fp16 via to_fp16.
learn_clas = text_classifier_learner(
    dls_clas,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=accuracy,
).to_fp16()

In [None]:
%%time
# Point the classifier learner at /notebooks/temp, the same directory the
# language-model learner was pointed at when its encoder was saved
# (presumably so load_encoder can find that file -- confirm).
# learn.path = Path('/notebooks/temp')
learn_clas.path = Path('/notebooks/temp')

In [None]:
# Load the encoder saved from the language model ('ft_' + modifier) into the classifier.
learn_clas = learn_clas.load_encoder('ft_'+modifier)

In [None]:
# First classifier training pass: fit_one_cycle for 1 epoch at 2e-2.
learn_clas.fit_one_cycle(1, 2e-2)

In [None]:
%%time
# Second classifier training pass after freeze_to(-2)
# (presumably leaves only the last two parameter groups trainable -- confirm).
# NOTE(review): this reuses the same 2e-2 rate as the previous pass; check
# whether a lower or sliced learning rate was intended at this stage.
learn_clas.freeze_to(-2)
learn_clas.fit_one_cycle(1, 2e-2)

In [None]:
# Spot check: classify a clearly negative review and keep the result in preds.
preds = learn_clas.predict("this film shows incredibly bad writing and is a complete disaster")

In [None]:
# Display the prediction currently stored in preds.
preds

In [None]:
# Spot check: classify a clearly positive review.
# The cell ends with `preds` so the result is displayed; previously the value
# was assigned here but never shown anywhere.
preds = learn_clas.predict("what a wonderful film")
preds

In [None]:
# Save the classifier's state under the name 'classifier_single_epoch_' + modifier.
learn_clas.save('classifier_single_epoch_'+modifier)