# Training a text classifier model on a standalone dataset with fastai
- This notebook ingests the Kaggle Covid tweets dataset (https://www.kaggle.com/datatattle/covid-19-nlp-text-classification)
- This notebook assumes you have already run the text_standalone_dataset_lm.ipynb notebook to train the language model and save its encoder
- The encoder from the language model is used to create the text classifier

In [1]:
#hide
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [2]:
#hide
from fastbook import *
from fastai.text.all import *

In [3]:
# suffix shared by the saved-model file names in this notebook: the encoder is loaded
# as 'ft_' + modifier and the classifier is saved under a name that embeds it
modifier = "standalone_mar17"

# Ingest the dataset
- define the path for the dataset
- read the training CSV into a pandas dataframe (the TextDataLoaders object is created in the next section)

In [4]:
%%time
# resolve the local directory for the 'covid_tweets' dataset and list its contents
# (the listing is expected to show the train and test sub-directories)
path = URLs.path('covid_tweets')
path.ls()

CPU times: user 2.71 ms, sys: 465 µs, total: 3.17 ms
Wall time: 3.94 ms


(#2) [Path('/storage/archive/covid_tweets/train'),Path('/storage/archive/covid_tweets/test')]

In [6]:
# load the training tweets into a dataframe; the explicit ISO-8859-1 encoding is required
# because reading this file without it fails with a decode error
df_train = pd.read_csv(
    path / 'train/Corona_NLP_train.csv',
    encoding='ISO-8859-1',
)

# Define the text classifier

In [8]:
%%time
# create TextDataLoaders object
dls = TextDataLoaders.from_df(df_train, path=path, text_col='OriginalTweet',label_col='Sentiment')
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxrep 5 ? ? ? xxrep 7 ? ? ? xxrep 7 ? xxrep 4 ? xxrep 4 ? xxrep 11 ? ? ? xxrep 6 ? xxrep 4 ? , xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 3 ? ? ? ? ? xxrep 4 ? ? ? xxrep 3 ? , xxrep 4 ? ? ? ? ? xxrep 6 ? xxrep 3 ? xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? \r\r\n▁ xxrep 5 ? xxrep 6 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 4 ? ? ? xxrep 4 ? xxrep 6 ? xxrep 4 ? xxrep 8 ? ? ? xxrep 6 ? ? ? xxrep 5 ? ? ? xxrep 3 ? xxrep 4 ? ? ? xxrep 7 ? xxrep 5 ? - xxrep 8 ? xxrep 5",Neutral
1,xxbos xxmaj dr xxmaj xxunk xxmaj mirza : xxup mbbs - xxup rmc xxmaj pakistan \r\r\n msc xxmaj public xxmaj health - xxup lsh xxup uk . \r\r\n ex - global xxmaj coordinator xxup who . \r\r\n ex - regional xxmaj adviser xxup who . \r\r\n xxmaj founder & & xxmaj executive xxmaj coordinator - xxmaj the xxmaj network for xxmaj consumer xxmaj protection xxmaj pakistan \r\r\n xxmaj my lord : ba - xxmaj national xxmaj college xxmaj karachi . \r\r\n xxup llb - xxmaj sindh xxmaj muslim xxmaj law college . # xxup covid2019 https : / / t.co / xxunk,Neutral
2,xxbos xxup cdc xxup approved ! xxup used xxup in xxup hospitals ! xxup kills xxup viruses ! & & xxmaj we xxmaj have it xxunk \r\r\n https : / / t.co / xxunk \r\r\n xxmaj xxunk xxup xxunk xxup to xxup go ! xxunk - xxunk - xxunk . xxmaj call / xxmaj pay / xxmaj pull xxmaj up & & xxmaj go ! \r\r\n https : / / t.co / xxunk \r\r\n▁ # xxunk # orangecounty # xxunk # xxunk \r\r\n▁ # covid19pandemic # coronavirus # xxunk https : / / t.co / xxunk,Negative


CPU times: user 47.3 s, sys: 2.72 s, total: 50 s
Wall time: 52.8 s


In [9]:
# confirm the directory the DataLoaders object is rooted at
dls.path

Path('/storage/archive/covid_tweets')

In [10]:
# remember the dataset path so the learner's path can be put back after the encoder
# has been loaded from a different directory
keep_path = path
print("keep_path is: ", keep_path)

keep_path is:  /storage/archive/covid_tweets


In [11]:
%%time
# define a text_classifier_learner object
learn_clas = text_classifier_learner(dls, AWD_LSTM, 
                                metrics=accuracy).to_fp16()

CPU times: user 4.47 s, sys: 753 ms, total: 5.22 s
Wall time: 968 ms


# Fine-tune the text classifier
Use the encoder that was saved when the language model was trained to fine-tune the text classifier.

In [12]:
# check the learner's current path before changing it (the dataset directory at this point)
learn_clas.path

Path('/storage/archive/covid_tweets')

In [13]:
%%time
# set the path to the location of the encoder
learn_clas.path = Path('/notebooks/temp')

CPU times: user 34 µs, sys: 2 µs, total: 36 µs
Wall time: 40.5 µs


In [14]:
# load the encoder weights saved by the language-model notebook; the file stem is
# 'ft_' followed by the shared modifier
encoder_name = 'ft_' + modifier
learn_clas = learn_clas.load_encoder(encoder_name)

In [15]:
# the learner still points at the encoder directory at this stage
learn_clas.path

Path('/notebooks/temp')

In [16]:
# restore the learner's path to the dataset directory remembered in keep_path
learn_clas.path = keep_path

In [17]:
# verify the learner's path is back to the dataset directory
learn_clas.path

Path('/storage/archive/covid_tweets')

In [18]:
%%time
# fine tune the model
learn_clas.fit_one_cycle(5, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.492627,1.330546,0.406998,00:34
1,1.486987,1.290257,0.4381,00:34
2,1.459331,1.280752,0.432876,00:34
3,1.43128,1.256328,0.458511,00:34
4,1.421083,1.259352,0.456081,00:34


CPU times: user 2min 12s, sys: 34.8 s, total: 2min 47s
Wall time: 2min 52s


In [21]:
# grab the first training batch and show the input shape, the target shape and the
# number of batches in the training DataLoader
x, y = first(dls.train)
x.shape, y.shape, len(dls.train)

(torch.Size([64, 166]), torch.Size([64]), 514)

In [22]:
# display a sample of tokenized tweets together with their sentiment labels
dls.show_batch()

Unnamed: 0,text,category
0,"xxbos xxrep 5 ? ? ? xxrep 7 ? ? ? xxrep 7 ? xxrep 4 ? xxrep 4 ? xxrep 11 ? ? ? xxrep 6 ? xxrep 4 ? , xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 3 ? ? ? ? ? xxrep 4 ? ? ? xxrep 3 ? , xxrep 4 ? ? ? ? ? xxrep 6 ? xxrep 3 ? xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? \r\r\n▁ xxrep 5 ? xxrep 6 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 4 ? ? ? xxrep 4 ? xxrep 6 ? xxrep 4 ? xxrep 8 ? ? ? xxrep 6 ? ? ? xxrep 5 ? ? ? xxrep 3 ? xxrep 4 ? ? ? xxrep 7 ? xxrep 5 ? - xxrep 8 ? xxrep 5",Neutral
1,"xxbos xxmaj why ca nt items like xxmaj masks , xxmaj hand xxmaj sanitizers be declared xxup xxunk or even free to poor ? xxmaj when we can do d same for xxup movies why not these items \r\r\n▁ xxrep 5 ? xxrep 6 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 4 ? ? ? xxrep 4 ? xxrep 6 ? xxrep 4 ? xxrep 8 ? ? ? xxrep 6 ? ? ? xxrep 5 ? ? ? xxrep 3 ? xxrep 4 ? ? ? xxrep 7 ? xxrep 5 ? - xxrep 8 ? xxrep 5 ? xxrep 4 ? ? ? xxrep 4 ? https : / / t.co / xxunk",Extremely Positive
2,"xxbos # coronacrisis xxmaj the xxmaj americans queueing for guns , xxmaj the xxmaj dutch for xxmaj cannabis & & xxmaj the xxmaj british for # toiletpaper \r\r\n xxmaj what the xxmaj saudis r queueing for ? \r\r\n▁ # xxmaj covid_19 # xxunk # coronavirus # coronaviruspandemic \r\r\n▁ # aramco # xxrep 4 ? _ xxrep 4 ? \r\r\n▁ # xxrep 3 ? _ xxrep 5 ? \r\r\n▁ # xxrep 3 ? _ xxrep 5 ? _ xxrep 4 ? _ xxrep 8 ? \r\r\n▁ # xxrep 4 ? _ ? ? _ xxrep 5 ? _ ? ? _ xxrep 3 ? _ xxrep 7 ? https : / / t.co / xxunk m",Neutral
3,"xxbos # xxup xxunk : xxup xxunk ' xxup back & & xxup forth xxup in xxup my xxup chair , xxup wearin ' xxup my xxup xxunk , xxup wrapped xxup in xxup my xxup blanket , xxup xxunk ' xxup exhausted , xxup xxunk ' xxunk xxup xxunk ' xxup in xxup line xxup at xxup the xxup supermarket , xxup xxunk ' xxup like xxup i m xxup cool xxup wit ' # xxup socialdistancing xxup there … . xxup why i xxup have xxup to xxup wait xxup so xxup long xxup before xxup xxunk https : / / t.co / xxunk",Positive
4,xxbos xxmaj dr xxmaj xxunk xxmaj mirza : xxup mbbs - xxup rmc xxmaj pakistan \r\r\n msc xxmaj public xxmaj health - xxup lsh xxup uk . \r\r\n ex - global xxmaj coordinator xxup who . \r\r\n ex - regional xxmaj adviser xxup who . \r\r\n xxmaj founder & & xxmaj executive xxmaj coordinator - xxmaj the xxmaj network for xxmaj consumer xxmaj protection xxmaj pakistan \r\r\n xxmaj my lord : ba - xxmaj national xxmaj college xxmaj karachi . \r\r\n xxup llb - xxmaj sindh xxmaj muslim xxmaj law college . # xxup covid2019 https : / / t.co / xxunk,Neutral
5,xxbos # xxmaj coronavirus ? ? xxrep 5 ? xxrep 3 ? xxrep 5 ? ? ? xxrep 3 ? xxrep 7 ? ? ? xxrep 5 ? xxrep 4 ? ? ? ? ? xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? ? ? xxrep 7 ? xxrep 4 ? xxrep 3 ? xxrep 12 ? @narendramodi ? ? xxrep 3 ? xxrep 5 ? ? ? xxrep 5 ? xxrep 4 ? ? ? xxrep 4 ? \r\r\n▁ # xxunk # stayhome # xxmaj lockdown2 https : / / t.co / xxunk,Neutral
6,xxbos xxmaj united # xxunk xxmaj workers / # xxup ufcw # xxmaj xxunk # xxmaj union xxmaj agreement w / # xxmaj safeway \r\r\n xxmaj for # xxmaj grocery xxmaj store xxmaj workers \r\r\n▁ # xxmaj coronavirus / # xxup covid19 \r\r\n xxmaj parents xxmaj xxunk xxmaj flex xxmaj schedule \r\r\n xxmaj expand xxmaj paid sickleave \r\r\n xxmaj exposed : xxmaj to 2 wks xxmaj paid xxunk xxmaj before xxmaj use sickleave \r\r\n $ 2 / hr xxmaj raise \r\r\n xxmaj existing xxmaj over xxmaj temps \r\r\n https : / / t.co / xxunk,Extremely Positive
7,xxbos # xxup coronavirus xxup alert : xxmaj we understand how important it is for everyone to keep clean & & disinfected \r\r\n xxmaj let xxmaj us xxmaj help ? xxmaj all xxmaj supplies xxmaj needed f / xxmaj disinfecting & & xxmaj cleaning xxmaj provided \r\r\n xxmaj prices xxmaj starting @ 2 xxmaj hours 2 xxmaj maids $ 75 \r\r\n xxmaj book xxmaj online or xxmaj call xxmaj today @ \r\r\n https : / / t.co / xxunk / ( xxunk - xxunk \r\r\n▁ # lasvegas https : / / t.co / xxunk,Extremely Positive
8,"xxbos xxmaj hey # xxunk , # xxunk : u xxmaj guys xxmaj see xxmaj what 's xxmaj going xxmaj on , # xxunk , xxmaj who xxmaj are xxmaj richer , xxmaj are xxmaj able xxmaj to xxmaj buy # xxmaj coronavirus xxmaj supplies xxmaj even xxmaj at xxmaj higher xxmaj prices xxmaj while u xxmaj guys xxmaj ca n't ! \r\r\n xxmaj maybe xxmaj it 's xxmaj time xxmaj for xxmaj your # xxmaj governors xxmaj to xxmaj force @realdonaldtrump xxmaj to xxmaj enact xxmaj the # defenseproductionact ! \r\r\n▁ # justsaying",Positive


In [23]:
# print the layer-by-layer model summary (output shapes, parameter counts, trainable flags)
learn_clas.summary()

epoch,train_loss,valid_loss,accuracy,time
0,,,00:00,


SequentialRNN (Input shape: ['64 x 166'])
Layer (type)         Output Shape         Param #    Trainable 
LSTM                 ['64 x 22 x 1152',   1,852,416  False     
________________________________________________________________
LSTM                 ['64 x 22 x 1152',   5,317,632  False     
________________________________________________________________
LSTM                 ['64 x 22 x 400', "  1,846,400  False     
________________________________________________________________
RNNDropout           64 x 22 x 400        0          False     
________________________________________________________________
RNNDropout           64 x 22 x 1152       0          False     
________________________________________________________________
RNNDropout           64 x 22 x 1152       0          False     
________________________________________________________________
BatchNorm1d          64 x 1200            2,400      True      
________________________________________________________

# Exercise the text classifier
Apply the fine-tuned text classifier to some text samples.

In [24]:
# classify a clearly negative sample
# NOTE(review): this sample reads like a movie review while the model was trained on
# Covid tweets - consider a tweet-like sample instead
preds = learn_clas.predict("this film shows incredibly bad writing and is a complete disaster")

In [25]:
# show the prediction: (predicted label, index of that class, per-class probabilities)
preds

('Extremely Negative',
 TensorText(0),
 TensorText([0.4676, 0.1553, 0.2021, 0.0393, 0.1358]))

In [26]:
# classify a clearly positive sample
# NOTE(review): this sample reads like a movie review while the model was trained on
# Covid tweets - consider a tweet-like sample instead
preds = learn_clas.predict("this film shows incredible talent and is a complete triumph")

In [27]:
# show the prediction: (predicted label, index of that class, per-class probabilities)
preds

('Extremely Positive',
 TensorText(1),
 TensorText([0.0080, 0.7712, 0.0260, 0.0231, 0.1717]))

In [25]:
# save the classifier model under the temp directory, then put the learner's path back to
# the dataset directory (same set/use/restore pattern as around load_encoder) so the
# learner is not left pointing at the temp directory
# NOTE(review): the file name says "single_epoch" although fit_one_cycle ran 5 epochs, and
# the trailing 'd' looks accidental - left unchanged in case another notebook loads this file
learn_clas.path = Path('/notebooks/temp')
saved_model_path = learn_clas.save('classifier_single_epoch_'+modifier+'d')
learn_clas.path = keep_path
# keep the saved file location as the cell's displayed result
saved_model_path

Path('/notebooks/temp/models/classifier_single_epoch_mar3d.pth')