# Training a text classifier model on a standalone dataset with fastai
- This notebook ingests the Kaggle Covid tweets dataset (https://www.kaggle.com/datatattle/covid-19-nlp-text-classification)
- This notebook assumes you have already run the text_standalone_dataset_lm.ipynb notebook to create a language model and save its encoder
- The encoder from the language model is used to create the text classifier

In [13]:
#hide
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [14]:
#hide
from fastbook import *
from fastai.text.all import *

In [15]:
# Run-specific suffix: appended to the encoder file name passed to load_encoder
# and to the classifier checkpoint name passed to save later in this notebook.
modifier = 'standalone_mar20'

# Ingest the dataset
- define the path for the dataset
- read the training CSV into a dataframe (the TextDataLoaders object is created from it in the next section)

In [16]:
%%time
# resolve the local directory of the covid_tweets dataset and list its contents
path = URLs.path('covid_tweets')
path.ls()

CPU times: user 0 ns, sys: 3.81 ms, total: 3.81 ms
Wall time: 3.24 ms


(#2) [Path('/storage/archive/covid_tweets/train'),Path('/storage/archive/covid_tweets/test')]

In [17]:
# Load the training split into a pandas dataframe.
# The explicit ISO-8859-1 encoding is required: without it the read fails with a decode error.
df_train = pd.read_csv(
    path/'train/Corona_NLP_train.csv',
    encoding="ISO-8859-1",
)

# Define the text classifier

In [18]:
%%time
# create TextDataLoaders object
dls = TextDataLoaders.from_df(df_train, path=path, text_col='OriginalTweet',label_col='Sentiment')
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,"xxbos xxrep 5 ? ? ? xxrep 7 ? ? ? xxrep 7 ? xxrep 4 ? xxrep 4 ? xxrep 11 ? ? ? xxrep 6 ? xxrep 4 ? , xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 3 ? ? ? ? ? xxrep 4 ? ? ? xxrep 3 ? , xxrep 4 ? ? ? ? ? xxrep 6 ? xxrep 3 ? xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? \r\r\n▁ xxrep 5 ? xxrep 6 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 4 ? ? ? xxrep 4 ? xxrep 6 ? xxrep 4 ? xxrep 8 ? ? ? xxrep 6 ? ? ? xxrep 5 ? ? ? xxrep 3 ? xxrep 4 ? ? ? xxrep 7 ? xxrep 5 ? - xxrep 8 ? xxrep 5",Neutral
1,"xxbos xxmaj fun xxmaj riding 4 xxmaj xxunk , xxmaj shield xxmaj bash # xxmaj cod # callofduty # xxmaj practice # xxmaj xxunk # xxmaj xxunk # xxmaj recreation # xxmaj fun # xxmaj bored # todo # xxmaj coronavirus # xxmaj quarantine # xxmaj isolation # toiletpaper # xxmaj lockdown # xxmaj art # xxmaj milk # xxmaj water # xxmaj xxunk # xxmaj weather # xxmaj cleveland # xxmaj ohio # xxmaj browns # xxup nfl # xxmaj xxunk # xxmaj poetry \r\r\n https : / / t.co / xxunk via @youtube",Positive
2,"xxbos xxmaj friends ! xxmaj it 's xxmaj march 25 , 2020 at 03:00pm- time to xxup stop xxup renting & & buy a # home from # realtor xxmaj kally ( khoelcher ( at ) gmail ( dot ) com ) of # xxmaj goodyear # xxmaj arizona # coldwellbanker ( 269)240 - 8824 . # xxup n95 masks , # gloves , & & hand # sanitizer provided to xxup prevent # coronavirus . # xxmaj avondale # xxmaj buckeye # â ▁ https : / / t.co / xxunk",Extremely Positive


CPU times: user 52.2 s, sys: 1.35 s, total: 53.5 s
Wall time: 56.9 s


In [19]:
# show the path stored on the dataloaders (passed in as path=path above)
dls.path

Path('/storage/archive/covid_tweets')

In [20]:
# Remember the dataset path so learn_clas.path can be pointed back at it
# after it has been temporarily redirected to load the encoder.
keep_path = path
print("keep_path is: ", keep_path)

keep_path is:  /storage/archive/covid_tweets


In [21]:
%%time
# define a text_classifier_learner object
learn_clas = text_classifier_learner(dls, AWD_LSTM, 
                                metrics=accuracy).to_fp16()

CPU times: user 5.07 s, sys: 834 ms, total: 5.91 s
Wall time: 1.13 s


# Fine-tune the text classifier
Use the encoder that was saved when the language model was trained to fine-tune the text classifier.

In [22]:
# check the learner's current path before redirecting it (expected: the dataset directory, same as dls.path)
learn_clas.path

Path('/storage/archive/covid_tweets')

In [23]:
%%time
# set the path to the location of the encoder
# (presumably the encoder lives under /notebooks/temp/models, like the classifier
#  checkpoint saved at the end of this notebook - confirm against the LM notebook)
learn_clas.path = Path('/notebooks/temp')

CPU times: user 277 µs, sys: 17 µs, total: 294 µs
Wall time: 49.4 µs


In [24]:
# load the encoder that was saved when the language model was trained
# NOTE(review): there is no separator between the prefix and `modifier`, so with
# modifier = 'standalone_mar20' the name is 'ft_standalonestandalone_mar20'.
# Confirm this matches the name used by save_encoder in the language-model notebook.
learn_clas = learn_clas.load_encoder('ft_standalone'+modifier)

In [25]:
# the learner is still pointing at the encoder location at this point
learn_clas.path

Path('/notebooks/temp')

In [26]:
# set the path back to the original path (the dataset directory kept in keep_path)
learn_clas.path = keep_path

In [27]:
# verify the learner path is back at the dataset directory
learn_clas.path

Path('/storage/archive/covid_tweets')

In [28]:
%%time
# fine tune the model
learn_clas.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.461614,1.281477,0.453286,00:37


CPU times: user 29.9 s, sys: 7.19 s, total: 37.1 s
Wall time: 37.4 s


In [29]:
# take the first batch from the training dataloader
x, y = first(dls.train)
# show the input shape, the target shape and the number of batches in the training dataloader
x.shape, y.shape, len(dls.train)

(torch.Size([64, 166]), torch.Size([64]), 514)

In [30]:
# display a sample of tokenized tweets with their sentiment labels
dls.show_batch()

Unnamed: 0,text,category
0,"xxbos xxrep 5 ? ? ? xxrep 7 ? ? ? xxrep 7 ? xxrep 4 ? xxrep 4 ? xxrep 11 ? ? ? xxrep 6 ? xxrep 4 ? , xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 3 ? ? ? ? ? xxrep 4 ? ? ? xxrep 3 ? , xxrep 4 ? ? ? ? ? xxrep 6 ? xxrep 3 ? xxrep 3 ? xxrep 3 ? ? ? xxrep 3 ? \r\r\n▁ xxrep 5 ? xxrep 6 ? ? ? xxrep 3 ? xxrep 4 ? xxrep 4 ? ? ? xxrep 4 ? xxrep 6 ? xxrep 4 ? xxrep 8 ? ? ? xxrep 6 ? ? ? xxrep 5 ? ? ? xxrep 3 ? xxrep 4 ? ? ? xxrep 7 ? xxrep 5 ? - xxrep 8 ? xxrep 5",Neutral
1,xxbos xxmaj dr xxmaj xxunk xxmaj mirza : xxup mbbs - xxup rmc xxmaj pakistan \r\r\n msc xxmaj public xxmaj health - xxup lsh xxup uk . \r\r\n ex - global xxmaj coordinator xxup who . \r\r\n ex - regional xxmaj adviser xxup who . \r\r\n xxmaj founder & & xxmaj executive xxmaj coordinator - xxmaj the xxmaj network for xxmaj consumer xxmaj protection xxmaj pakistan \r\r\n xxmaj my lord : ba - xxmaj national xxmaj college xxmaj karachi . \r\r\n xxup llb - xxmaj sindh xxmaj muslim xxmaj law college . # xxup covid2019 https : / / t.co / xxunk,Neutral
2,xxbos xxmaj so glad i donât : \r\r\n xxmaj live in a big city . \r\r\n xxmaj rely on takeout / dine out . \r\r\n xxmaj rely on delivery . \r\r\n xxmaj rely on transport . \r\r\n\r\r\n xxmaj so glad i xxup do : \r\r\n xxmaj have 6 month stock of food . \r\r\n xxmaj own guns / ammo . \r\r\n xxmaj have most family / friends within blocks . \r\r\n xxmaj live in a small town with woods . \r\r\n xxmaj have a big truck / fam car . \r\r\n▁ # coronavirus https : / / t.co / xxunk,Extremely Positive
3,xxbos xxmaj we have xxup amazing xxup cheap xxup deals ! xxup for xxup the # xxup covid2019 going on to help you xxrep 3 ? \r\r\n▁ # xxmaj trials \r\r\n▁ # xxmaj monthly \r\r\n▁ # xxmaj yearly \r\r\n xxmaj and xxmaj resonable # xxmaj prices / # xxmaj subscriptions \r\r\n xxmaj just xxup dm xxup us ! # bestiptv # iptv # xxmaj service # xxmaj iptv # iptvdeals # xxmaj cheap # iptv # xxmaj football # xxup hd # xxmaj movies # xxmaj adult # xxmaj cinema # hotmovies # xxmaj cheap xxup instant xxup setup !,Extremely Positive
4,xxbos xxmaj gold xxmaj prices xxmaj suffer xxmaj as xxmaj severe xxmaj sell xxmaj off xxmaj hits xxmaj the xxmaj markets \r\r\n\r\r\n xxmaj if ever there was blood in the streets … .. \r\r\n\r\r\n https : / / t.co / xxunk \r\r\n\r\r\n▁ # gold \r\r\n▁ # silver \r\r\n▁ # mining \r\r\n▁ # xxmaj trade \r\r\n▁ # investments \r\r\n▁ # speculator \r\r\n▁ # xxmaj markets \r\r\n▁ # xxmaj powell \r\r\n▁ # profits \r\r\n▁ # money \r\r\n▁ # xxmaj oil \r\r\n▁ # xxmaj putin \r\r\n▁ # xxmaj trump \r\r\n▁ # coronavirus \r\r\n▁ # stocks \r\r\n▁ # xxmaj china \r\r\n▁ # xxmaj italy,Extremely Negative
5,xxbos 1 / 4 \r\r\n\r\r\n xxmaj hereâs xxmaj my xxmaj top 20 xxup covid-19 xxmaj playlist . xxmaj whatâs xxmaj on xxmaj yours ? \r\r\n\r\r\n 1 . âdonât xxmaj stand xxmaj so xxmaj close xxmaj to xxmaj meâ â xxmaj the xxmaj police \r\r\n 2 . xxunk xxmaj xxunk â xxmaj xxunk \r\r\n 3 . xxunk xxmaj outâ â xxmaj alice xxmaj cooper \r\r\n 4 . âi xxmaj donât xxmaj need xxmaj no xxmaj xxunk â xxmaj ray xxmaj charles \r\r\n 5 . xxunk xxmaj in the xxmaj xxunk â xxmaj the xxmaj clash,Negative
6,"xxbos xxmaj men , xxup step xxup the # xxup xxunk xxup up ! ! xxmaj during this # xxmaj coronavirus pandemic , xxmaj do xxup not allow your xxunk / childrenâs mom , to go to the supermarket with the kids ! ! xxup you xxup go ! ! xxmaj whether you live with her or not # xxup xxunk , xxup you # xxunk and xxup you take the risk for # xxup covid19 , not your kids . # xxunk ? ? https : / / t.co / xxunk",Negative
7,"xxbos i got ur # toiletpaper ? right xxup here ! \r\r\n â \r\r\n xxmaj thought it was a # xxup hoax , oh wait no , only 15 positive & & it should be 0 soon , or is it , will just go away by miracle ? xxmaj yeah … xxmaj thatâs xxmaj it ? . \r\r\n â \r\r\n xxmaj had no idea he xxunk even read & & write ? ? ! / # xxunk ? / .@realdonaldtrump # coronavirus # covid19 https : / / t.co / xxunk",Extremely Positive
8,xxbos xxmaj my emails : \r\r\n\r\r\n xxmaj everyone else : xxmaj an xxmaj update about our xxup covid-19 policies … \r\r\n\r\r\n xxunk : xxup no xxup toilet xxup paper - xxup just xxup deals ! ! xxup down xxup for xxup quarantine xxup and xxup chill xxrep 4 ? xxup make xxup sure xxup you xxup look xxup good xxup in xxup the xxup open xxup xxunk xxup today ? .. ! ! 20 % xxup off xxup our xxup prices xxup and xxup the xxup population xxrep 4 !,Positive


In [31]:
# print the per-layer summary: output shapes, parameter counts and which layers are trainable
learn_clas.summary()

epoch,train_loss,valid_loss,accuracy,time
0,,,00:00,


SequentialRNN (Input shape: ['64 x 166'])
Layer (type)         Output Shape         Param #    Trainable 
LSTM                 ['64 x 22 x 1152',   1,852,416  False     
________________________________________________________________
LSTM                 ['64 x 22 x 1152',   5,317,632  False     
________________________________________________________________
LSTM                 ['64 x 22 x 400', "  1,846,400  False     
________________________________________________________________
RNNDropout           64 x 22 x 400        0          False     
________________________________________________________________
RNNDropout           64 x 22 x 1152       0          False     
________________________________________________________________
RNNDropout           64 x 22 x 1152       0          False     
________________________________________________________________
BatchNorm1d          64 x 1200            2,400      True      
________________________________________________________

# Exercise the text classifier
Apply the fine-tuned text classifier to some sample sentences and inspect the predicted sentiment.

In [41]:
# sample 1: a clearly critical statement
preds = learn_clas.predict("the government's approach to the pandemic has been a complete disaster")

In [42]:
# predict returns a tuple: (decoded label, label index, per-class probabilities)
preds

('Negative',
 TensorText(2),
 TensorText([0.3328, 0.0545, 0.3551, 0.1026, 0.1551]))

In [43]:
# sample 2: an optimistic statement
preds = learn_clas.predict("the new vaccines hold the promise of a quick return to economic growth")

In [44]:
# show the predicted label, its index and the class probabilities for sample 2
preds

('Extremely Positive',
 TensorText(1),
 TensorText([0.0565, 0.3758, 0.1528, 0.0699, 0.3450]))

In [45]:
# sample 3: a neutral-sounding statement
preds = learn_clas.predict("this flu is about what we would expect in a normal winter")

In [46]:
# show the predicted label, its index and the class probabilities for sample 3
preds

('Negative',
 TensorText(2),
 TensorText([0.2712, 0.0407, 0.3615, 0.1584, 0.1682]))

In [47]:
# sample 4: a mildly critical statement
preds = learn_clas.predict("the health ministry needs to pay closer attention to the vaccine rollout")

In [48]:
# show the predicted label, its index and the class probabilities for sample 4
preds

('Positive',
 TensorText(4),
 TensorText([0.0927, 0.1448, 0.3081, 0.1216, 0.3327]))

In [40]:
# save the classifier model
# The learner is redirected to /notebooks/temp first, so the checkpoint is written
# under that directory's models/ folder (see the returned path in the output).
# NOTE(review): learn_clas.path is not set back to keep_path afterwards, unlike the
# load_encoder step earlier - confirm nothing later relies on the dataset path.
# NOTE(review): this cell's execution count (40) is lower than that of the prediction
# cells above it (41-48), i.e. it was run out of order; re-run with Restart & Run All.
learn_clas.path = Path('/notebooks/temp')
learn_clas.save('classifier_single_epoch_'+modifier+'d')

Path('/notebooks/temp/models/classifier_single_epoch_standalone_mar20d.pth')