# Training a classifier with a text dataset in fast.ai
Example of training a text classification model with fast.ai by using a curated text dataset 


In [1]:
# imports for notebook boilerplate
!pip install -Uqq fastbook
import fastbook
from fastbook import *
from fastai.text.all import *


In [2]:
# set up the notebook for fast.ai
fastbook.setup_book()

In [3]:
# ingest the curated text dataset ADULT_SAMPLE
path = untar_data(URLs.IMDB_SAMPLE)


In [4]:
path.ls()

(#4) [Path('/storage/data/imdb_sample/texts.csv'),Path('/storage/data/imdb_sample/imdb_sample_model.pkl'),Path('/storage/data/imdb_sample/models'),Path('/storage/data/imdb_sample/imdb_sample.pkl')]

In [5]:
# ingest the train dataset into a Pandas dataframe
df_train = pd.read_csv(path/'texts.csv')
df_train.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [6]:
# get the tokens and token counts for the dataframe
df_tok, count = tokenize_df(df_train,[df_train.columns[1]])

In [7]:
df_tok.head(3)

Unnamed: 0,label,is_valid,text,text_length
0,negative,False,"[xxbos, xxmaj, un, -, bleeping, -, believable, !, xxmaj, meg, xxmaj, ryan, does, n't, even, look, her, usual, pert, lovable, self, in, this, ,, which, normally, makes, me, forgive, her, shallow, ticky, acting, schtick, ., xxmaj, hard, to, believe, she, was, the, producer, on, this, dog, ., xxmaj, plus, xxmaj, kevin, xxmaj, kline, :, what, kind, of, suicide, trip, has, his, career, been, on, ?, xxmaj, whoosh, …, xxmaj, banzai, xxrep, 3, !, xxmaj, finally, this, was, directed, by, the, guy, who, did, xxmaj, big, xxmaj, chill, ?, xxmaj, must, be, a, replay, of, xxmaj, jonestown, -, hollywood,...",108
1,positive,False,"[xxbos, xxmaj, this, is, a, extremely, well, -, made, film, ., xxmaj, the, acting, ,, script, and, camera, -, work, are, all, first, -, rate, ., xxmaj, the, music, is, good, ,, too, ,, though, it, is, mostly, early, in, the, film, ,, when, things, are, still, relatively, cheery, ., xxmaj, there, are, no, really, superstars, in, the, cast, ,, though, several, faces, will, be, familiar, ., xxmaj, the, entire, cast, does, an, excellent, job, with, the, script, ., \n\n, xxmaj, but, it, is, hard, to, watch, ,, because, there, is, no, good, end, to, a, situation, like, the, one, ...]",462
2,negative,False,"[xxbos, xxmaj, every, once, in, a, long, while, a, movie, will, come, along, that, will, be, so, awful, that, i, feel, compelled, to, warn, people, ., xxmaj, if, i, labor, all, my, days, and, i, can, save, but, one, soul, from, watching, this, movie, ,, how, great, will, be, my, joy, ., \n\n, xxmaj, where, to, begin, my, discussion, of, pain, ., xxmaj, for, starters, ,, there, was, a, musical, montage, every, five, minutes, ., xxmaj, there, was, no, character, development, ., xxmaj, every, character, was, a, stereotype, ., xxmaj, we, had, swearing, guy, ,, fat, guy, who, eats, donuts, ...]",220


In [8]:
count.most_common(10)

[('xxmaj', 24930),
 ('the', 14467),
 (',', 11834),
 ('.', 11738),
 ('and', 6949),
 ('a', 6782),
 ('of', 6370),
 ('to', 5847),
 ('is', 4429),
 ('it', 4071)]

In [9]:
# get the count value for a very common word, a moderately common
# word and a rare word
print("very common word (count['the']):", count['the'])
print("moderately common word (count['prepared']):", count['prepared'])
print("rare word (count['gaga']):", count['gaga'])

very common word (count['the']): 14467
moderately common word (count['prepared']): 6
rare word (count['gaga']): 0


# Create and train the model
- create TextDataLoaders object
- define and train model

In [10]:
dls = TextDataLoaders.from_df(df_tok, path=path, 
    vocab = make_vocab(count),text_col = 'text', is_lm=True)

In [11]:
#define the model
learn = language_model_learner(dls,AWD_LSTM,metrics=accuracy)


In [12]:
# fit the model with one epoch, LR = 0.02, and momentum = ( 0.8 , 0.7 , 0.8 )
# learn.fit_one_cycle( 1 , 0.02, moms = ( 0.8 , 0.7 , 0.8 ) )
learn.fit_one_cycle( 1 , 0.02 )

epoch,train_loss,valid_loss,accuracy,time
0,1.639356,1.457713,0.771113,00:59


In [14]:
preds = learn.predict('The star is', n_words=20)

In [15]:
preds

"The star is words ' to ' , sharing that complex with typical of vargas , most of good ' grinch ' ,"

In [23]:
# .str.replace(',',''), '\''
preds2 = preds.replace(', ','').replace('\' ','')

In [24]:
preds2

'What comes next conventional low ,'

In [25]:
dls = TextDataLoaders.from_df(df_tok, path=path, 
    vocab = make_vocab(count),text_col = 'text',label_col='label')

In [27]:
learn = text_classifier_learner( dls , AWD_LSTM , metrics=accuracy )

In [38]:
learn.fit_one_cycle( 5 , 0.02 )

epoch,train_loss,valid_loss,accuracy,time
0,0.755471,0.671588,0.585,00:22
1,0.759537,0.795209,0.53,00:22
2,0.754129,0.733259,0.585,00:22
3,0.738281,0.899029,0.505,00:22
4,0.734202,0.94448,0.485,00:22


In [43]:
#preds = learn.predict("this film shows incredible talent and is a complete triumph")
preds = learn.predict("this film shows incredibly bad writing and is a complete disaster")

In [44]:
preds

('negative', TensorText(0), TensorText([0.5780, 0.4220]))

In [53]:
TEXT = "I liked this movie because"
N_WORDS = 40
N_SENTENCES = 2
preds = [learn.predict(TEXT, N_WORDS, temperature=0.75) 
         for _ in range(N_SENTENCES)]

In [55]:
print("\n".join(preds))

i liked this movie because ' , ' it ' , ' was ' , ' a ' , ' great ' , ' movie ' , ' . ' , ' ' , ' i ' , " ' ve " , '
i liked this movie because ' , ' i ' , ' found ' , ' this ' , ' one ' , ' pretty ' , ' funny ' , ' . ' , ' i ' , ' had ' , ' to


In [35]:
# save the model in /storage/data/wikitext-2/wikitext_tiny_model.pkl
learn.export('imdb_sample_model.pkl')

In [16]:
learn.unfreeze()
learn.fit_one_cycle(5,0.002)

epoch,train_loss,valid_loss,accuracy,time
0,1.27437,1.158648,0.799886,01:13
1,1.185457,1.100074,0.806961,01:14
2,1.112827,1.066841,0.809887,01:14
3,1.071971,1.04788,0.812297,01:14
4,1.038846,1.043908,0.812829,01:14


In [21]:
preds = learn.predict("What comes next", n_words=5)

In [22]:
preds

'What comes next , conventional , low ,'

In [36]:
learn.save("single_epoch")

Path('/storage/data/imdb_sample/models/single_epoch.pth')

In [37]:
learn2 = learn.load('single_epoch')

In [38]:
learn2.unfreeze()
learn2.fit_one_cycle(5, 0.002)

epoch,train_loss,valid_loss,accuracy,time
0,1.300041,1.171857,0.798991,01:17
1,1.179232,1.095705,0.807437,01:19
2,1.117754,1.060938,0.811034,01:18
3,1.076173,1.045368,0.812713,01:18
4,1.051161,1.038763,0.813272,01:18


In [39]:
learn.unfreeze()

In [40]:
learn.fit_one_cycle(5, 0.002)

epoch,train_loss,valid_loss,accuracy,time
0,1.060561,1.051567,0.811446,01:18
1,1.054864,1.043402,0.813158,01:18
2,1.010678,1.034573,0.813989,01:18
3,0.962717,1.02865,0.815437,01:19
4,0.936904,1.03015,0.815121,01:18
