In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string 
import warnings 
import seaborn as sns 
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split

from fastai import *
from fastai.text import *
from pathlib import Path

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [0]:
# Local mode : give the full path to location of data folder

# PATH = "/app/analyse/"

In [0]:
# Google Colab mode
# Mount Google Drive so the notebook can read the data stored there.

from google.colab import drive
drive.mount('/content/gdrive/')
# Root folder (inside Google Drive) holding the project data; later cells build their paths from it.
PATH = "/content/gdrive/My Drive/ssh_files/nlp"

Mounted at /content/gdrive/


Check available GPU devices.

In [0]:
from pynvml import *

# List the GPUs visible to NVML.
# Initialisation is done inside the `try` so that a machine without a working
# NVIDIA driver prints the NVML error instead of aborting the cell with an
# uncaught exception.
try:
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        print("Device", i, ":", nvmlDeviceGetName(handle))
except NVMLError as error:
    print(error)

Device 0 : b'Tesla K80'


In [0]:
# Make GPU 0 the current CUDA device.
# NOTE(review): `torch` is not imported explicitly in the visible cells; presumably it
# comes from the fastai star imports — confirm.
torch.cuda.set_device(0)

# Data Preparation

Prepare the data to be fed to the ULMFiT model. 
The data should have two columns: [Targets, Text]

In [0]:
# Load the pre-processed dataset from the Feather file stored under PATH/data
import pandas as pd

data = pd.read_feather(f'{PATH}/data/dataset_processed')

In [0]:
data.head()

Unnamed: 0,text,clean_text,category
0,There Were 2 Mass Shootings In Texas Last Week...,mass shooting texas week tv leave husband kill...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The 2...,smith join diplo nicky jam world cup official ...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age 5...,hugh grant marry time age actor longtime girlf...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,jim carrey blast castrato adam schiff democrat...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bags...,julianna margulies use donald trump poop bag p...,ENTERTAINMENT


In [0]:
# Helper that normalises the case of a piece of text
def lower(text):
    """Return `text` with every character converted to lower case."""
    lowered = text.lower()
    return lowered

In [0]:
# Add a lower-cased copy of the raw `text` column; the train/test split below uses this column.
data['text_lower'] = data['text'].apply(lower)

Now we split the data into training and testing sets. For info, the training dataset will then be split into training and validation sets (this is done inside the fast.ai library). The test set will not be used until the end of training/fine-tuning.

In [0]:
# Hold out 10% of the data as the test set (fixed random_state so the split is repeatable)

xtrain, xtest, ytrain, ytest = train_test_split(data['text_lower'], data.category, random_state=42,test_size=0.1)

Prepare the data for the fast.ai model. The first column is the target and the second is the text.

In [0]:
# Training frame with the target first and the text second.
# The two Series share the same index, so a column-wise concat gives the
# (label, text) layout directly, without building a 2 x N frame and transposing it.
data_train = pd.concat([ytrain, xtrain], axis=1)

In [0]:
data_train.shape

(180767, 2)

In [0]:
data_train.head()

Unnamed: 0,category,text_lower
188569,WEDDINGS,'why you're not married' author tracy mcmillan...
71323,GOOD NEWS,the huffington post is hiring an associate goo...
27533,ENTERTAINMENT,'chuck' gives liev schreiber a head start in t...
166154,PARENTS,valentine to a gay brother maybe not in my lif...
78114,RELIGION,the most kick- (satan in the) ass christian ro...


We can save this data for now, so it can be re-used later.

In [0]:
# Write the training frame without index or header row, i.e. one "label,text" record per line.
data_train.to_csv(f'{PATH}/data/train_ulmfit_all.csv', index=False, header=False)

Now we load the data for language model fine-tuning. TextLMDataBunch does a lot of preprocessing under the hood (for example, tokenization using spaCy, keeping the 60,000 most common tokens, and a default batch size of 64). Take a look at the documentation: https://docs.fast.ai/text.data.html#TextLMDataBunch

In [0]:
# Build the language-model DataBunch from the CSV saved above (batch size 64).
data_lm = TextLMDataBunch.from_csv(f'{PATH}/data/', 'train_ulmfit_all.csv', min_freq=1, bs=64)

In [0]:
data_lm.show_batch()

idx,text
0,"on a tuesday in mid - october , while sitting in my dermatologists waiting room , i noticed that i had to urinate for what seemed like the thousandth time that morning . and there it was . blood . xxbos meatless monday : the joy , the soy , the ( sub-)culture of tempeh xxbos life lessons from burning man i do n't plan to start burning my possessions"
1,"zings oliver stone for ' fawning ' putin documentary "" does he have your dog in a cage someplace ? "" xxbos carly rae jepsen and bob saget hit the stage to perform the ' full house ' song um , saget does n't even know the words . xxbos emily blunt says it was love at first sight with hubby john krasinski swoon ! xxbos watch a youth team"
2,"inmate 06290 - 177 jared kushner ... and anthony weiner 's new divorce lawyer ! xxbos # trumphistorylecture is revisionist history the way donald likes it "" just ask hannity ! "" xxbos you do n't have to read this blog there are so few things we actually "" have to "" do it 's shocking . realizing we have many more choices than we think we do is absolute"
3,"tested in lymphoma patients who were at high risk of relapse following chemotherapy treatment xxbos seeing happiness in facial expressions , instead of anger , can lessen aggression by "" training "" these volunteers in this way , they started to find happiness in the "" angry "" faces , and they reported feeling xxbos rosa g 's "" king - size candy "" is the halloween anthem we have"
4,"daunting memoir -- i said yes to everything the minute i looked at the new memoir of little actress lyova haskell rosenthal , better known as actress lee grant , i knew i would n't be able to resist i said yes to everything . xxbos rogue planet ' nomads ' may outnumber stars in milky way the researchers used a technique called gravitational microlensing to detect these homeless planets"


#  Step1: Fine-tuning the language model

ULMFiT (https://arxiv.org/abs/1801.06146) suggests using transfer learning for language models: the language model is initialized with the weights of a model pretrained on WikiText-103 (https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).

language_model_learner is used to load the pretrained model AWD_LSTM and initialize the model based on language model data data_lm.

In [0]:
# Create the AWD_LSTM language-model learner on data_lm and switch it to FP16 for faster training

learn = language_model_learner(data_lm, AWD_LSTM).to_fp16(clip=0.1)

In [0]:
# Train the frozen model for one cycle

learn.fit_one_cycle(1, 0.005, moms=(0.8,0.7), wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,5.437561,5.006085,0.226467,09:58


In [0]:
# Unfreeze the model before the longer training run in the next cell
learn.unfreeze()

In [0]:
# Fine-tune the unfrozen language model for 10 epochs, with a learning rate 10x lower than the frozen run
learn.fit_one_cycle(10, 0.0005, moms=(0.8,0.7), wd=0.1)


epoch,train_loss,valid_loss,accuracy,time
0,5.075243,4.818161,0.241129,10:55
1,4.930552,4.686686,0.254111,11:02
2,4.815836,4.600497,0.263292,11:01
3,4.721067,4.539413,0.269337,11:00
4,4.637739,4.500144,0.273217,11:00
5,4.580546,4.473127,0.276243,10:59
6,4.517924,4.451406,0.278589,11:02
7,4.489037,4.437113,0.280073,10:59
8,4.43832,4.431052,0.280881,11:03
9,4.428,4.429801,0.28097,11:02


In [0]:
# Save the fine-tuned encoder so the classifier in Step 2 can load it
learn.save_encoder('ft_enc_all')

# Step2 - Training the classifier

Now we have trained the language model following the steps explained in ULMFit paper and saved the encoder weights. We can train a classifier while re-using the encoder weights.

In [0]:
# Prepare the classifier data from the same CSV, reusing the language model's vocabulary

bs = 128 # batch size
data_clas = TextClasDataBunch.from_csv(f'{PATH}/data/', 'train_ulmfit_all.csv', vocab=data_lm.train_ds.vocab,
                                       min_freq=1, bs=bs)

In [0]:
data_clas.show_batch()


text,target
"xxbos sunday roundup this week the nation watched as the # nevertrump movement folded faster than one of the presumptive nominee 's beachfront developments . as many tried to explain away trump 's reckless , racist extremism , a few put principle over party . the wife of former republican senator bob bennett , who died on may 4 , revealed that her husband spent his dying hours reaching out",POLITICS
"xxbos weekend roundup : how will greece take its hemlock ? ancient greece was not only the birthplace of democracy , but also a deathbed of reason when a jury of 500 citizens condemned socrates to die by hemlock poisoning for his xxunk attitude toward the order of the day . defiant to the end , the philosopher voluntarily drank the poison himself in a suicidal display of dignity .",WORLDPOST
"xxbos the best gifts i can give my children to ensure their success it was under the tree house where my siblings and i would play restaurant , serving up birdseed soup and mud pies . it was in the big red barn that we would play hide - and - seek . it was on the tire swing where we would shout out made - up songs . it",PARENTS
xxbos why ' getting over it ' is a myth you should ignore i had been widowed just over a year and well into my own healing journey when my mother gave me some very wise advice ( which i both follow and dole out to this day ) . she told me to stop and look back at how far i had progressed since that awful season in time,FIFTY
"xxbos sentenced to life in prison , this man now has great sf tech job every day , men and women are released from prisons and jails across the u.s . after taking this same journey . most incarcerated settings do not provide programs that teach relevant job skills for reentry to society . as a result , recidivism -- the rate at which people return to incarceration -- is",IMPACT


In [0]:
# Cache the processed classifier DataBunch to disk so it can be reloaded without re-tokenizing
data_clas.save('data_clas.pkl')

In [0]:
# Reload the cached DataBunch from PATH/data/data_clas.pkl with the same batch size
data_clas = load_data(PATH, 'data/data_clas.pkl', bs=bs)

Define the text classifier and load the encoder weights.

In [0]:
# Define the classifier and load the fine-tuned encoder

learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, pretrained=False)
# The encoder file lives in the `data/models` folder below the project root.
# Build its location from PATH instead of hard-coding the local '/app/analyse'
# root, which does not exist when PATH points at Google Drive.
learn.load_encoder(f'{PATH}/data/models/ft_enc_all')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (144612 items)
x: TextList
xxbos 5 things you should know about schools in 2015 some things to know as students , parents and teachers embark on a new school year .,xxbos from eve of destruction to already gone : conversations with jack tempchin and p.f . sloan , plus a xxunk track jack tempchin : " then the whole music thing happened with bob dylan and the folk era and eventually i started being a blues harmonica player , then i moved in to being a guitar playing songwriter and then full - on head or what they call now a ' hippie . ' ",xxbos isabel celis : father of missing arizona girl barred from contact with sons ( video ) isabel celis ' father barred from seeing his sons see a timeline of events in the isabel celis case according to a statement,xxbos how to cook meat faster -- and take out your frustration have you had one of those days -- or even one of those weeks -- when you just want to hit something ? and to add to your fru

Train in stages (unfreeze more layers at each stage and train for a few cycles). 

In [0]:
# Stage 1: one cycle at the base learning rate.
# `lr` is reused (and reduced in place) by the following stages.
lr = 1e-1
learn.fit_one_cycle(1, lr, moms=(0.8,0.7), wd=0.1)


epoch,train_loss,valid_loss,accuracy,time
0,1.40301,1.223766,0.64441,02:10


In [0]:
# Stage 2: freeze_to(-2), halve the learning rate, train one cycle.
# NOTE: `lr` is updated in place, so re-running this cell halves it again.
learn.freeze_to(-2)
lr /= 2
# Learning rates range from lr/(2.6**4) up to lr
# (presumably spread across layer groups, as in ULMFiT's discriminative fine-tuning — confirm).
learn.fit_one_cycle(1, slice(lr/(2.6**4),lr), moms=(0.8,0.7), wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,1.290475,1.115497,0.680893,02:36


In [0]:
# Stage 3: freeze_to(-3), halve the learning rate again, train one cycle.
# NOTE: `lr` is updated in place, so re-running this cell halves it again.
learn.freeze_to(-3)
lr /= 2
# Learning rates range from lr/(2.6**4) up to lr.
learn.fit_one_cycle(1, slice(lr/(2.6**4),lr), moms=(0.8,0.7), wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,1.14607,1.024985,0.700393,03:40


In [0]:
# Final stage: unfreeze the whole model, divide the learning rate by 5, train three cycles.
# NOTE: `lr` is updated in place, so re-running this cell divides it again.
learn.unfreeze()
lr /= 5
# Here the upper bound is lr/2 and the weight decay is lowered to 0.01.
learn.fit_one_cycle(3, slice(lr/(2.6**4),lr/2), moms=(0.8,0.7), wd=0.01)

epoch,train_loss,valid_loss,accuracy,time
0,1.049214,0.996718,0.705814,04:45
1,0.959705,0.954482,0.717735,05:17
2,0.754667,0.959679,0.721442,04:47


In [0]:
# Save the trained classifier weights under the name "trained_model_all".

learn.save("trained_model_all")

In [0]:
# Re-create the classifier learner; the trained weights are loaded into it in the next cell

learn = text_classifier_learner(data_clas,AWD_LSTM, drop_mult=0.5)

In [0]:
# Load the trained classifier weights from PATH/models
learn.load(f'{PATH}/models/trained_model_all')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (144612 items)
x: TextList
xxbos 5 things you should know about schools in 2015 some things to know as students , parents and teachers embark on a new school year .,xxbos from eve of destruction to already gone : conversations with jack tempchin and p.f . sloan , plus a xxunk track jack tempchin : " then the whole music thing happened with bob dylan and the folk era and eventually i started being a blues harmonica player , then i moved in to being a guitar playing songwriter and then full - on head or what they call now a ' hippie . ' ",xxbos isabel celis : father of missing arizona girl barred from contact with sons ( video ) isabel celis ' father barred from seeing his sons see a timeline of events in the isabel celis case according to a statement,xxbos how to cook meat faster -- and take out your frustration have you had one of those days -- or even one of those weeks -- when you just want to hit something ? and to add to your fru

In [0]:
# Get predictions and targets.
# get_preds() is called without a dataset type — presumably this is the validation set; confirm.
preds, targets = learn.get_preds()
# Predicted class index = position of the highest score in each row
predictions = np.argmax(preds, axis=1)
# Confusion table: rows are predicted classes, columns are true classes
pd.crosstab(predictions, targets)

col_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
0,413,9,6,11,1,2,4,63,0,5,3,4,2,4,5,8,3,4,0,14,7,13,5,4,2,11,2,25,0,8,8,3,0,7
1,6,441,9,7,24,1,10,73,0,0,2,7,0,6,1,16,7,5,1,11,64,9,8,1,8,2,2,2,0,3,1,10,2,1
2,4,6,610,13,6,0,6,10,4,5,2,5,9,28,4,30,1,12,52,10,78,2,2,5,3,3,39,4,0,3,25,15,3,9
3,10,8,7,539,0,0,0,91,2,2,10,9,2,8,4,1,1,15,1,7,38,12,2,2,3,9,12,0,0,26,2,11,1,2
4,0,31,8,0,421,1,2,15,3,1,0,4,6,2,0,1,1,1,2,8,47,4,2,2,1,1,3,2,2,58,1,4,3,13
5,0,2,0,3,1,497,1,7,0,5,1,0,0,6,2,1,0,1,1,12,2,2,0,0,0,3,2,1,24,2,18,12,0,1
6,4,9,7,1,4,0,202,1,0,0,1,0,0,2,2,17,2,3,4,17,31,2,5,0,8,3,4,3,0,1,6,8,1,1
7,67,102,9,117,13,2,5,2359,1,11,8,10,4,20,2,4,22,28,1,35,49,58,10,5,39,75,10,12,2,19,7,37,2,6
8,2,3,3,4,4,0,0,1,122,0,0,5,18,0,1,3,0,0,2,4,4,0,0,7,0,1,1,11,0,0,11,0,0,0
9,1,1,3,1,0,0,1,0,0,69,1,1,0,6,0,0,0,0,3,7,0,1,0,0,0,2,0,3,0,0,3,6,0,0


In [0]:
from sklearn.metrics import classification_report

# Convert the tensors from the previous cell to NumPy arrays for scikit-learn
pred = predictions.data.numpy()
t = targets.data.numpy()
# Per-class precision / recall / F1 (classes are reported by index)
print(classification_report(t, pred))

              precision    recall  f1-score   support

           0       0.63      0.62      0.62       670
           1       0.60      0.52      0.56       848
           2       0.61      0.56      0.58      1096
           3       0.64      0.55      0.59       986
           4       0.65      0.65      0.65       650
           5       0.82      0.81      0.81       617
           6       0.58      0.53      0.55       379
           7       0.75      0.79      0.77      2999
           8       0.59      0.49      0.54       249
           9       0.63      0.27      0.38       254
          10       0.82      0.89      0.85      1476
          11       0.48      0.34      0.40       251
          12       0.51      0.56      0.53       474
          13       0.61      0.41      0.49      1240
          14       0.84      0.81      0.83       759
          15       0.50      0.36      0.42       638
          16       0.66      0.43      0.52       187
          17       0.64    

In [0]:
# Attach the held-out test texts to the learner's data
learn.data.add_test(list(xtest))

# Run inference on the test set.
# NOTE(review): the next cell compares these predictions with `ytest` row by row, which
# assumes get_preds returns them in the same order as `xtest`. Some fastai v1 releases
# expose an `ordered=True` argument on the text learner's get_preds for this —
# confirm the behaviour of the installed version.
preds,y = learn.get_preds(ds_type=DatasetType.Test)

In [0]:
# Predicted class index = position of the highest score in each row
pred = preds.argmax(1).data.numpy()
# Map the true test labels to class indices through the learner's c2i mapping
y_t = [learn.data.c2i[i] for i in list(ytest)]
print(classification_report(y_t, pred))

              precision    recall  f1-score   support

           0       0.67      0.59      0.62       410
           1       0.58      0.50      0.54       451
           2       0.59      0.54      0.57       557
           3       0.62      0.56      0.59       523
           4       0.63      0.69      0.66       327
           5       0.82      0.81      0.81       350
           6       0.60      0.53      0.56       208
           7       0.75      0.81      0.78      1602
           8       0.70      0.57      0.63       138
           9       0.59      0.28      0.38       146
          10       0.82      0.87      0.84       838
          11       0.57      0.35      0.44       131
          12       0.53      0.58      0.55       274
          13       0.59      0.43      0.50       679
          14       0.84      0.79      0.81       415
          15       0.42      0.33      0.37       323
          16       0.57      0.40      0.47        97
          17       0.64    

# Exporting the model

In [0]:
# Export the trained learner under the name 'exported_model_all'
learn.export('exported_model_all')

We can compute top_n accuracy on the testing dataset using the following function

In [0]:
# preds is m by c (m samples, c classes); ts holds the m true class indices
def top_n_accuracy(preds, ts, n):
    """Return the fraction of samples whose true class is among the `n` best scores.

    Args:
        preds: array-like of shape (m, c) with one score per class for each sample.
        ts: array-like of shape (m,) (or (m, 1)) with the true class index of each sample.
        n: number of highest-scoring classes that count as a hit.

    Returns:
        A float in [0, 1]. Returns 0.0 when there are no samples or when `n <= 0`
        (no class can be a hit if zero classes are considered).
    """
    scores = np.asarray(preds)
    truths = np.asarray(ts)
    n_samples = truths.shape[0]
    # Guard the empty case (division by zero) and n <= 0: the slice `[:, -0:]`
    # would otherwise select every column and count every sample as a hit.
    if n_samples == 0 or n <= 0:
        return 0.0

    # argsort is ascending, so the last n columns hold the n best class indices.
    best_n = np.argsort(scores, axis=1)[:, -n:]
    # Compare each row's candidates with that row's true class in one vectorised step.
    hits = (best_n == truths.reshape(n_samples, 1)).any(axis=1)
    return float(hits.sum()) / n_samples

In [0]:
# Top-3 accuracy on the test set
# Convert the prediction tensor to a NumPy array and pass that array (not the tensor) on.
p = preds.data.numpy()
top_n_accuracy(p, np.array(y_t), 3)

0.9044608184805337

In [0]:
from random import sample
# Pick one random index label from the test set and look up its text
i=sample(list(xtest.index), 1)[0]
sentence = xtest[i]
#sentence = "stone man call cop mistake dog bite gunshot wind believe shoot subsequently call police"
print(sentence)
# learn.predict returns several values; the first one is printed as the predicted category
print('prediction:', learn.predict(sentence)[0])
print('true value:', ytest[i])

how to make homemade chicken stock chicken stock basics basic stock is what you'll use most in your cooking. it's basically that pale yellow or golden broth
prediction: FOOD & DRINK
true value: FOOD & DRINK


In [0]:
# Save the testing dataset for inference.
# Column-wise concat gives the (label, text) layout without a transpose, and the
# original row labels are moved into an `index` column in the same statement, so
# re-running the cell always rebuilds the frame from `ytest` / `xtest`.
data_inference = pd.concat([ytest, xtest], axis=1).reset_index()

data_inference.to_feather(f'{PATH}/data/dataset_inference')