In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string 
import warnings 
import seaborn as sns 
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split

from fastai import *
from fastai.text import *
from pathlib import Path

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [0]:
# Local mode : give the full path to location of data folder

# PATH = "/app/analyse/"

In [0]:
# Google colab mode
# Link to google drive and download data

from google.colab import drive
drive.mount('/content/gdrive/')
# then give the path where your data is stored (in google drive)
PATH = "/content/gdrive/My Drive/ssh_files/nlp"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [0]:
import torch
print("Cuda available" if torch.cuda.is_available() is True else "CPU")
print("PyTorch version: ", torch.__version__)

Cuda available
PyTorch version:  1.1.0


In [0]:
# Check devices
from pynvml import *
nvmlInit()
try:
    deviceCount = nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        print("Device", i, ":", nvmlDeviceGetName(handle))
except NVMLError as error:
    print(error)

Device 0 : b'Tesla K80'


In [0]:
torch.cuda.set_device(0)

# Data preparation

In [0]:
import pandas as pd
# load data from feather file

data = pd.read_feather(f'{PATH}/data/dataset_processed_balanced')
all_data = pd.read_feather(f'{PATH}/data/dataset_processed')

In [0]:
data.tail()

Unnamed: 0,index,text,clean_text,category
34131,126765,Lady Gaga Has Nothing On These Bizarre Marine ...,lady gaga bizarre marine worm marine zoologist...,ENVIRONMENT
34132,151131,U.S. Wildfire Interactive Shows Rising Tempera...,wildfire interactive show rise temperature sno...,ENVIRONMENT
34133,176713,Clean Energy Investment Slows Amid Uncertainty...,clean energy investment slow amid uncertainty ...,ENVIRONMENT
34134,196290,Brazil Navy: Oil Stain Spotted In Chevron Fiel...,brazil navy oil stain spot chevron field compa...,ENVIRONMENT
34135,152383,Baby Gorilla Twins Have An Adorable Playdate I...,baby gorilla twin adorable playdate rwanda vol...,ENVIRONMENT


In [0]:
def lower(text):
    # function to make text lower case
    return text.lower()

In [0]:
data['text_lower'] = data['text'].apply(lower)
all_data['text_lower'] = all_data['text'].apply(lower)

In [0]:
# Remove data from all_data, so we can use the rest as input for training the language model

all_data = all_data.drop(data.index)

In [0]:
all_data.shape

(166717, 4)

In [0]:
data.shape

(34136, 5)

- Split data to train/test

In [0]:
xtrain, xtest, ytrain, ytest = train_test_split(data['text_lower'], data.category, random_state=42,test_size=0.1)

Prepare language model data training and classifier training data


In [0]:
# This data will be use to train language model 

xtrain_all = xtrain.append(all_data['text_lower'])
ytrain_all = ytrain.append(all_data.category)

data_train_lm = pd.DataFrame([ytrain_all, xtrain_all]).T

In [0]:
# Data that will be used for training LM
data_train_lm.to_csv(f'{PATH}/data/train_ulmfit_lm_data.csv', index=False, header=False)

# Data that will be used for training the classifier
data_train_classif = pd.DataFrame([ytrain, xtrain]).T
data_train_classif.to_csv(f'{PATH}/data/train_ulmfit_clas.csv', index=False, header=False)


In [0]:
data_lm = TextLMDataBunch.from_csv(f'{PATH}/data/', 'train_ulmfit_lm_data.csv', min_freq=1, bs=64)

In [0]:
data_lm.show_batch()

idx,text
0,"xxbos daydream believer xxbos when hospitals become factories in recent years , medical educators have warned that hospitals are growing more impersonal and have urged young doctors to treat patients as people -- not diseases . xxbos the one resolution that saves you money and improves your health decreasing waste is environmentally beneficial and , as a bonus , will likely clean up your food choices and save you some"
1,: raising children . the challenges we face day to day are hard enough without being judged by other moms . and this is what these judging mommy blogs and social posts are all about -- the need to feel superior and show people how perfect you are and how imperfect everyone else is . xxbos former mayor busted for alleged child porn xxbos watch : fearless captain chesley '
2,"callous and cruel the statue of liberty is crying , nancy pelosi said . xxbos the manifesting list it can appear as though you are stuck in a vicious cycle or attracting similar situations , over and over ? check out this list on how you create this dynamic and how to gain more control . xxbos eating your way to happiness we all need to eat , but what"
3,"i cheated and it turned my husband on ' more from the stir : 10 things to replace after your divorce 6 things you can never change about a man 15 things a wife never xxbos icymi : quantifying the dad bod and why we need more black doctors xxbos living with hiv : my journey immediately , my mind began racing with questions : when am i going to"
4,"to eat din tai fung founded in taiwan , this dumpling house serves up the best xiao long bao or soup dumplings you ll ever have xxbos jetstar hong kong : qantas and china eastern collaborate on new budget airline "" jetstar 's vision is to make travel more affordable for millions of people across asia , and the demographics of china with xxbos the delicious meal you can make"


# Step1: Fine-tuning the language model

In [0]:
# Make it fp16 for faster training

learn = language_model_learner(data_lm, AWD_LSTM)
learn = learn.to_fp16(clip=0.1)

In [0]:
list(learn.model.children())

[AWD_LSTM(
   (encoder): Embedding(60000, 400, padding_idx=1)
   (encoder_dp): EmbeddingDropout(
     (emb): Embedding(60000, 400, padding_idx=1)
   )
   (rnns): ModuleList(
     (0): WeightDropout(
       (module): LSTM(400, 1152, batch_first=True)
     )
     (1): WeightDropout(
       (module): LSTM(1152, 1152, batch_first=True)
     )
     (2): WeightDropout(
       (module): LSTM(1152, 400, batch_first=True)
     )
   )
   (input_dp): RNNDropout()
   (hidden_dps): ModuleList(
     (0): RNNDropout()
     (1): RNNDropout()
     (2): RNNDropout()
   )
 ), LinearDecoder(
   (decoder): Linear(in_features=400, out_features=60000, bias=True)
   (output_dp): RNNDropout()
 )]

In [0]:
learn.fit_one_cycle(1, 0.005, moms=(0.8,0.7), wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,5.380234,4.977664,0.227547,11:12


In [0]:
learn.unfreeze()

In [0]:
learn.fit_one_cycle(10, 0.0005, moms=(0.8,0.7), wd=0.1)


epoch,train_loss,valid_loss,accuracy,time
0,5.045015,4.788,0.242797,12:18
1,4.900239,4.651919,0.256351,12:22
2,4.777473,4.557629,0.266065,12:21
3,4.67654,4.488118,0.273632,12:20
4,4.574189,4.443034,0.278936,12:19
5,4.510097,4.408674,0.282158,12:20
6,4.458417,4.384754,0.285037,12:24
7,4.41922,4.367712,0.286962,12:20
8,4.385397,4.359671,0.28774,12:22


In [0]:
# Save the fine-tuned encoder
learn.save_encoder('ft_enc_augmented')

# Step2 - Train Classifier

In [0]:
# classifier
# Prepare data for classifier
bs = 128

data_clas = TextClasDataBunch.from_csv(f'{PATH}/data/', 'train_ulmfit_clas.csv', vocab=data_lm.train_ds.vocab,
                                       min_freq=1, bs=bs)

data_clas.save('train_ulmfit_balanced_TextClas')

In [0]:
data_clas.show_batch()

text,target
"xxbos weekend roundup : one year on , the worldpost has 28 million monthly views the worldpost was launched one year ago in davos . it was born out of a contradiction and a paradox . \n \n the contradiction is that while the world is growing more interdependent , the media is fragmenting -- re - nationalizing , re - localizing and even tribalizing . the resulting paradox",WORLDPOST
"xxbos the curse of the occupation at a time when hundreds of rockets are fired by hamas and islamic jihad from gaza against israel -- threatening population centers , including jerusalem and tel aviv -- criticizing israel 's occupation of the west bank would seem inappropriate at best . many israelis justify the continuing occupation in light of the intensifying violence . they argue that israel can not allow the",WORLDPOST
xxbos why ' getting over it ' is a myth you should ignore i had been widowed just over a year and well into my own healing journey when my mother gave me some very wise advice ( which i both follow and dole out to this day ) . she told me to stop and look back at how far i had progressed since that awful season in time,FIFTY
"xxbos 60 better alternatives to aging gracefully i will accept that "" grace "" sounds like a lovely word . after all , who would n't want to be charming and refined ? the problem is that it is both inaccurate and restrictive . it is inaccurate because older women want so much more from life , as the rest of this article will show . it is restrictive ,",FIFTY
"xxbos digital privacy rights upheld in landmark cell phone case the supreme court unanimously ruled today that police may not search information on an arrested suspect 's cell phone without an additional search warrant . in two cases from both coasts , consolidated into a single opinion the court held that the privacy interests in protecting the tremendous amount of personal information stored on cell phones outweighs the government 's",CRIME


In [0]:
learn = text_classifier_learner(data_clas,AWD_LSTM, drop_mult=0.5, pretrained=False)
learn.load_encoder(f'{PATH}/data/models/ft_enc_augmented')
learn.freeze()

In [0]:
metrics = [accuracy, FBeta('macro')]
learn.metrics = metrics 

In [0]:
lr = 1e-1
learn.fit_one_cycle(1,lr, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,f_beta,time
0,1.917657,1.591064,0.527909,0.521822,00:20


In [0]:
learn.freeze_to(-2)
lr /= 2
learn.fit_one_cycle(1, slice(lr/(2.6**4),lr), moms=(0.8,0.7), wd=0.1)

epoch,train_loss,valid_loss,accuracy,f_beta,time
0,1.760042,1.479428,0.56371,0.561306,00:24


In [0]:
learn.freeze_to(-3)
lr /= 2
learn.fit_one_cycle(1, slice(lr/(2.6**4),lr), moms=(0.8,0.7), wd=0.1)

epoch,train_loss,valid_loss,accuracy,f_beta,time
0,1.585997,1.390656,0.59105,0.588534,00:35


In [0]:
learn.unfreeze()
lr /= 5
learn.fit_one_cycle(3, slice(lr/(2.6**4),lr/2), moms=(0.8,0.7), wd=0.01)

epoch,train_loss,valid_loss,accuracy,f_beta,time
0,1.202446,1.391886,0.597234,0.59506,00:47
1,1.064578,1.388192,0.606509,0.605875,00:47
2,0.820063,1.419523,0.607486,0.606553,00:47


In [0]:
learn.save("trained_ulmfit_augmented")


In [0]:
learn = text_classifier_learner(data_clas,AWD_LSTM, drop_mult=0.5)

In [0]:
learn.load(f'{PATH}/data/models/trained_ulmfit_augmented')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (24576 items)
x: TextList
xxbos why transgender students belong at women 's colleges sometimes the fear of consequences appears so great that we hold on to tradition much longer than we need to . that was surely the case more than 100 years ago when the revolutionary idea of higher education for women was introduced . we are at another such groundbreaking moment now as women 's colleges face the decision of whether and how to admit transgender women .,xxbos internet raises $ 110,000 to help evicted vet get his home back there 's no place like home .,xxbos doctors learn new tricks to avoid overprescribing antibiotics during cold season offering patients home remedies or over - the - counter drug recommendations can help cut back on unnecessary antibiotics for coughs and colds .,xxbos jimmy kimmel accepts ' tough guy ' roy moore 's challenge to a fight the republican senate candidate and former judge invited kimmel down to alabama to m

In [0]:
# get predictions
preds, targets = learn.get_preds()
predictions = np.argmax(preds, axis=1)
pd.crosstab(predictions, targets)

col_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
0,96,3,3,1,0,1,3,7,0,2,1,4,0,0,1,2,0,1,0,4,1,2,6,1,0,1,2,8,0,4,1,1,0,1
1,3,95,2,0,6,0,4,6,0,1,0,1,1,1,0,4,8,4,0,4,4,2,2,0,3,3,1,0,0,2,0,7,0,1
2,1,0,81,3,0,0,1,1,2,2,2,0,3,3,0,9,0,4,7,0,4,0,2,0,0,0,14,0,0,0,4,6,2,2
3,1,2,0,80,1,1,1,11,0,0,2,5,0,2,0,1,2,7,0,5,2,1,0,2,3,4,4,1,0,6,0,3,0,0
4,0,16,2,1,115,1,2,1,0,0,1,5,4,0,0,1,2,1,1,0,7,1,0,0,2,0,0,1,0,19,0,2,2,2
5,0,1,0,0,1,140,1,2,1,8,0,0,1,2,2,1,0,1,5,4,0,2,2,0,0,2,1,0,11,0,2,3,0,0
6,2,9,3,0,2,0,112,0,0,2,0,1,0,2,0,16,7,3,2,5,2,0,0,2,1,0,3,0,0,2,0,1,0,1
7,9,19,0,15,4,0,2,109,1,1,2,1,0,2,0,2,9,3,0,1,0,13,3,1,5,6,2,2,0,3,1,4,0,0
8,0,0,1,8,3,0,0,1,147,0,0,3,25,2,2,3,0,0,7,3,2,0,0,6,0,0,2,2,0,0,1,0,1,2
9,2,0,2,2,0,3,1,3,0,85,2,7,0,13,0,1,0,0,3,5,0,1,2,1,1,6,0,1,1,1,3,8,0,0


In [0]:
from sklearn.metrics import classification_report

pred = predictions.data.numpy()
t = targets.data.numpy()
print(classification_report(t, pred))

              precision    recall  f1-score   support

           0       0.61      0.56      0.58       172
           1       0.58      0.50      0.54       189
           2       0.53      0.46      0.49       177
           3       0.54      0.48      0.51       168
           4       0.61      0.68      0.64       170
           5       0.73      0.80      0.76       175
           6       0.63      0.63      0.63       178
           7       0.50      0.60      0.54       182
           8       0.67      0.78      0.72       189
           9       0.55      0.46      0.50       186
          10       0.80      0.77      0.78       190
          11       0.50      0.58      0.53       179
          12       0.51      0.46      0.49       181
          13       0.42      0.37      0.39       192
          14       0.78      0.81      0.80       163
          15       0.37      0.33      0.34       175
          16       0.72      0.61      0.66       207
          17       0.70    

In [0]:
## Metrics on test dataset

# Add test dataset to the model
learn.data.add_test(list(xtest))

# Do inference 
preds,y = learn.get_preds(ds_type=DatasetType.Test)

In [0]:
pred = preds.argmax(1).data.numpy()

In [0]:
pred = preds.argmax(1).data.numpy()
y_t = [learn.data.c2i[i] for i in list(ytest)]
print(classification_report(y_t, pred))

              precision    recall  f1-score   support

           0       0.54      0.66      0.59        87
           1       0.57      0.58      0.57        93
           2       0.47      0.39      0.42        98
           3       0.60      0.48      0.54        99
           4       0.68      0.59      0.63        91
           5       0.75      0.80      0.78       100
           6       0.60      0.62      0.61       105
           7       0.63      0.55      0.59       112
           8       0.54      0.69      0.61        93
           9       0.45      0.38      0.41        85
          10       0.69      0.79      0.73        89
          11       0.52      0.51      0.52       100
          12       0.53      0.51      0.52       100
          13       0.47      0.47      0.47       109
          14       0.85      0.83      0.84        98
          15       0.33      0.32      0.32        94
          16       0.70      0.63      0.66        95
          17       0.57    

In [0]:
learn.export('exported_model_balanced_augment')

In [0]:
from random import sample
i=sample(list(xtest.index), 1)[0]
sentence = xtest[i]
#sentence = "kill accident new york"
print(sentence)
print('prediction:', learn.predict(sentence)[0])
print('true value:', ytest[i])

our homes, ourselves and creating the perfect stress-free environment turning your home into a place of balance and harmony is not a new idea, but it's certainly a timeless one. give yourself a gift. apply these small changes, and see what happens to your life.
prediction: HEALTHY LIVING
true value: HEALTHY LIVING
