This notebook adapts code from the following tutorial:

https://medium.com/technonerds/using-fastais-ulmfit-to-make-a-state-of-the-art-multi-label-text-classifier-bf54e2943e83

GitHub Link: 
https://github.com/aditya10/ULMFiT-fastai-text-classifier/blob/master/ULMFiT_tutorial.ipynb



# Mount Google Drive when running on Google Colab so the /content/gdrive paths used
# further down resolve. The google.colab package is only importable on Colab; anywhere
# else the mount is skipped instead of aborting the notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab: nothing to mount
else:
    drive.mount('/content/gdrive')

In [2]:
import os

from fastai.text import *
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

### Model set-up

In [7]:
# Read the pre-processed NLP dataset from disk; the first CSV column becomes the row index.

filename = "~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/nlp/20200504-193926_joe_biden_nlp.csv"
train_language = pd.read_csv(
    filepath_or_buffer=filename,
    index_col=0,
)

In [22]:
# Build one free-text field per row: the title and the description joined by a single space.
title_col = train_language['title']
description_col = train_language['description']
train_language['text'] = title_col + " " + description_col

In [27]:
# The two source columns are now merged into `text`, so remove them from the frame (in place).
train_language.drop(columns=['title', 'description'], inplace=True)

# NOTE: the statements below appear to be leftovers from the tutorial this notebook was
# adapted from. They are kept for reference only and are intentionally disabled:
#   * `train` is never defined in this notebook, so they raise NameError on a fresh run;
#   * re-creating `train_language` would throw away the frame loaded and prepared above;
#   * they build a `target` column, whereas the rest of the notebook uses `Bias_num`.
#
# train_language = pd.DataFrame()
#
# train_language['target'] = train['AdoptionSpeed']
# train_language['text'] = train['Description']
#
# train_language['target'].replace(to_replace=0, value=1, inplace=True)

In [28]:
# Preview the first five rows of the prepared frame.
train_language.head(n=5)

Unnamed: 0,Bias_num,text
1,0,Elon Musk says Australia’s energy emergency is...
2,0,Real life Catch Me If You Can con artist revea...
3,0,Reporter granted rare access inside secretive ...
4,0,The Father's Day Massacre (2014) - Worst bikie...
5,0,WORLD EXCLUSIVE: Harvey Weinstein and his army...


In [34]:
# Count the missing values in each column.
train_language.isnull().sum()

Bias_num    0
text        0
dtype: int64

In [33]:
# Remove, in place, every row that has a missing value in any column.
train_language.dropna(axis=0, how='any', inplace=True)

In [None]:
# Re-count the missing values per column to confirm none are left.
missing_mask = train_language.isna()
missing_mask.sum()

In [35]:
# Split into training (70%) and validation (30%) frames, stratified on the `Bias_num`
# label; the fixed random_state makes the split repeatable.
df_trn, df_val = train_test_split(
    train_language,
    test_size=0.3,
    random_state=42,
    stratify=train_language['Bias_num'],
)
(df_trn.shape, df_val.shape)

((590, 2), (253, 2))

### Pre-processing language model

In [36]:
# Language model data.
# The text column is named explicitly rather than relying on fastai's positional default,
# so a change in the frame's column order cannot make the wrong column get tokenized.
data_lm = TextLMDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
    text_cols='text',
)

In [37]:
# Classifier model data.
# Reuses the language model's vocabulary so token ids match the fine-tuned encoder.
# The text and label columns are named explicitly rather than relying on fastai's
# positional defaults, so a change in column order cannot silently swap label and text.
data_clas = TextClasDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
    vocab=data_lm.train_ds.vocab,
    text_cols='text',
    label_cols='Bias_num',
    bs=32,
)

In [38]:
# Display a few tokenized examples together with their labels as a sanity check.
data_clas.show_batch(rows=5)

text,target
"xxbos xxup exclusive xxup interview : xxmaj robert xxmaj kennedy xxmaj jr. xxmaj destroys xxmaj big xxmaj xxunk , xxmaj fauci & xxmaj pro - xxmaj vaccine xxmaj movement xxmaj robert f xxmaj kennedy xxmaj jr talks vaccines , xxmaj dr xxmaj fauci , family history and jfk assassination with xxmaj patrick xxmaj bet - xxmaj david xxunk for xxmaj xxunk xxmaj health xxmaj defense https : / / bit.ly",1
"xxbos xxunk xxunk xxmaj king xxmaj hussein of xxmaj jordan : xxmaj xxunk of a dynasty | xxmaj al xxmaj jazeera xxmaj world xxmaj the life and struggles of xxmaj king xxmaj hussein of xxmaj jordan , from the assassination of his grandfather to the rise of the xxunk two - part series is the story of xxmaj jordan 's king from 1952 to 1999 , a major political figure",0
"xxbos xxmaj ayaan xxmaj hirsi xxmaj ali on the xxmaj west , xxmaj dawa , and xxmaj islam xxmaj recorded on xxmaj july 12 , xxunk xxmaj hirsi xxmaj ali joins xxmaj peter xxmaj robinson to discuss her new book , xxmaj the xxmaj challenge of xxmaj dawa : xxmaj political xxmaj islam as xxmaj ideology and xxmaj movement and xxmaj how to xxmaj contain xxmaj it , and her",1
"xxbos xxunk xxunk xxmaj egypt 's xxmaj morsi : xxmaj the xxmaj final xxmaj hours | xxmaj al xxmaj jazeera xxmaj world xxmaj the 2011 xxmaj arab xxmaj spring had seen the end of xxmaj president xxmaj hosni xxmaj mubarak 's 30-year rule and within 18 months , xxmaj mohamed xxmaj morsi had become xxmaj egypt 's first democratically elected president . xxmaj but after one year in office ,",0
"xxbos xxunk xxunk xxmaj armenia : xxmaj common xxmaj pain | xxmaj al xxmaj jazeera xxmaj world xxmaj every year on xxmaj april 24 , xxmaj armenians around the world xxunk the anniversary of what they claim to be genocide . xxmaj but 100 years after the events of 1915 , the use of the word "" "" genocide "" "" to describe what happened to the xxmaj armenians at",0


# Persist both processed DataBunches (classifier first, then language model).
for databunch, target_file in (
    (data_clas, "/content/gdrive/My Drive/language_models/databunch/TextClasDataBunch.pkl"),
    (data_lm, "/content/gdrive/My Drive/language_models/databunch/TextLMDataBunch.pkl"),
):
    databunch.save(target_file)

### Language model

In [39]:
# Build the language-model learner (AWD-LSTM architecture, dropout multiplier 0.3);
# the actual training happens in the cells below.
learn = language_model_learner(data=data_lm, arch=AWD_LSTM, drop_mult=0.3)

In [40]:
# Run the learning-rate finder, plot its curve with the suggested point marked,
# and keep the suggestion that the plot stores on the recorder as `min_grad_lr`.
learn.lr_find()
lr_recorder = learn.recorder
lr_recorder.plot(suggestion=True)
min_grad_lr = lr_recorder.min_grad_lr

epoch,train_loss,valid_loss,accuracy,time
0,4.674466,#na#,02:45,


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


KeyboardInterrupt: 

In [42]:
# Train the language model for 10 epochs with the one-cycle policy.
# A fixed maximum learning rate of 1e-3 is used here (not the `min_grad_lr` suggestion).
learn.fit_one_cycle(cyc_len=10, max_lr=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,4.645583,4.375105,0.311057,03:25


KeyboardInterrupt: 

In [0]:
# Unfreeze every layer group, then fine-tune the whole network for two more epochs
# at the same maximum learning rate.
learn.unfreeze()
learn.fit_one_cycle(cyc_len=2, max_lr=1e-3)

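Our language model is able to predict the next word with about 40% accuracy, which is not bad for a next-word prediction task. (Note: the only completed epoch recorded above reports an accuracy of 31%, so re-check this figure after a full, uninterrupted training run.)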

In [None]:
#save the language model encoder so that we can load it later in our classifier
learn.save_encoder("~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/nlp/ft_enc")

### Language classifier

In [0]:
#set-up classifier
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy])
learn.load_encoder('~/Documents/GitHub/esade_fake_news/4_Politics/python3_script/data/nlp/ft_enc')

In [0]:
# Train the classifier for 10 epochs with the one-cycle policy (library-default learning rate).
learn.fit_one_cycle(cyc_len=10)

# Checkpoint the classifier weights.
classifier_checkpoint = '/content/gdrive/My Drive/language_models/models/language_classifier'
learn.save(classifier_checkpoint)

In [0]:
# Plot the losses recorded during the training run above.
loss_recorder = learn.recorder
loss_recorder.plot_losses()

In [0]:
# Gradual unfreezing: freeze everything except the last two layer groups, then train
# for 4 epochs with a sliced learning rate and a custom momentum range.
# NOTE(review): slice(5e-3, 2e-3) has start > stop; discriminative learning rates are
# usually written low -> high — confirm this order is intentional.
learn.freeze_to(-2)
learn.fit_one_cycle(cyc_len=4, max_lr=slice(5e-3, 2e-3), moms=(0.8, 0.7))

# Export the trained classifier so it can be loaded later for prediction.
learn.export(file="/content/gdrive/My Drive/language_models/nlp_export.pkl")

### Confusion matrix

In [0]:
# Collect predictions, targets and per-item losses from the learner,
# wrap them in an interpretation object, and draw the confusion matrix.
prediction_outputs = learn.get_preds(with_loss=True)
preds, y, losses = prediction_outputs
interp = ClassificationInterpretation(learn, preds, y, losses)
interp.plot_confusion_matrix()