#Importing the libraries

In [None]:
!pip3 install ktrain

Collecting ktrain
  Downloading ktrain-0.37.6.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cchardet (from ktrain)
  Downloading cchardet-2.1.7.tar.gz (653 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m653.6/653.6 kB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformer

In [None]:
import numpy as np
import os.path
import tensorflow as tf
import ktrain  #ktrain is the python library that has been used to implement the BERT model
from ktrain import text #the text module will allow us to do the text processing and text wrapping into the BERT model

#Data Preprocessing

##Loading the IMDB dataset

In [None]:
#get_file function has been used to download a file from the specified URL(Stanford AI website here), as the file is not already in the cache
#fname - Name of the file
#origin - URL of the file
#extract = True - makes sure that the file is extracted as an archive, such as tar or zip
dataset = tf.keras.utils.get_file(fname = "aclImdb_v1.tar.gz",
                                  origin = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                                  extract = True)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
#the text module of ktrain library contains texts_from_folder function, which we will use to access the dataset
#texts_from_folder function does not accept the dataset as it is, it accepts the directory folder path leading to that dataset

#thus, getting the directory folder path leading to the IMDB dataset
IMDB_DIRECTORY = os.path.join(os.path.dirname(dataset), 'aclImdb') #joining must be done as the dataset itself is contained in the aclImdb folder

In [None]:
print(os.path.dirname(dataset))
print(IMDB_DIRECTORY)

/root/.keras/datasets
/root/.keras/datasets/aclImdb


##Train-Test Split

####Now, we create the training and testing sets
####text module of ktrain library contains the data.py file, which contains the texts_from_folder function. This function will allow us to make the train-test split.
####In train_test_names, we put the names of the directories that contain the training and testing data, i.e. 'train' and 'test' in this case.

####Since we are implementing a BERT model, we choose BERT tokenization and preprocessing as the preprocessing mode


In [None]:
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(datadir = IMDB_DIRECTORY,
                                                                       classes = ['pos','neg'], #binary classification problem in this case
                                                                       maxlen = 500,
                                                                       train_test_names = ['train','test'],
                                                                       preprocess_mode = 'bert')

detected encoding: utf-8
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


#Building the BERT model

####models.py is contained inside the text module of ktrain library.

####We will use the text_classifier function in models.py to build a text classification model.

In [None]:
model = text.text_classifier(name = 'bert',
                             train_data = (x_train, y_train),
                             preproc = preproc #preproc variable was returned above by the texts_from_folder function of text module
                             )

Is Multi-Label? False
maxlen is 500




done.


#Fine-tuning and Evaluating the BERT model

####____init____.py file in the ktrain library contains get_learner function, which will provide us with the learner instance of the BERT model. This instance will allow us to tune and train our BERT model.

####fit_onecycle function is used to run the final training with learner instance of BERT model

In [None]:
#gettin learner instance of our BERT model
learner = ktrain.get_learner(model = model,
                             train_data = (x_train, y_train),
                             val_data = (x_test, y_test),
                             batch_size = 6 #for a maximum sequence length of 500, we choose a batch_size of 6
                             )

####fit_onecycle function uses a one cycle policy callback

In [None]:
learner.fit_onecycle(lr=2e-5,
                     epochs=1)



begin training using onecycle policy with max lr of 2e-05...


<keras.callbacks.History at 0x7e99e8019060>

####With just a single epoch, a validation accuracy of around 94% achieved !

In [11]:
print(learner.model)

<keras.engine.functional.Functional object at 0x7e99e806f220>


In [12]:
print(preproc)

<ktrain.text.preprocessor.BERTPreprocessor object at 0x7e99ee82b220>


In [13]:
predictor = ktrain.get_predictor(learner.model,preproc)

In [14]:
print(predictor)

<ktrain.text.predictor.TextPredictor object at 0x7e9940620f10>


In [44]:
testing = ['this movie is fantasic',
        'this movie is so so, they could have improved the direction',
        'there are many areas in the film, where it became slow moving',
        'this movie was horrible, the plot was really boring. acting was okay',
        'the film is really sucked. there is not plot and acting was bad',
        'what a beautiful movie. great plot. acting was good. will see it again']

In [46]:
predictor.predict(testing)


['pos', 'neg', 'neg', 'neg', 'neg', 'pos']

In [49]:
# (return_proba = True) - provides prediction probability for each class
predictor.predict(data, return_proba=True)

array([[0.12790589, 0.87209415],
       [0.7573618 , 0.24263817],
       [0.9295372 , 0.07046288],
       [0.9968828 , 0.00311729],
       [0.9971234 , 0.00287661],
       [0.00775604, 0.99224395]], dtype=float32)

In [24]:
test_data = os.path.join(IMDB_DIRECTORY, 'test')
test_data_pos = os.path.join(test_data, 'pos')
print(test_data_pos)

/root/.keras/datasets/aclImdb/test/pos


In [51]:
# (return_proba = True) - provides prediction probability for each class
import random
import os
for filename in random.sample(os.listdir(test_data_pos), 10):
    print(predictor.predict(filename))
    print(predictor.predict(filename, return_proba=True))

pos
[0.24632311 0.7536769 ]
pos
[0.23097813 0.7690219 ]
pos
[0.34494516 0.6550548 ]
pos
[0.3545825 0.6454176]
pos
[0.17682491 0.82317513]
pos
[0.15962026 0.8403797 ]
pos
[0.19982587 0.8001741 ]
pos
[0.26418447 0.7358155 ]
pos
[0.24013752 0.7598625 ]
pos
[0.25876114 0.74123883]
