# **Deep Learning Modeling**

## **ALBERT**

In [1]:
# Importing necessary libraries

!pip install ktrain
!pip install transformers

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ktrain
from ktrain import text
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import timeit

pd.set_option('display.max_columns', None)
warnings.simplefilter(action="ignore")

import seaborn as sns
plt.style.use('seaborn')

Collecting ktrain
  Downloading ktrain-0.39.0.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from ktrain)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting k

In [2]:
# Mount Google Drive at /content/drive so the project folder stored there is
# reachable from this Colab runtime (a later cell changes into that folder).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd "/content/drive/My Drive/Design Project"

/content/drive/My Drive/Design Project


In [4]:
# Load the three preprocessed CSV files (train, test, valid - read in that
# order). Paths are relative to the current working directory.
train_data, test_data, valid_data = map(
    pd.read_csv,
    (
        'preprocessed_train_data.csv',
        'preprocessed_test_data.csv',
        'preprocessed_valid_data.csv',
    ),
)

In [5]:
# train_test_split is also imported in the setup cell; the import is repeated
# here so this cell does not depend on it.
from sklearn.model_selection import train_test_split

# Feature: the preprocessed text column. Target: the propaganda label column.
X = train_data["text_preprocessed"]
y = train_data["propaganda_label"]

# Carve a 20% hold-out out of the training file. The split is stratified on the
# label and uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
    stratify=y,
)

In [6]:
# Recode class labels from {-1, 1} to {0, 1}.
# 0 is mapped to itself so that re-running this cell on already-recoded labels
# is a no-op; with only {-1: 0, 1: 1}, a second run would turn every 0 into NaN
# because Series.map yields NaN for values missing from the mapping.
label_recode = {-1: 0, 0: 0, 1: 1}
y_train = y_train.map(label_recode)
y_test = y_test.map(label_recode)

# Fail loudly if any label fell outside the mapping instead of passing NaN on
# to preprocessing and class-weight computation.
assert y_train.notna().all() and y_test.notna().all(), "unexpected propaganda_label value"

### **Instantiating an ALBERT Instance:**
- Create an ALBERT transformer instance with the model name, the maximum token length, the class label names to be used for each category, and the batch size.

In [7]:
# Display names for the two classes, listed in the order of the recoded
# labels 0 and 1.
class_label_names = ['Non-Propagandistic', 'Propagandistic']

# ktrain Transformer wrapper around the 'albert-base-v1' checkpoint, configured
# with a 512-token maximum length and a batch size of 6.
albert_transformer = text.Transformer(
    'albert-base-v1',
    maxlen=512,
    classes=class_label_names,
    batch_size=6,
)

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

In [8]:
### Data preprocessing
# Run the transformer's preprocessing on both parts of the split: the training
# part through preprocess_train, the 20% hold-out through preprocess_test.

albert_train = albert_transformer.preprocess_train(
    X_train.tolist(),
    y_train.tolist(),
)
albert_val = albert_transformer.preprocess_test(
    X_test.tolist(),
    y_test.tolist(),
)

preprocessing train...
language: en
train sequence lengths:
	mean : 334
	95percentile : 778
	99percentile : 1293


spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 333
	95percentile : 787
	99percentile : 1320


In [9]:
# Build the classifier model from the transformer configured above.
albert_model = albert_transformer.get_classifier()

In [10]:
# Wrap the model and both preprocessed datasets in a ktrain learner.
# The batch size (6) matches the value given to the Transformer instance.
albert_learner_ins = ktrain.get_learner(
    model=albert_model,
    train_data=albert_train,
    val_data=albert_val,
    batch_size=6,
)

In [11]:
# ALBERT model summary: print the layers and parameter counts of the model
# held by the learner.

albert_learner_ins.model.summary()

Model: "tf_albert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  11683584  
                                                                 
 dropout_9 (Dropout)         multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 11685122 (44.58 MB)
Trainable params: 11685122 (44.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
### ALBERT learning-rate choice and class weights
# Learning rate: ALBERT builds on the architecture defined by BERT, hence the
# batch sizes and learning rates established for BERT are used as a guide:
#   Batch sizes    => {16, 32}
#   Learning rates => {1e-5, 2e-5, 3e-5}
# The mean of these learning rates (2e-5) is used for fine-tuning and evaluation.
# NOTE(review): the transformer and learner in this notebook are created with
# batch_size=6 rather than 16 or 32 - presumably a memory constraint at
# maxlen=512; confirm.

from sklearn.utils import class_weight

# 'balanced' class weights computed from the training labels.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

# Map class index (position in the sorted unique labels) to its weight.
# Values are converted to plain Python floats so the printed dictionary does not
# depend on how the installed numpy version displays its scalar types.
class_weights_dict = {index: float(weight) for index, weight in enumerate(class_weights)}

print(class_weights_dict)

{0: 0.5629032888819365, 1: 4.4743549891202985}


In [13]:
### Fine-tuning ALBERT
# Timestamps are taken immediately before and after training, so
# (albert_fine_tuning_stop - albert_fine_tuning_start) is the wall-clock
# fine-tuning time in seconds.
# Training uses the one-cycle policy with a maximum learning rate of 2e-5 for
# 4 epochs, weighting the classes with class_weights_dict.

albert_fine_tuning_start= timeit.default_timer()
albert_learner_ins.fit_onecycle(lr=2e-5, epochs=4, class_weight=class_weights_dict)
albert_fine_tuning_stop = timeit.default_timer()



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [14]:
# Time a full validate() pass over the learner's validation data; the elapsed
# seconds are (albert_validation_stop - albert_validation_start).
# The measured span covers the whole validate() call, which also prints the
# classification report.
albert_validation_start= timeit.default_timer()
albert_learner_ins.validate()
albert_validation_stop= timeit.default_timer()

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      6394
           1       0.71      0.91      0.80       804

    accuracy                           0.95      7198
   macro avg       0.85      0.93      0.88      7198
weighted avg       0.96      0.95      0.95      7198



In [15]:
# Report how long the timed validate() call on the hold-out split took.
# The message previously named an unrelated "Emotion" dataset; this notebook
# evaluates propaganda labels.
print("\nInference time for AlBERT on the propaganda hold-out split: \n", (albert_validation_stop - albert_validation_start), " sec")


Inference time for AlBERT on Emotion dataset: 
 309.138597793999  sec


In [16]:
# Validate again on the learner's validation data, this time labelling the
# report rows with the readable class names. The returned confusion matrix is
# the cell's displayed value.
albert_learner_ins.validate(
    class_names=class_label_names,
)

                    precision    recall  f1-score   support

Non-Propagandistic       0.99      0.95      0.97      6394
    Propagandistic       0.71      0.91      0.80       804

          accuracy                           0.95      7198
         macro avg       0.85      0.93      0.88      7198
      weighted avg       0.96      0.95      0.95      7198



array([[6091,  303],
       [  71,  733]])

In [17]:
# Performance on validation data: prepare the separate validation file.

X_val = valid_data["text_preprocessed"]

# Recode class labels from {-1, 1} to {0, 1}
y_val = valid_data["propaganda_label"].map({-1: 0, 1: 1})

# Note: this rebinds albert_val, which until now referred to the preprocessed
# hold-out split taken from the training file.
albert_val = albert_transformer.preprocess_test(
    X_val.tolist(),
    y_val.tolist(),
)

preprocessing test...
language: en
test sequence lengths:
	mean : 351
	95percentile : 825
	99percentile : 1353


In [18]:
# Evaluate on the preprocessed validation file; the returned confusion matrix
# is the cell's displayed value.
albert_learner_ins.validate(
    val_data=albert_val,
    class_names=class_label_names,
)

                    precision    recall  f1-score   support

Non-Propagandistic       0.98      0.97      0.97      4550
    Propagandistic       0.77      0.85      0.81       575

          accuracy                           0.95      5125
         macro avg       0.88      0.91      0.89      5125
      weighted avg       0.96      0.95      0.96      5125



array([[4403,  147],
       [  85,  490]])

In [19]:
# Performance on test data: prepare the separate test file.

X_test_data = test_data["text_preprocessed"]

# Recode class labels from {-1, 1} to {0, 1}
y_test_data = test_data["propaganda_label"].map({-1: 0, 1: 1})

albert_test_data = albert_transformer.preprocess_test(
    X_test_data.tolist(),
    y_test_data.tolist(),
)

preprocessing test...
language: en
test sequence lengths:
	mean : 356
	95percentile : 847
	99percentile : 1410


In [20]:
# Evaluate on the preprocessed test file; the returned confusion matrix is the
# cell's displayed value.
albert_learner_ins.validate(
    val_data=albert_test_data,
    class_names=class_label_names,
)

                    precision    recall  f1-score   support

Non-Propagandistic       0.98      0.97      0.97      9019
    Propagandistic       0.78      0.84      0.81      1140

          accuracy                           0.96     10159
         macro avg       0.88      0.91      0.89     10159
      weighted avg       0.96      0.96      0.96     10159



array([[8743,  276],
       [ 180,  960]])