# **DEEP LEARNING Modeling**

### **BERT**

In [1]:
# Importing necessary libraries

!pip install ktrain
!pip install transformers

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ktrain
from ktrain import text
import tensorflow as tf
from sklearn.model_selection import train_test_split
import timeit
import warnings

pd.set_option('display.max_columns', None)
warnings.simplefilter(action="ignore")

import seaborn as sns
plt.style.use('seaborn')

Collecting ktrain
  Downloading ktrain-0.38.0.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cchardet (from ktrain)
  Downloading cchardet-2.1.7.tar.gz (653 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m653.6/653.6 kB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transfo

In [2]:
# Display the TensorFlow version this notebook is running against.
tf.__version__

'2.14.0'

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd "/content/drive/My Drive/Design Project"

/content/drive/My Drive/Design Project


In [5]:
# Load the three pre-processed splits (train, test, validation) from the working directory.
train_data, test_data, valid_data = map(
    pd.read_csv,
    (
        'preprocessed_train_data.csv',
        'preprocessed_test_data.csv',
        'preprocessed_valid_data.csv',
    ),
)

In [6]:
# Feature column and target column taken from the training file.
X, y = train_data["text_preprocessed"], train_data["propaganda_label"]

# Stratified 80/20 split of the training file; the 20% part (X_test / y_test) is the
# hold-out set later handed to the learners as their validation data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
    stratify=y,
)

In [7]:
# Inspect the raw label values before recoding.
y.unique()

array([-1,  1])

In [8]:
# Recode class labels from {-1, 1} to {0, 1}.
# 0 is mapped to itself so that re-running this cell leaves already-recoded labels
# unchanged instead of turning them into NaN (Series.map yields NaN for missing keys).
y_train = y_train.map({-1: 0, 0: 0, 1: 1})
y_test = y_test.map({-1: 0, 0: 0, 1: 1})

In [9]:
# Confirm the training labels are now 0/1.
y_train.unique()

array([0, 1])

**Instantiating a BERT Instance:**
 - Creating a BERT instance with the model name, max token length, the labels to be used for each category and the batch size.

In [10]:
# Display names for the two recoded classes, in label order (0, then 1).
class_label_names = ['Non-Propagandistic', 'Propagandistic']

# ktrain wrapper around the pre-trained 'bert-base-uncased' checkpoint.
bert_transformer = text.Transformer(
    'bert-base-uncased',
    maxlen=512,
    classes=class_label_names,
    batch_size=16,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

## Data pre-processing

In [11]:
# Convert the raw text/label Series into the datasets the BERT learner consumes.
bert_train = bert_transformer.preprocess_train(
    X_train.tolist(),
    y_train.tolist(),
)
# The hold-out split serves as the learner's validation data.
bert_val = bert_transformer.preprocess_test(
    X_test.tolist(),
    y_test.tolist(),
)

preprocessing train...
language: en
train sequence lengths:
	mean : 334
	95percentile : 778
	99percentile : 1293


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 333
	95percentile : 787
	99percentile : 1320


In [12]:
### Wrapping BERT in a ktrain Learner object:
# ktrain is used as the high-level abstraction package, so the model has to be wrapped
# in a ktrain Learner object before any further computation.

bert_model = bert_transformer.get_classifier()

# NOTE(review): batch_size=10 here differs from the batch_size=16 given to
# text.Transformer above - confirm which value is intended.
bert_learner_ins = ktrain.get_learner(
    model=bert_model,
    train_data=bert_train,
    val_data=bert_val,
    batch_size=10,
)

In [13]:
### BERT Model Summary:
# Print the layer-by-layer parameter summary of the model held by the learner.

bert_learner_ins.model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**BERT Optimal Learning Rates:**

As per the BERT paper (Devlin et al., 2019, Appendix A.3), the recommended fine-tuning search space is:

- Batch Sizes => {16, 32}
- Learning Rates => {2e−5, 3e−5, 5e−5}

### **Fine Tuning BERT on the Proppy Dataset:**
- We take the Proppy propaganda dataset and the BERT model created above, set the learning rate and the number of epochs, and start fine-tuning.

In [14]:
from sklearn.utils import class_weight

# 'balanced' weights computed from the recoded training labels, one weight per class.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

# Map positional class index -> weight; this lines up with the labels because they are 0 and 1.
class_weights_dict = {index: weight for index, weight in enumerate(class_weights)}

print(class_weights_dict)

{0: 0.5629032888819365, 1: 4.4743549891202985}


In [15]:
%%time

# Fine-tune BERT with the one-cycle learning-rate policy (max lr 2e-5, 5 epochs),
# passing the class weights computed above to fit_onecycle.
# The timeit timestamps bracket the training call; %%time (which must remain the
# first line of the cell) additionally reports the cell's CPU and wall time.
bert_fine_tuning_start= timeit.default_timer()
bert_learner_ins.fit_onecycle(lr=2e-5, epochs=5, class_weight=class_weights_dict)
bert_fine_tuning_stop = timeit.default_timer()



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 2h 49min 5s, sys: 33min 11s, total: 3h 22min 17s
Wall time: 5h 15min 42s


In [16]:
### Checking BERT performance metrics on the hold-out split:
# validate() is called without val_data, so it evaluates the val_data the learner was
# created with (the 20% split of the training file, not the training data itself).
# The two timestamps bracket the call so the next cell can report the elapsed time.

bert_validation_start= timeit.default_timer()
bert_learner_ins.validate()
bert_validation_stop= timeit.default_timer()

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      6394
           1       0.90      0.81      0.86       804

    accuracy                           0.97      7198
   macro avg       0.94      0.90      0.92      7198
weighted avg       0.97      0.97      0.97      7198



In [17]:
# Elapsed time of the validate() call above, in seconds (timeit.default_timer difference).
print("\nInference time for BERT on Proppy dataset: \n", (bert_validation_stop - bert_validation_start), " sec")


Inference time for BERT on Proppy dataset: 
 309.61988649699924  sec


In [18]:
# Same evaluation with readable class names; the cell's output also shows the confusion matrix.
bert_learner_ins.validate(class_names=class_label_names)

                    precision    recall  f1-score   support

Non-Propagandistic       0.98      0.99      0.98      6394
    Propagandistic       0.90      0.81      0.86       804

          accuracy                           0.97      7198
         macro avg       0.94      0.90      0.92      7198
      weighted avg       0.97      0.97      0.97      7198



array([[6322,   72],
       [ 149,  655]])

In [19]:
# Performance on Validation data

X_val = valid_data["text_preprocessed"]

# Recode class labels to 0 and 1
y_val = valid_data["propaganda_label"].map({-1: 0, 1: 1})

# This rebinds the name `bert_val`; the learner was created with the earlier object,
# so the new dataset has to be passed to validate() explicitly.
bert_val = bert_transformer.preprocess_test(X_val.tolist(), y_val.tolist())


preprocessing test...
language: en
test sequence lengths:
	mean : 351
	95percentile : 825
	99percentile : 1353


In [20]:
# Evaluate the fine-tuned BERT model on the separate validation file.
bert_learner_ins.validate(val_data=bert_val, class_names=class_label_names)

                    precision    recall  f1-score   support

Non-Propagandistic       0.98      0.99      0.98      4550
    Propagandistic       0.90      0.82      0.85       575

          accuracy                           0.97      5125
         macro avg       0.94      0.90      0.92      5125
      weighted avg       0.97      0.97      0.97      5125



array([[4495,   55],
       [ 105,  470]])

In [21]:
# Performance on test data

X_test_data = test_data["text_preprocessed"]

# Recode class labels to 0 and 1
y_test_data = test_data["propaganda_label"].map({-1: 0, 1: 1})

bert_test_data = bert_transformer.preprocess_test(
    X_test_data.tolist(),
    y_test_data.tolist(),
)

preprocessing test...
language: en
test sequence lengths:
	mean : 356
	95percentile : 847
	99percentile : 1410


In [22]:
# Evaluate the fine-tuned BERT model on the test file.
bert_learner_ins.validate(val_data=bert_test_data, class_names=class_label_names)

                    precision    recall  f1-score   support

Non-Propagandistic       0.98      0.99      0.98      9019
    Propagandistic       0.88      0.81      0.84      1140

          accuracy                           0.97     10159
         macro avg       0.93      0.90      0.91     10159
      weighted avg       0.96      0.97      0.97     10159



array([[8893,  126],
       [ 222,  918]])

# **DistilBERT**

**Instantiating a Distil BERT Instance:**

- Creating a Distil BERT instance with the model name, max token length, the labels to be used for each category and the batch size.

In [10]:
# Display names for the two recoded classes, in label order (0, then 1).
class_label_names = ['Non-Propagandistic', 'Propagandistic']

# ktrain wrapper around the pre-trained 'distilbert-base-uncased' checkpoint.
distilbert_transformer = text.Transformer(
    'distilbert-base-uncased',
    maxlen=512,
    classes=class_label_names,
    batch_size=6,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [11]:
### Data Preprocessing:
# Convert the raw text/label Series into the datasets the DistilBERT learner consumes.

distilbert_train = distilbert_transformer.preprocess_train(
    X_train.tolist(),
    y_train.tolist(),
)
# The hold-out split serves as the learner's validation data.
distilbert_val = distilbert_transformer.preprocess_test(
    X_test.tolist(),
    y_test.tolist(),
)

preprocessing train...
language: en
train sequence lengths:
	mean : 334
	95percentile : 778
	99percentile : 1293


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 333
	95percentile : 787
	99percentile : 1320


In [12]:
# Build the DistilBERT classifier and wrap it in a ktrain Learner together with its
# training and validation datasets.
distilbert_model = distilbert_transformer.get_classifier()

distilbert_learner_ins = ktrain.get_learner(
    model=distilbert_model,
    train_data=distilbert_train,
    val_data=distilbert_val,
    batch_size=6,
)

In [13]:
# DistilBERT Model Summary:
# Print the layer-by-layer parameter summary of the model held by the learner.

distilbert_learner_ins.model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 66955010 (255.41 MB)
Trainable params: 66955010 (255.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Fine-tune DistilBERT with the one-cycle learning-rate policy (max lr 2e-5, 6 epochs);
# the timestamps bracket the training call so the elapsed time can be reported below.
# NOTE(review): unlike the BERT run (5 epochs, class_weight=class_weights_dict), no
# class weights are passed here and the epoch count differs - confirm this is
# intentional before comparing the two models' metrics.
distilbert_fine_tuning_start= timeit.default_timer()
distilbert_learner_ins.fit_onecycle(lr=2e-5, epochs=6)
distilbert_fine_tuning_stop = timeit.default_timer()



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [15]:
# Report the fine-tuning duration in minutes (timer difference is in seconds, hence /60).
# The message now names the Proppy dataset, which is what this notebook trains on.
print("\nFine-Tuning time for DistilBERT on Proppy dataset: \n", (distilbert_fine_tuning_stop - distilbert_fine_tuning_start)/60, " min")


Fine-Tuning time for DistilBERT on Emotion dataset: 
 194.53825360855  min


In [16]:
# Evaluate on the val_data the learner was created with (the hold-out split of the
# training file); the timestamps bracket the call so its duration can be reported.
distilbert_validation_start= timeit.default_timer()
distilbert_learner_ins.validate()
distilbert_validation_stop= timeit.default_timer()

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      6394
           1       0.92      0.79      0.85       804

    accuracy                           0.97      7198
   macro avg       0.94      0.89      0.91      7198
weighted avg       0.97      0.97      0.97      7198



In [17]:
# Elapsed time of the validate() call above, in seconds.
# The message now names the Proppy dataset, which is what this notebook evaluates on.
print("\nInference time for DistilBERT on Proppy dataset: \n", (distilbert_validation_stop - distilbert_validation_start), " sec")


Inference time for DistilBERT on Emotion dataset: 
 204.07491658599974  sec


In [18]:
# Same evaluation with readable class names; the cell's output also shows the confusion matrix.
distilbert_learner_ins.validate(class_names=class_label_names)

                    precision    recall  f1-score   support

Non-Propagandistic       0.97      0.99      0.98      6394
    Propagandistic       0.92      0.79      0.85       804

          accuracy                           0.97      7198
         macro avg       0.94      0.89      0.91      7198
      weighted avg       0.97      0.97      0.97      7198



array([[6336,   58],
       [ 171,  633]])

In [19]:
# Performance on Validation data

X_val = valid_data.text_preprocessed
y_val = valid_data.propaganda_label

# Recode class labels to 0 and 1
y_val = y_val.map({-1: 0, 1: 1})

distilbert_val = distilbert_transformer.preprocess_test(X_val.to_list(), y_val.to_list())

preprocessing test...
language: en
test sequence lengths:
	mean : 351
	95percentile : 825
	99percentile : 1353


In [20]:
# Evaluate the fine-tuned DistilBERT model on the separate validation file.
distilbert_learner_ins.validate(val_data = distilbert_val, class_names=class_label_names)

                    precision    recall  f1-score   support

Non-Propagandistic       0.98      0.98      0.98      4550
    Propagandistic       0.82      0.85      0.83       575

          accuracy                           0.96      5125
         macro avg       0.90      0.91      0.91      5125
      weighted avg       0.96      0.96      0.96      5125



array([[4442,  108],
       [  88,  487]])

In [21]:
# Performance on test data

X_test_data = test_data.text_preprocessed
y_test_data = test_data.propaganda_label

# Recode class labels to 0 and 1
y_test_data = y_test_data.map({-1: 0, 1: 1})

distilbert_test_data = distilbert_transformer.preprocess_test(X_test_data.to_list(), y_test_data.to_list())

preprocessing test...
language: en
test sequence lengths:
	mean : 356
	95percentile : 847
	99percentile : 1410


In [22]:
# Evaluate the fine-tuned DistilBERT model on the test file.
distilbert_learner_ins.validate(val_data=distilbert_test_data, class_names=class_label_names)

                    precision    recall  f1-score   support

Non-Propagandistic       0.98      0.98      0.98      9019
    Propagandistic       0.83      0.83      0.83      1140

          accuracy                           0.96     10159
         macro avg       0.90      0.90      0.90     10159
      weighted avg       0.96      0.96      0.96     10159



array([[8821,  198],
       [ 197,  943]])

## **Result**