<a href="https://colab.research.google.com/github/Tomawock/NLP_Attack/blob/main/models/documented_model/GMB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Hugging Face Transformers into the runtime. The version is not pinned;
# the run recorded further down in this notebook reports transformers 4.3.2.
!pip install transformers

In [3]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import csv

import matplotlib.pyplot as plt
%matplotlib inline

from prettytable import PrettyTable

import os
import re
import json
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
import datetime

from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback


# Colab magic requesting the TensorFlow 2.x runtime.
# NOTE(review): tensorflow has already been imported earlier in this cell, so this
# magic presumably has no effect here — confirm, and consider moving it to the top.
%tensorflow_version 2.x
import tensorflow as tf
import os
# Report the library versions used for this run.
print(tf.__version__)

transformers.__version__

2.4.1


'4.3.2'

In [23]:
# Get the sentences
def process_csv(data_path):
    """Load the token-level TSV and group it into per-sentence word/tag lists.

    Args:
        data_path: Path to a tab-separated file (decoded as latin-1, first
            column used as the index) containing the columns "Sentence #",
            "Word" and "Tag".

    Returns:
        A tuple ``(sentences, tag, enc_tag)``: ``sentences`` and ``tag`` are
        arrays holding one list per sentence (the words and the integer-encoded
        tags respectively) and ``enc_tag`` is the fitted ``LabelEncoder`` that
        maps tag strings to those integers.
    """
    # Read from the argument; the previous version ignored it and silently
    # used the notebook-level `data` global instead.
    df = pd.read_csv(data_path, sep='\t', encoding="latin-1", index_col=[0])
    # Rows with a missing sentence id inherit the id of the closest previous row.
    df.loc[:, "Sentence #"] = df["Sentence #"].ffill()
    enc_tag = preprocessing.LabelEncoder()
    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])
    # Both outputs come from the same grouping, so they stay aligned index by index.
    by_sentence = df.groupby("Sentence #")
    sentences = by_sentence["Word"].apply(list).values
    tag = by_sentence["Tag"].apply(list).values
    return sentences, tag, enc_tag

# Prepare the inputs for feeding into model
def create_inputs_targets(data):
    """Tokenize the dataset and build fixed-length model inputs and tag targets.

    Each word is split into sub-token ids with the notebook-level ``tokenizer``
    and its tag is repeated once per produced sub-token. A sequence is cut to
    ``max_len - 2`` tokens, wrapped with ids 101/102 (both tagged 16) and then
    padded up to ``max_len`` with id 0 (tagged 17).

    Args:
        data: Path forwarded to ``process_csv``.

    Returns:
        ``(x, y, tag_encoder)`` where ``x`` is the list
        ``[input_ids, token_type_ids, attention_mask]`` of arrays, ``y`` is the
        array of tag ids and ``tag_encoder`` is the encoder returned by
        ``process_csv``.
    """
    sentences, tags, tag_encoder = process_csv(data)
    print(sentences[0])

    features = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "tags": []
    }

    for words, word_tags in zip(sentences, tags):
        token_ids = []
        token_tags = []
        # Words and tags of one sentence come from the same rows, so they pair up 1:1.
        for word, word_tag in zip(words, word_tags):
            encoding = tokenizer.encode(word, add_special_tokens=False)
            token_ids += encoding.ids
            # Every sub-token of a word carries that word's tag.
            token_tags += [word_tag] * len(encoding)

        # Truncate to leave room for the two special tokens, then add [CLS] and [SEP].
        token_ids = [101, *token_ids[:max_len - 2], 102]
        token_tags = [16, *token_tags[:max_len - 2], 16]

        real_len = len(token_ids)
        pad_len = max_len - real_len

        # Bring every sequence to the same length by padding on the right.
        token_ids = token_ids + [0] * pad_len
        # token_type_ids is all zeros: the task only ever has a single sentence.
        segment_ids = [0] * real_len + [0] * pad_len
        # The attention mask is 1 on real tokens and 0 on padding.
        mask = [1] * real_len + [0] * pad_len
        token_tags = token_tags + [17] * pad_len

        features["input_ids"].append(token_ids)
        features["token_type_ids"].append(segment_ids)
        features["attention_mask"].append(mask)
        features["tags"].append(token_tags)
        assert len(token_tags) == max_len, f'{len(token_ids)}, {len(token_tags)}'

    arrays = {name: np.array(rows) for name, rows in features.items()}

    # Inputs in the order the model expects them.
    x = [arrays["input_ids"], arrays["token_type_ids"], arrays["attention_mask"]]
    return x, arrays["tags"], tag_encoder

In [6]:
# Path of the GMB dataset TSV (located under the /content/drive mount).
data = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_dataset.tsv"
# Fixed sequence length, special tokens included, that every example is
# truncated or padded to; also used as the model's input shape.
max_len = 384
# Default BERT configuration object.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
configuration = BertConfig()

In [None]:
#Reference - https://keras.io/examples/nlp/text_extraction_with_bert/
# Fetch the slow pretrained tokenizer and save its files locally: the fast
# WordPiece tokenizer below is built from the saved vocabulary file.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# exist_ok replaces the separate existence check (and its check-then-create race).
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the saved vocabulary; deriving the file location
# from save_path keeps the two paths from drifting apart.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)

In [8]:
# Cross-entropy with NO reduction, so one loss value per target position is
# kept. With the previous Reduction.SUM the loss was collapsed to a single
# scalar (padding included) before the mask was applied, so the mask only
# rescaled the total instead of removing the padded positions.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)

# Ignoring loss that is calculated due to padded targets
def masked_ce_loss(real, pred):
    """Sparse categorical cross-entropy that ignores padded target positions.

    Args:
        real: Integer tag ids; positions equal to 17 are treated as padding.
        pred: Predicted class probabilities (the loss is configured with
            ``from_logits=False``).

    Returns:
        Scalar tensor: the mean of the per-position losses after the padded
        positions have been zeroed out.
    """
    # True where the target is a real tag, False where it is the padding id.
    mask = tf.math.logical_not(tf.math.equal(real, 17))
    loss_ = loss_object(real, pred)

    # Zero the contribution of every padded position.
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

def create_model(num_tags):
    """Build and compile the BERT-based token-classification (NER) model.

    Args:
        num_tags: Number of distinct tag classes. The output layer has
            ``num_tags + 1`` units (the extra unit is presumably for the
            padding label used in the targets — confirm against the data
            preparation code).

    Returns:
        A compiled ``keras.Model`` that takes
        ``[input_ids, token_type_ids, attention_mask]`` (each of length
        ``max_len``) and outputs softmax scores per token. It is compiled with
        Adam (learning rate 3e-5), ``masked_ce_loss`` and an accuracy metric.
    """
    # BERT Base model
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    # NER Model

    # Instantiate Keras tensors
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # TFBertModel: element [0] of the encoder output is used as the token embeddings
    embedding = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
    # Add drop out layer
    embedding = layers.Dropout(0.5)(embedding)
    # Softmax classification layer: the outputs are probabilities, not logits,
    # which matches the from_logits=False setting of the loss.
    tag_probs = layers.Dense(num_tags+1, activation='softmax')(embedding)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[tag_probs],
    )
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=masked_ce_loss, metrics=['accuracy'])
    return model

In [11]:
# Count the distinct tags in the dataset; this sizes the classification layer.
parz = pd.read_csv(data,sep='\t', encoding="latin-1", index_col=[0])

num_tags = parz["Tag"].nunique()

# Probe for a TPU. Any resolver failure is treated as "no TPU available", but
# unlike a bare `except:` this still lets KeyboardInterrupt/SystemExit propagate.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    use_tpu = True
except Exception:
    use_tpu = False

if use_tpu:
    # Create distribution strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create the model inside the strategy scope
    with strategy.scope():
        model = create_model(num_tags)
else:
    model = create_model(num_tags)
    
model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_1 (TFBertModel)   TFBaseModelOutputWit 109482240   input_4[0][0]                    
                                                                 input_6[0][0]              

In [24]:
# Tokenize the whole dataset into padded model inputs (x_train) and tag targets
# (y_train), keeping the fitted tag encoder; the call also prints the first sentence.
x_train, y_train, tag_encoder = create_inputs_targets(data)

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']


In [None]:
# Inspect the padded input_ids of the first training example
# (x_train[0] is the input_ids array).
x_train[0][0]

In [16]:
# x_train / y_train / tag_encoder come from the earlier
# create_inputs_targets(data) cell; re-tokenizing the whole dataset here only
# repeated that work.

# Use a larger batch when running on a TPU.
bs = 64 if use_tpu else 16

# Fine-tune for one epoch, holding out 10% of the examples for validation.
model.fit(
    x_train,
    y_train,
    epochs=1,
    verbose=1,
    batch_size=bs,
    validation_split=0.1
)



<tensorflow.python.keras.callbacks.History at 0x7f8fb7c57710>

In [135]:
def create_inputs_targets_from_text(data, verbose=True):
    """Turn raw sentences into padded BERT inputs for inference.

    Args:
        data: Iterable of sentence strings; each one is split on single spaces.
        verbose: When True (the default, matching the previous behaviour) each
            sentence and its word list are printed.

    Returns:
        ``(x, dataset_dict, n_tokens)`` where ``x`` is the list
        ``[input_ids, token_type_ids, attention_mask]`` of arrays (one row of
        length ``max_len`` per sentence), ``dataset_dict`` holds the same
        arrays by name plus an empty ``"tags"`` array, and ``n_tokens`` gives,
        per sentence, the token count including the two special tokens and
        excluding padding.
    """
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "tags": []  # never filled here; kept so the returned dict keeps its keys
    }
    n_tokens = []
    for sen in data:
        if verbose:
            print(sen)
        sentence = sen.split(" ")
        if verbose:
            print(sentence)

        input_ids = []
        for word in sentence:
            input_ids.extend(tokenizer.encode(word, add_special_tokens=False).ids)

        # Truncate, leaving room for the two special tokens
        input_ids = input_ids[:max_len - 2]

        # Add [CLS] and [SEP]
        input_ids = [101] + input_ids + [102]
        # Remember the unpadded length so callers can drop predictions on padding
        n_tokens.append(len(input_ids))
        # token_type_ids does not matter as the task has only one sentence
        token_type_ids = [0] * len(input_ids)
        # Attention mask: 1 for real tokens
        attention_mask = [1] * len(input_ids)

        # Pad every sequence on the right up to max_len
        padding_len = max_len - len(input_ids)
        input_ids = input_ids + ([0] * padding_len)
        attention_mask = attention_mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)

        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["token_type_ids"].append(token_type_ids)
        dataset_dict["attention_mask"].append(attention_mask)

    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    # Inputs in the order the model expects them
    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    return x, dataset_dict, n_tokens

In [171]:
# test input sentence
test_inputs = ["John Kennedy was born in New York in august","Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country ."]
# Build the padded model inputs for the test sentences
# (y_test receives the helper's dataset dict; it is not used below).
x_test, y_test, n_tokens = create_inputs_targets_from_text(test_inputs)

pred_test = model.predict(x_test)
# Dictionary mapping each numeric class id back to its tag string
le_dict = dict(zip(tag_encoder.transform(tag_encoder.classes_), tag_encoder.classes_))
# Most likely class per token, computed once for the whole batch rather than
# once per sentence inside the loop
pred_class_ids = np.argmax(pred_test, 2)
result = []
for i in range(len(test_inputs)):
    # Keep only the non-padded positions of sentence i
    pred_tags = pred_class_ids[i][:n_tokens[i]]
    # Ids missing from the encoder (the extra output class) are shown as '[pad]'
    pred_tags = [le_dict.get(class_id, '[pad]') for class_id in pred_tags]
    result.append(pred_tags)

John Kennedy was born in New York in august
['John', 'Kennedy', 'was', 'born', 'in', 'New', 'York', 'in', 'august']
Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']


In [172]:
# Show the predicted tag sequence of each of the two test sentences
# (the [CLS]/[SEP] positions are included in each sequence).
print(result[0])
print(result[1])

['O', 'I-per', 'I-per', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'B-tim', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O']
