<a href="https://colab.research.google.com/github/Tomawock/NLP_Attack/blob/main/models/studio_2/GMB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Parte nuova testing


In [10]:
!pip install h5py
!pip install transformers



In [11]:
# Mount Google Drive so that the dataset and weight paths under /content/drive
# used by the following cells can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import os
import tensorflow as tf
import pandas as pd
import transformers
from transformers import BertTokenizer, TFBertModel, BertConfig
from tensorflow.keras import layers
from tensorflow import keras
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
import numpy as np

In [13]:
# Tab-separated GMB dataset; read further down to count the distinct tags.
data_original = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_dataset.tsv"
# Fixed sequence length: the model inputs are declared with this length and
# every example is truncated/padded to it during preprocessing.
max_len = 384

In [14]:
# Label id that marks padded positions. It must stay equal to the padding label
# that create_inputs_targets appends to every tag sequence.
PAD_TAG_ID = 17

# Reduction.NONE keeps one loss value per token, so padded positions can be
# zeroed out *before* anything is summed. (With Reduction.SUM the loss is
# already a scalar when the mask is applied and padding is never excluded.)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)


def masked_ce_loss(real, pred):
    """Sparse categorical cross-entropy averaged over non-padded tokens only.

    Args:
        real: Target tag ids; positions equal to ``PAD_TAG_ID`` are ignored.
        pred: Per-token class probabilities (the model applies softmax itself,
            hence ``from_logits=False`` on ``loss_object``).

    Returns:
        Scalar tensor: sum of the per-token losses at real positions divided by
        the number of real positions (0 when the batch holds only padding).
    """
    keep = tf.math.not_equal(real, PAD_TAG_ID)
    loss_ = loss_object(real, pred)

    keep = tf.cast(keep, dtype=loss_.dtype)
    loss_ *= keep

    # divide_no_nan guards against a batch made exclusively of padding
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(keep))


def masked_accuracy(real, pred):
    """Fraction of non-padded tokens whose arg-max prediction equals the target.

    Plain ``'accuracy'`` would also score the padded positions, which make up
    most of every ``max_len``-long sequence; this metric applies the same
    ``PAD_TAG_ID`` mask as ``masked_ce_loss``. The value is computed per batch,
    so Keras reports the mean of the per-batch ratios.
    """
    # Keras may hand the targets over as floats; compare as integers
    labels = tf.cast(real, tf.int64)
    keep = tf.math.not_equal(labels, PAD_TAG_ID)
    predicted = tf.argmax(pred, axis=-1)
    hits = tf.math.logical_and(tf.math.equal(labels, predicted), keep)

    hits = tf.cast(hits, tf.float32)
    keep = tf.cast(keep, tf.float32)
    return tf.math.divide_no_nan(tf.reduce_sum(hits), tf.reduce_sum(keep))


def create_model(num_tags):
    """Build and compile the BERT token-classification model.

    Args:
        num_tags: Number of distinct tags in the data. The output layer has
            ``num_tags + 1`` units, i.e. one extra class on top of the tags.

    Returns:
        A compiled ``keras.Model`` taking ``[input_ids, token_type_ids,
        attention_mask]`` (each of length ``max_len``) and returning per-token
        class probabilities.
    """
    # BERT Base model
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    # NER Model

    # Instantiate Keras tensors
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # TFBertModel: element 0 of its output is the per-token hidden state sequence
    embedding = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
    # Add drop out layer
    embedding = layers.Dropout(0.5)(embedding)
    # Softmax layer for classifying: the outputs are probabilities, not logits
    tag_probs = layers.Dense(num_tags + 1, activation='softmax')(embedding)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[tag_probs],
    )
    # `lr` is a deprecated alias; `learning_rate` is the supported argument name
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=masked_ce_loss, metrics=[masked_accuracy])
    return model

In [15]:
# Reference - https://keras.io/examples/nlp/text_extraction_with_bert/
# Save the slow pretrained tokenizer so that its vocabulary file exists on disk
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
# exist_ok=True makes the cell safely re-runnable without a separate existence check
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from the vocabulary file that was just saved
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)

In [None]:
# Read the full tab-separated GMB dataset; it is used here only to count how
# many distinct tags exist.
parz = pd.read_csv(data_original,sep='\t', encoding="latin-1", index_col=[0])

num_tags = parz["Tag"].nunique()

# NOTE(review): the preprocessing hard-codes tag id 16 for [CLS]/[SEP] and 17
# for padding, which presumably assumes num_tags == 17 — TODO confirm.
new_model = create_model(num_tags)
    
new_model.summary()

### Train the model


In [16]:
def _group_sentences(df):
    """Encode the ``Tag`` column of ``df`` and group words and tags per sentence.

    ``df`` must provide the columns ``"Sentence #"``, ``"Word"`` and ``"Tag"``;
    it is modified in place.

    Returns:
        ``(sentences, tag, enc_tag)``: an object array holding one list of words
        per sentence, a parallel array holding one list of integer tag ids per
        sentence, and the fitted ``LabelEncoder``.
    """
    sentence_ids = df["Sentence #"]
    # Blank cells arrive as "" when NA parsing is disabled and as NaN otherwise.
    # Normalise them to NaN so the forward fill carries the last seen sentence id
    # down to the rows that belong to the same sentence.
    df["Sentence #"] = sentence_ids.mask(sentence_ids == "").ffill()

    enc_tag = preprocessing.LabelEncoder()
    df["Tag"] = enc_tag.fit_transform(df["Tag"])

    by_sentence = df.groupby("Sentence #")
    sentences = by_sentence["Word"].apply(list).values
    tag = by_sentence["Tag"].apply(list).values
    return sentences, tag, enc_tag


# Get the sentences
def process_csv(data_path):
    """Load a comma-separated tagged corpus and group it by sentence.

    NA parsing is switched off so that words such as "NA" or "null" stay
    strings. See ``_group_sentences`` for the return value.
    """
    df = pd.read_csv(data_path, encoding="latin-1", na_filter=False)
    return _group_sentences(df)


# Get the sentences
def process_tsv(data_path):
    """Load a tab-separated tagged corpus (first column is the index) and group it by sentence.

    Only empty cells are treated as missing, so words such as "NA" or "null"
    are kept as strings instead of being parsed as NaN. See
    ``_group_sentences`` for the return value.
    """
    df = pd.read_csv(
        data_path,
        sep='\t',
        encoding="latin-1",
        index_col=[0],
        keep_default_na=False,
        na_values=[""],
    )
    return _group_sentences(df)

# Turn a tagged-sentence file into fixed-length arrays the model can consume
def create_inputs_targets(data):
    """Tokenize every sentence of ``data`` and pad it to ``max_len``.

    Args:
        data: Path of the corpus file. A ``.tsv`` extension selects
            ``process_tsv``; any other extension is read with ``process_csv``.

    Returns:
        ``(x, y, tag_encoder)`` where ``x`` is the list
        ``[input_ids, token_type_ids, attention_mask]`` of NumPy arrays, ``y``
        is the array of per-token tag ids and ``tag_encoder`` is the label
        encoder returned by the reader.

    NOTE(review): the reader fits a new label encoder on each file, so tag ids
    presumably only line up between files that contain the same set of tags —
    verify for the train/test files.
    """
    extension = os.path.splitext(data)[1]
    load = process_tsv if extension == '.tsv' else process_csv
    sentences, tags, tag_encoder = load(data)

    dataset_dict = {
        key: [] for key in ("input_ids", "token_type_ids", "attention_mask", "tags")
    }
    # Room left for word pieces once [CLS] and [SEP] are added
    body_len = max_len - 2

    for words, word_tags in zip(sentences, tags):
        token_ids, token_tags = [], []
        for word, word_tag in zip(words, word_tags):
            encoding = tokenizer.encode(word, add_special_tokens=False)
            token_ids += encoding.ids
            # A word split into several pieces repeats its tag once per piece
            token_tags += [word_tag] * len(encoding)

        # Truncate, then wrap with [CLS] (101) and [SEP] (102); both are given tag 16
        token_ids = [101] + token_ids[:body_len] + [102]
        token_tags = [16] + token_tags[:body_len] + [16]

        real_len = len(token_ids)
        padding_len = max_len - real_len

        # Right-pad everything to max_len; padded targets carry label 17,
        # the value masked_ce_loss filters out.
        input_ids = token_ids + [0] * padding_len
        # token_type_ids are all zero: each example is a single sentence
        token_type_ids = [0] * real_len + [0] * padding_len
        # attention is 1 on real tokens and 0 on padding
        attention_mask = [1] * real_len + [0] * padding_len
        target_tags = token_tags + [17] * padding_len

        row = (input_ids, token_type_ids, attention_mask, target_tags)
        for key, values in zip(dataset_dict, row):
            dataset_dict[key].append(values)
        assert len(target_tags) == max_len, f'{len(input_ids)}, {len(target_tags)}'

    arrays = {key: np.array(rows) for key, rows in dataset_dict.items()}

    # Model inputs in the order the model declares them
    x = [arrays["input_ids"], arrays["token_type_ids"], arrays["attention_mask"]]
    y = arrays["tags"]
    return x, y, tag_encoder

In [17]:
# Training file (CSV) that the next cell converts into model inputs and targets.
data_original_embedding_train = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_original_embeding_train.csv"

In [None]:
# Tokenize the training file into model inputs (x_train) and per-token tag ids (y_train).
x_train, y_train, tag_encoder = create_inputs_targets(data_original_embedding_train)

# Mini-batch size used for training
bs = 16

# Train for a single epoch, holding out 10% of the training samples for validation.
new_model.fit(
    x_train,
    y_train,
    epochs=1,
    verbose=1,
    batch_size=bs,
    validation_split=0.1
)

In [19]:
# Persist the trained weights to the shared drive so evaluation can be run without retraining.
new_model.save_weights("/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_w_studio_2_model.h5")

### Evaluate the model

In [20]:
# Restore the weights saved by the training section; new_model must already have been built by create_model.
new_model.load_weights("/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_w_studio_2_model.h5")

In [21]:
# Test files (CSV), each evaluated in its own cell below.
datapath_original_embedding_test = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_original_embeding_test.csv"
datapath_original_test = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_original_test.csv"
datapath_sinonimi_test = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_sinonimi_test.csv"
datapath_embedding_test = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_embedding_test.csv"

Original + embedding

In [22]:
# Build the inputs and targets the model needs from the "original + embedding" test file
x_test, y_test, tag_encoder = create_inputs_targets(datapath_original_embedding_test)
# NOTE: despite its name, pred_test holds what evaluate() returns (loss and metric values), not predictions
pred_test = new_model.evaluate(x_test,y_test,batch_size=16)
# TODO: leftover note ("define the dictionary mapping classes to numeric values") — no code follows it



Original


In [23]:
# Build the inputs and targets the model needs from the "original" test file
x_test, y_test, tag_encoder = create_inputs_targets(datapath_original_test)
# NOTE: despite its name, pred_test holds what evaluate() returns (loss and metric values), not predictions
pred_test = new_model.evaluate(x_test,y_test,batch_size=16)
# TODO: leftover note ("define the dictionary mapping classes to numeric values") — no code follows it



Sinonimi

In [25]:
# Build the inputs and targets the model needs from the "sinonimi" (synonyms) test file
x_test, y_test, tag_encoder = create_inputs_targets(datapath_sinonimi_test)
# NOTE: despite its name, pred_test holds what evaluate() returns (loss and metric values), not predictions
pred_test = new_model.evaluate(x_test,y_test,batch_size=16)
# TODO: leftover note ("define the dictionary mapping classes to numeric values") — no code follows it



Embedding

In [26]:
# Build the inputs and targets the model needs from the "embedding" test file
x_test, y_test, tag_encoder = create_inputs_targets(datapath_embedding_test)
# NOTE: despite its name, pred_test holds what evaluate() returns (loss and metric values), not predictions
pred_test = new_model.evaluate(x_test,y_test,batch_size=16)
# TODO: leftover note ("define the dictionary mapping classes to numeric values") — no code follows it

