<a href="https://colab.research.google.com/github/Tomawock/NLP_Attack/blob/main/models/studio_0/GMB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Parte nuova testing


In [None]:
!pip install h5py
!pip install transformers

In [19]:
import os
import tensorflow as tf
import pandas as pd
import transformers
from transformers import BertTokenizer, TFBertModel, BertConfig
from tensorflow.keras import layers
from tensorflow import keras
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
import numpy as np

In [20]:
# Dataset locations on the mounted Google Drive shared drive.
# Original GMB corpus; read elsewhere in this notebook as a tab-separated file.
data_original = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_dataset.tsv"
# NOTE(review): presumably the corpus with words swapped for synonyms ("sinonimi") - confirm.
data_sinonimi = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_sinonimi.csv"
# NOTE(review): presumably the corpus perturbed via embedding-based substitution - confirm.
data_embedding = "/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_embedding.csv"
# Fixed sequence length: width of the model's input layers and the length every
# tokenised sentence is truncated/padded to.
max_len = 384

In [21]:
# Per-token cross-entropy. The reduction must be NONE so that the loss keeps one
# value per token position and can be masked position by position. With
# Reduction.SUM the loss is already a scalar, and multiplying it by the padding
# mask only rescales the total instead of removing the padded positions.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)

# Ignoring loss that is calculated due to padded targets
def masked_ce_loss(real, pred):
    """Sparse categorical cross-entropy that ignores padded target positions.

    Args:
        real: integer tag ids; the value 17 marks a padded position.
        pred: per-token class probabilities (the loss is configured with
            ``from_logits=False``).

    Returns:
        Scalar tensor: the mean over all positions of the per-token loss,
        with padded positions contributing zero.
    """
    # True wherever the target is a real tag, False on padding (id 17).
    mask = tf.math.logical_not(tf.math.equal(real, 17))
    loss_ = loss_object(real, pred)

    # Zero out the loss of padded positions before averaging.
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

def create_model(num_tags):
    """Build and compile the BERT-based token classifier.

    A pretrained ``bert-base-uncased`` encoder is followed by dropout and a
    per-token softmax layer with ``num_tags + 1`` classes. The input width is
    taken from the module-level ``max_len``.

    Args:
        num_tags: number of distinct tags in the dataset. One extra output
            class is added on top of it (presumably for the padding label used
            when targets are built - confirm).

    Returns:
        A compiled ``keras.Model`` taking ``[input_ids, token_type_ids,
        attention_mask]`` and producing per-token class probabilities.
    """
    # BERT Base model
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    # NER Model

    # Instantiate Keras tensors
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    # First element of the encoder output (the per-token hidden states in the
    # transformers API).
    embedding = encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
    # Add drop out layer
    embedding = layers.Dropout(0.5)(embedding)
    # Softmax layer for classifying: the output is probabilities, not logits,
    # which matches the loss being configured with from_logits=False.
    tag_probs = layers.Dense(num_tags + 1, activation='softmax')(embedding)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[tag_probs],
    )
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by recent Keras releases.
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=masked_ce_loss, metrics=['accuracy'])
    return model

In [None]:
# The original corpus is loaded here only to count the distinct tags, which
# fixes the size of the classifier's output layer.
parz = pd.read_csv(data_original, sep='\t', encoding="latin-1", index_col=[0])

num_tags = parz["Tag"].nunique()

# TPU detection is best-effort: if no TPU can be resolved, fall back to the
# default device. Catch Exception rather than using a bare `except:` so that
# KeyboardInterrupt / SystemExit are not swallowed.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    use_tpu = True
except Exception:
    use_tpu = False

if use_tpu:
    # Create distribution strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model inside the strategy scope so its variables are placed on the TPU
    with strategy.scope():
        new_model = create_model(num_tags)
else:
    new_model = create_model(num_tags)

new_model.summary()

In [23]:
# Restore previously saved weights (.h5 file on the shared drive) into the freshly built model.
new_model.load_weights("/content/drive/Shareddrives/Deep Learning/datasets/GMB/GMB_w_model.h5")

In [95]:
# Load the three datasets as DataFrames for manual inspection.
# The first CSV column is used as the index for the "sinonimi" file.
sinonimi=pd.read_csv(data_sinonimi, encoding="latin-1", index_col=[0])
# na_filter=False turns off missing-value detection, so every cell is kept as read
# (NOTE(review): presumably to stop words such as "NA"/"null" becoming NaN - confirm).
embedding=pd.read_csv(data_embedding, encoding="latin-1", na_filter = False)
# The original corpus is tab-separated.
original=pd.read_csv(data_original,sep='\t', encoding="latin-1")

In [None]:
# Preview the first 20 rows of the "sinonimi" dataset.
sinonimi.head(20)

In [None]:
# Find every row of the original corpus whose word is "NASA", showing its sentence number.
original.loc[original["Word"] == "NASA", ['Sentence #', 'Word']]

In [None]:
# List the words of sentence 2555 in the original corpus.
original.loc[original["Sentence #"] == 2555.0, ['Sentence #', 'Word']]

In [None]:
# Sanity check: row positions in the embedding dataset whose Word value is not a Python str
# ("parola" is Italian for "word").
[i for i, parola in enumerate(embedding.Word) if not isinstance(parola, str)]

In [None]:
# Show a single row of the embedding dataset by position.
# NOTE(review): 56409 is presumably a position reported by the non-string check - confirm.
embedding.iloc[[56409]]

In [None]:
# Boolean mask flagging the rows of the embedding dataset that belong to sentence 5553.
embedding["Sentence #"] == 5553.0

In [None]:
# Words of sentence 5553 in the embedding dataset.
embedding.loc[embedding["Sentence #"] == 5553.0, ['Word']]

In [116]:
# Get the sentences
def process_csv(data_path):
    """Read a tagged-word CSV and group its words and tags by sentence.

    Args:
        data_path: path of a CSV file with at least the columns
            ``"Sentence #"``, ``"Word"`` and ``"Tag"``.

    Returns:
        A tuple ``(sentences, tag, enc_tag)``:
        ``sentences`` is an array with one list of words per sentence,
        ``tag`` is an array with the matching lists of integer-encoded tags,
        ``enc_tag`` is the ``LabelEncoder`` fitted on the file's ``"Tag"`` column.
    """
    # na_filter=False keeps every cell exactly as read, so no word is turned into NaN.
    df = pd.read_csv(data_path, encoding="latin-1", na_filter=False)

    # Because missing-value detection is off, blank sentence ids arrive as ""
    # rather than NaN and a plain forward fill would do nothing. Turn blanks
    # into NaN first so each row inherits the id of the sentence it continues.
    sentence_ids = df["Sentence #"]
    df["Sentence #"] = sentence_ids.mask(sentence_ids == "").ffill()

    # Map tag names to integer ids (the encoder is returned for decoding later).
    enc_tag = preprocessing.LabelEncoder()
    df["Tag"] = enc_tag.fit_transform(df["Tag"])

    by_sentence = df.groupby("Sentence #")
    sentences = by_sentence["Word"].apply(list).values
    tag = by_sentence["Tag"].apply(list).values
    return sentences, tag, enc_tag

# Prepare the inputs for feeding into model
def create_inputs_targets(data):
    """Turn a tagged-word CSV into padded model inputs and per-token tag targets.

    The file is loaded through ``process_csv``. Each word is tokenised on its
    own with the module-level ``tokenizer`` and the word's tag is repeated once
    per resulting token. Every sentence is then cut to ``max_len - 2`` tokens,
    wrapped in the ids 101 / 102 (the [CLS] / [SEP] positions, tagged 16) and
    padded up to ``max_len`` with id 0 (tagged 17).

    NOTE(review): ``tokenizer`` and ``max_len`` are globals. ``tokenizer`` must
    already be defined when this runs, and its ``encode`` result must expose
    ``.ids`` and support ``len()``.

    Args:
        data: path of the CSV file handed to ``process_csv``.

    Returns:
        A tuple ``(x, y, tag_encoder)``: ``x`` is the list
        ``[input_ids, token_type_ids, attention_mask]`` of numpy arrays,
        ``y`` is the numpy array of target tag ids, and ``tag_encoder`` is the
        encoder returned by ``process_csv``.
    """
    sentences, tags, tag_encoder = process_csv(data)
    print(sentences[0])

    # Room left for word pieces once the two wrapper tokens are accounted for.
    body_limit = max_len - 2
    rows_ids, rows_types, rows_mask, rows_tags = [], [], [], []

    for words, word_tags in zip(sentences, tags):
        token_ids, token_tags = [], []
        for position, word in enumerate(words):
            encoded = tokenizer.encode(word, add_special_tokens=False)
            token_ids += encoded.ids
            # One copy of the word's tag for each token the word was split into.
            token_tags += [word_tags[position]] * len(encoded)

        # Truncate, then wrap in [CLS] ... [SEP]; both wrappers get tag 16.
        token_ids = [101] + token_ids[:body_limit] + [102]
        token_tags = [16] + token_tags[:body_limit] + [16]

        used = len(token_ids)
        missing = max_len - used

        # Single-sentence task: the segment ids are all zero.
        padded_types = [0] * used + [0] * missing
        # Attention is on for real positions and off for padding.
        padded_mask = [1] * used + [0] * missing
        padded_ids = token_ids + [0] * missing
        padded_tags = token_tags + [17] * missing

        rows_ids.append(padded_ids)
        rows_types.append(padded_types)
        rows_mask.append(padded_mask)
        rows_tags.append(padded_tags)
        assert len(padded_tags) == max_len, f'{len(padded_ids)}, {len(padded_tags)}'

    x = [np.array(rows_ids), np.array(rows_types), np.array(rows_mask)]
    y = np.array(rows_tags)
    return x, y, tag_encoder

In [None]:
# Build the model inputs and targets for the "sinonimi" dataset.
# The third value returned by create_inputs_targets is the fitted LabelEncoder
# (the tag-name <-> integer-id mapping), not a token count.
x_test, y_test, tag_encoder = create_inputs_targets(data_sinonimi)
# Number of real (non-padding) positions per sentence, read off the attention mask.
n_tokens = x_test[2].sum(axis=1)
# evaluate() returns the loss followed by the compiled metrics (accuracy),
# not per-token predictions.
pred_test = new_model.evaluate(x_test, y_test)

In [None]:
# Display the value returned by new_model.evaluate for the test inputs.
pred_test

In [None]:
# Decode the model's predictions into tag names, one list per sentence.
# Everything this cell needs is derived here from x_test and new_model.

# Fitting the label encoder again on the same file reproduces the id -> name
# mapping used to build the targets, so this cell does not depend on the
# variable name the encoder was bound to earlier.
_, _, tag_encoder = process_csv(data_sinonimi)
le_dict = dict(zip(tag_encoder.transform(tag_encoder.classes_), tag_encoder.classes_))

# evaluate() only yields metrics; per-token class probabilities come from predict().
pred_probs = new_model.predict(x_test)
pred_ids = np.argmax(pred_probs, axis=2)

# Real (non-padding) length of each sentence, taken from the attention mask.
token_counts = x_test[2].sum(axis=1)

result = []
for i in range(len(pred_ids)):
    # Keep only the predictions for real positions of sentence i.
    pred_tags = pred_ids[i][:token_counts[i]]
    # Class ids without a tag name (the extra padding class) decode to '[pad]'.
    pred_tags = [le_dict.get(_, '[pad]') for _ in pred_tags]
    result.append(pred_tags)