In [1]:
import pandas as pd

import tensorflow as tf
import tensorflow_hub as tf_hub
import wandb

import tensorflow as tf
import tensorflow_text
import nltk
from lime.lime_text import LimeTextExplainer
import numpy as np
from train_wandb import Trainer

In [2]:
labeled_df = pd.read_excel("../../data/labeled_dataset.xlsx").rename(columns={"Unnamed: 0": "id"})
minified_df = labeled_df[["sentence", "Label_opinion"]]
minified_df["target"] = (minified_df["Label_opinion"] == "Expresses writer’s opinion").astype(int)
minified_df = minified_df.drop("Label_opinion", axis=1)
minified_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  minified_df["target"] = (minified_df["Label_opinion"] == "Expresses writer’s opinion").astype(int)


Unnamed: 0,sentence,target
0,YouTube is making clear there will be no “birt...,0
1,The increasingly bitter dispute between Americ...,0
2,So while there may be a humanitarian crisis dr...,1
3,A professor who teaches climate change classes...,0
4,"Looking around the United States, there is nev...",0


In [3]:
mpqa_df = pd.read_csv("../../data/mpqa_filtered.csv", sep = ";").rename(columns={"sent": "sentence"})
mpqa_df["target"] = np.where(mpqa_df['score'] >= 2, 1, 0)
mpqa_df.drop(columns=["score"], inplace = True)
mpqa_df.head()

Unnamed: 0,sentence,target
0,A set of tests specifically for Congo fever we...,0
1,By afternoon she was conversing with her husba...,0
2,Although the woman's condition had deteriorate...,0
3,The patient told hospital authorities she beca...,0
4,She also had a skin rash and was vomiting.,0


In [4]:
full_dataset_df = pd.concat([minified_df, mpqa_df])
dataset = (
    tf.data.Dataset.from_tensor_slices(
        (full_dataset_df['sentence'].values, full_dataset_df['target'].values)
    )
    .shuffle(full_dataset_df.shape[0])
)

In [5]:
no_opinion_text = """
Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, 
and artificial intelligence concerned with the interactions between computers and human language, 
in particular how to program computers to process and analyze large amounts of natural language data. 
The goal is a computer capable of understanding the contents of documents, 
including the contextual nuances of the language within them. The technology can then accurately extract 
information and insights contained in the documents as well as categorize and organize the documents themselves.
"""

In [6]:
opinion_text = """
One of my favourite films is Titanic. I`m a great fan of romantic movies and I`m very keen on the history of the tragic Titanic. 
The movie tells the dramatic story of the Titanic with Leonardo Di Caprio and Kate Winslet as the main actors.
In the year 1912 a young poor guy, Leonardo, travels by the gorgeous ship Titanic from London to New York with a big dream. 
On board he meets a fabulous, wealthy girl, Kate Winslet. He fells in love with her as soon as he sees this gorgeous girl. 
However, after a few days the ship hits an enormous iceberg and the tragedy begins to unfold. The Titanic starts sinking…
The movie is very close to the real tragedy. It shows a beautiful love story with an extremely sad ending. 
The acting is first-rate. Leonardo Di Caprio`s and Kate Winslet`s performance is so brilliant that I cried during the whole movie.
This movie brings a tear to your eyes. If you want to cry and melt down to a glamurous love story, 
I strongly recommend that you watch this movie. Titanic is well worth seeing.
"""

In [7]:
def build_model(preprocess_layer, encoder_layer):
    input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name="input")
    preprocessing_layer = preprocess_layer(input_layer)
    encoder_outputs = encoder_layer(preprocessing_layer)
    pooled_output = encoder_outputs["pooled_output"]
    output_layer = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(pooled_output)
    model = tf.keras.Model(input_layer, output_layer)
    return model

In [8]:
def make_experiment(config, name):
    
    preprocess_layer = tf_hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
        name="preprocessing"
    )

    encoder_layer = tf_hub.KerasLayer(
        "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/2",
        name="BERT_encoder",
        trainable=True
    )

    model = build_model(preprocess_layer, encoder_layer)
    model.summary()

    trainer = Trainer(config,model,dataset, name)
    trainer.compile_model()
    trainer.train()
    trainer.save_model()
    trainer.test_model()
    trainer.predict_model(no_opinion_text, "no opinion text")
    trainer.predict_model(opinion_text, "opinion text")

    def get_probabilities(text):
        prob = model.predict([text])
        return np.hstack([1 - prob, prob])

    explainer = LimeTextExplainer(class_names=["Non-biased", "Biased"])
    explanation_2 = trainer.explain_prediction(no_opinion_text, "No opinion text explanation")
    explanation_1 = trainer.explain_prediction(opinion_text,  "Opinion text explanation")
    with tf.device("/cpu:0"):
        explanation_1.show_in_notebook(text=True)
        explanation_2.show_in_notebook(text=True)
    wandb.finish()


In [9]:
config = {
    "architecture": "bigger-BERT-CNN",
    "dataset": "full_dataset",
    "datasize": minified_df.shape[0],
    "epochs": 50,
    "steps_per_epoch": tf.data.experimental.cardinality(dataset).numpy(),
    "num_train_steps": tf.data.experimental.cardinality(dataset).numpy() * 20,
    "num_warmup_steps": int(0.1 * tf.data.experimental.cardinality(dataset).numpy() * 20),
    "init_lr": 3e-5,
    "batch_size" : 1
}

In [10]:
batch_table = [1,2,4,8,16,32,64]

In [11]:
lr_table = [3e-6, 1e-5, 1e-4]

In [12]:
make_experiment(config, f"Experiment-bigger-BERT")

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_mask': (Non  0           ['input[0][0]']                  
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mandrzej-kajdasz[0m ([33mput_dl_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`


Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
