In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as tf_hub
import tensorflow_text as tf_text
import nltk

from official.nlp import optimization
from lime.lime_text import LimeTextExplainer

In [None]:
# Load the labeled sentences; the column pandas reads as "Unnamed: 0" is
# renamed to "id".
labeled_df = (
    pd.read_excel("../../data/labeled_dataset.xlsx")
    .rename(columns={"Unnamed: 0": "id"})
)
labeled_df.head()

In [None]:
# How many sentences carry each "Label_opinion" value.
opinion_labels = labeled_df["Label_opinion"]
opinion_labels.value_counts()

In [None]:
# Keep only the text and a binary target: 1 when the annotator marked the
# sentence as expressing the writer's opinion, 0 otherwise.
# Built as a single chain so the new column is never assigned onto a slice of
# `labeled_df` (the pattern that triggers pandas' SettingWithCopyWarning).
minified_df = (
    labeled_df[["sentence", "Label_opinion"]]
    .assign(
        target=lambda frame: (
            frame["Label_opinion"] == "Expresses writer’s opinion"
        ).astype(int)
    )
    .drop(columns="Label_opinion")
)
minified_df.head()

In [None]:
# Pair every sentence with its target, shuffle with a buffer as large as the
# frame, then group the examples into batches of 32.
sentence_array = minified_df['sentence'].values
target_array = minified_df['target'].values
shuffle_buffer = minified_df.shape[0]

dataset = tf.data.Dataset.from_tensor_slices((sentence_array, target_array))
dataset = dataset.shuffle(shuffle_buffer).batch(32)

In [None]:
# TF Hub handles for the text preprocessing model and the small BERT encoder.
PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
ENCODER_HANDLE = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/2"

# Preprocessing layer is left at its default trainability.
preprocess_layer = tf_hub.KerasLayer(PREPROCESS_HANDLE, name="preprocessing")

# The encoder is created with trainable=True so its weights are updated during fit.
encoder_layer = tf_hub.KerasLayer(ENCODER_HANDLE, trainable=True, name="BERT_encoder")

In [None]:
def build_model(preprocess_layer, encoder_layer):
    """Assemble the sentence classifier.

    Raw strings go through `preprocess_layer`, then `encoder_layer`; the
    encoder's "pooled_output" feeds a single sigmoid unit.

    Args:
        preprocess_layer: callable layer applied to the string input.
        encoder_layer: callable layer whose output is indexed by "pooled_output".

    Returns:
        A `tf.keras.Model` mapping a batch of strings to one sigmoid score each.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="input")
    encoder_inputs = preprocess_layer(text_input)
    pooled = encoder_layer(encoder_inputs)["pooled_output"]
    classifier_head = tf.keras.layers.Dense(1, activation="sigmoid", name="output")
    return tf.keras.Model(text_input, classifier_head(pooled))


model = build_model(preprocess_layer=preprocess_layer, encoder_layer=encoder_layer)
model.summary()

In [None]:
# Training schedule: total steps come from the number of batches in `dataset`,
# and the first 10% of them are used as warm-up steps.
epochs = 20
init_lr = 3e-5

steps_per_epoch = tf.data.experimental.cardinality(dataset).numpy()
num_train_steps = epochs * steps_per_epoch
num_warmup_steps = int(num_train_steps * 0.1)

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

# Binary cross-entropy matches the single sigmoid output of the model.
loss = tf.keras.losses.BinaryCrossentropy()
metrics = tf.metrics.BinaryAccuracy()

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Create the output folder before training so that writing the weights file
# cannot fail on a missing directory once the (long) fit has finished.
# tf.io.gfile.makedirs succeeds if the directory already exists.
tf.io.gfile.makedirs("../models")

# Fine-tune on the batched dataset for `epochs` epochs; `history` keeps the
# per-epoch loss and metric values for plotting.
history = model.fit(
    x=dataset,
    epochs=epochs
)
model.save_weights("../models/small_bert_model.h5")

In [None]:
# Per-epoch training loss and binary accuracy recorded by `model.fit`.
history_dict = history.history

# Explicit figure/axes with a title and axis labels so the chart can be read
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(history_dict['loss'], label='loss')
ax.plot(history_dict['binary_accuracy'], label='accuracy')
ax.set(
    title="Training loss and binary accuracy",
    xlabel="epoch",
    ylabel="value",
)
ax.legend()
plt.show()

In [None]:
# Restore the weights written after training into the in-memory model.
weights_path = "../models/small_bert_model.h5"
model.load_weights(weights_path)

In [None]:
# Quick sanity check on two hand-written sentences.
sample_sentences = [
    "As reported by the police",
    "I would say that the police is not doing a good job",
]
model.predict(sample_sentences)

In [None]:
# Persist the whole model (not just the weights) under ../models.
model.save(filepath="../models/small_bert_model")

In [None]:
# Reload the saved model without compiling it (compile=False).
loaded_model = tf.keras.models.load_model(
    filepath="../models/small_bert_model",
    compile=False,
)

In [None]:
# TensorFlow version used to produce the results in this notebook.
tf.version.VERSION

In [None]:
# Same statement with and without a judgemental adjective ("dumb").
contrast_sentences = [
    "The government made a decision to cut military spending",
    "The government made a dumb decision to cut military spending",
]
model.predict(contrast_sentences)

In [None]:
# Multi-paragraph sample article; later cells split it into sentences, score
# each sentence with the model, and explain the predictions with LIME.
test_article = """
Meanwhile, the Democratic National Committee (DNC) doesn’t have any primary debates scheduled – a move that helps President Joe Biden and hurts his challengers Robert F. Kennedy Jr. and Marianne Williamson. It’s clear the DNC doesn’t see Kennedy or Williamson as serious contenders, and Biden seems to prefer a Rose Garden strategy, where the focus of his campaign revolves around being the president and showing the country how he does the job.
Having fewer debates, however, is not necessarily a bad thing. But quantity is not the issue here. Quality is.
Of course, the number of debates for each party is subject to change. Trump, for instance, could simply be bluffing to add some intrigue. And a more serious Biden challenger could enter the race, prompting the DNC to schedule a debate.
But, at this rate, it’s possible that the debates will play a much smaller role in 2024 than they did in 2016 or in 2020. And if the two parties end up nominating Biden and Trump, many Americans may also choose to tune out any debates, thinking they are already quite familiar with both candidates.
That would be a departure from the norm, since debates have been a big part of presidential politics since World War II. Richard Nixon famously squared off against John F. Kennedy in 1960, with an estimated 70 million Americans tuning in to the first of a series of live televised debates between the two major presidential candidates. Nixon seemed nervous and sickly next to the young senator, whose charisma and good looks became the stuff of TV legend.
"""

In [None]:
# Split the article into sentences and score each one with the model.
test_sentences = nltk.sent_tokenize(test_article)
detected_bias = model.predict(test_sentences)

# Show each sentence next to its score instead of a bare array; the model has
# a single sigmoid output unit, so column 0 holds the score for every sentence.
pd.DataFrame(
    {
        "sentence": test_sentences,
        "opinion_probability": detected_bias[:, 0],
    }
)

In [None]:
def get_probabilities(text):
    """Return two-class probabilities in the layout LIME expects.

    LIME calls its classifier function with a *list* of perturbed strings and
    needs one row of class probabilities per string. Wrapping that list in
    another list (as `model.predict([text])` did) hands the model a nested
    batch, so the argument is normalized to a flat list of strings first.

    Args:
        text: a single string, or a sequence of strings.

    Returns:
        Array of shape (n_texts, 2): column 0 is `1 - p`, column 1 is `p`,
        where `p` is the model's sigmoid output for that text.
    """
    # A lone string is still supported for direct, ad-hoc calls.
    texts = [text] if isinstance(text, str) else list(text)
    # Force a column vector so hstack always yields exactly two columns.
    prob = np.asarray(model.predict(texts)).reshape(-1, 1)
    return np.hstack([1 - prob, prob])

# LIME explainer with class 0 = "Non-biased" and class 1 = "Biased", matching
# the column order returned by `get_probabilities`.
explainer = LimeTextExplainer(class_names=["Non-biased", "Biased"])

# Explain the prediction for the article as a whole, keeping the top 10 features.
explanation = explainer.explain_instance(
    text_instance=test_article,
    classifier_fn=get_probabilities,
    num_features=10,
)

In [None]:
# Render one LIME explanation per sentence of the article.
for sentence, _score in zip(test_sentences, detected_bias):
    sentence_explanation = explainer.explain_instance(
        sentence, get_probabilities, num_features=10
    )
    sentence_explanation.show_in_notebook(text=True)