In [None]:
from tensorflow import keras
from keras import layers, callbacks, regularizers
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from sentence_transformers import SentenceTransformer
import random
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from lib.BBData import character_dict, random_state, model_name
from transformers import BertModel, BertConfig, TFAutoModelForCausalLM, TFBertModel

In [None]:
# Training hyperparameters
batch_size, lr, patience = 16, 1e-4, 6
# Regularisation settings
regularizer_weight_r, regularizer_weight_s = 1e-4, 1e-3
dropout_rate = 0.2
# Fractions of the data used for the training and test splits
train_size, test_size = 0.85, 0.10

# Module-level cache slots, all empty at start-up
sentence_transformer = character = embedding_model = None

# Embedding dimensionality and the triplet-loss margin derived from it
embedding_size = 32
margin = embedding_size * 0.5

# Toggle for the cell that (re)builds the per-character classifier files
create_classifier_dataset = False

In [None]:
def reset_state():
    """Clear the module-level caches.

    Resets ``sentence_transformer``, ``character`` and ``embedding_model`` to
    ``None``. The names are declared ``global`` because a plain assignment
    inside a function only binds a local variable and would leave the
    module-level objects untouched.
    """
    global sentence_transformer, character, embedding_model
    sentence_transformer = None
    character = None
    embedding_model = None

In [None]:
from keras.activations import linear

def create_embedding_model(input_size):
    """Build the dense encoder that maps one input sample to its embedding.

    The network is a stack of ReLU ``Dense`` layers, each followed by a
    ``BatchNormalization`` layer, ending in a linear ``Dense`` projection of
    width ``embedding_size``. The model is returned uncompiled.

    Args:
        input_size: per-sample input shape handed to ``keras.Input``.

    Returns:
        A ``keras.Model`` mapping the input to an ``embedding_size``-wide vector.
    """
    hidden_widths = (1024, 1024, 512, 256, 128, 64)

    inputs = keras.Input(shape=input_size)
    features = inputs
    for width in hidden_widths:
        # Dense + batch normalisation pair, widest layers first
        features = layers.Dense(width, activation='relu')(features)
        features = layers.BatchNormalization()(features)

    # Linear projection: this output is the embedding itself
    projection = layers.Dense(embedding_size, activation=linear)(features)
    return keras.Model(inputs, projection)

In [None]:
# Create siamese network for training
def create_siamese_net(input_size, embedding_model):
    """Wrap ``embedding_model`` in a three-branch (anchor, positive, negative) model.

    Args:
        input_size: length of one flat input vector; every branch input has
            shape ``(input_size,)``.
        embedding_model: the shared encoder applied to each branch.

    Returns:
        A ``keras.Model`` taking ``[anchor, positive, negative]`` and returning
        the three embeddings concatenated along axis 1.
    """
    # Same input spec for the anchor, positive and negative branches, in that order
    branch_inputs = [layers.Input(shape=(input_size, )) for _ in range(3)]
    # The encoder weights are shared: the same model object is called on every branch
    branch_embeddings = [embedding_model(branch) for branch in branch_inputs]
    stacked = layers.concatenate(branch_embeddings, axis=1)
    return keras.Model(branch_inputs, stacked)

In [None]:
# Function to create a dataset composed of triples from a dataset of single sentences. Used in training only.
def get_triplet_df(series_df, n_shuffles, random_state, half_window=4):
    """Build samples made of several concatenated encoded lines of the target character.

    Only rows with ``character == 1`` are used. For each of ``n_shuffles``
    passes the rows are reshuffled (seeded with ``random_state + pass index``)
    and a sliding window of ``2 * half_window`` consecutive encoded lines is
    concatenated into one sample.

    Args:
        series_df: DataFrame with a ``character`` column (1 marks the target
            character) and an ``encoded_line`` column of 1-D arrays.
        n_shuffles: number of reshuffle passes to generate windows from.
        random_state: integer seed; each pass uses ``random_state + pass index``.
        half_window: half the number of lines per sample (default 4, i.e.
            8 lines per sample, matching the previous hard-coded value).

    Returns:
        A shuffled DataFrame with columns ``character`` (always 1) and
        ``encoded_line`` (the concatenated window).

    Raises:
        ValueError: if ``half_window`` is smaller than 1.
    """
    if half_window < 1:
        raise ValueError("half_window must be a positive integer")

    # Keep only the lines spoken by the target character
    target_df = series_df[series_df['character'] == 1].copy()
    df_rows = {'character': [], 'encoded_line': []}
    for shuffle_idx in range(n_shuffles):
        # A different seed per pass gives a different line ordering each time
        target_df = target_df.sample(
            frac=1, random_state=random_state + shuffle_idx).reset_index(drop=True)
        # Pull the column out once so the window loop slices a plain list
        encoded_lines = target_df['encoded_line'].tolist()
        # NOTE(review): the upper bound skips the final full window
        # (center == len - half_window); kept as-is to preserve dataset size.
        for center in range(half_window, len(encoded_lines) - half_window):
            window = encoded_lines[center - half_window:center + half_window]
            df_rows['character'].append(1)
            df_rows['encoded_line'].append(np.concatenate(window))
    # Create a new dataframe from the rows we have built
    df = pd.DataFrame(data=df_rows)
    # Shuffle one last time so windows from different passes are mixed
    return df.sample(frac=1,
                     random_state=random_state).reset_index(drop=True)

In [None]:
# Every configured character name except the 'Default' placeholder entry
characters = [name for name in character_dict if name != 'Default']

In [None]:
sentence_transformer = SentenceTransformer(
        "sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
if create_classifier_dataset:
    for character in characters:
        source = character_dict[character]['source']
        series_df_path = os.path.join('..', 'Data', 'Sources', source, source+'.csv')
        # Keep only the needed columns; .copy() makes an independent frame so
        # adding a column below does not trigger SettingWithCopyWarning
        series_df = pd.read_csv(series_df_path)[['character', 'line']].copy()

        # Binary label: 1 for lines spoken by this character, 0 otherwise
        series_df['character'] = (series_df['character'] == character).astype(int)
        # Encode every line and store the vectors as a new column
        series_df['encoded_line'] = [
            sentence_transformer.encode(line)
            for line in tqdm(series_df['line'])
        ]
        # Save the text and labels as a csv file (create the folder if missing)
        p = os.path.join('..', 'Data', 'Characters', character)
        os.makedirs(p, exist_ok=True)
        save_path = os.path.join(p, character.lower()+'_classifier.csv')
        series_df[['line', 'character']].to_csv(save_path, index=False)
        # The encoded lines are saved separately via numpy due to their type (array)
        np.save(os.path.join(p, character.lower() + '_encoded_lines.npy'),
                series_df['encoded_line'].to_numpy())
        print("Saved encoded lines at " + p)

In [None]:
def get_data(
    source_encoded_path,
    random_state=random_state,
    n_shuffles=10,
    use_triplets=False
    ):
    """Load every character's encoded lines and build balanced train/test/val arrays.

    For each name in the module-level ``characters`` list this reads
    ``<source_encoded_path>/<Name>/<name>_classifier.csv`` and the matching
    ``<name>_encoded_lines.npy``, keeps the samples of that character and
    labels them with the character's index in ``characters``. Every character
    is truncated to the size of the smallest one, then split according to
    ``train_size`` / ``test_size`` (the remainder is the validation split).

    Args:
        source_encoded_path: folder holding one sub-folder per character.
        random_state: seed used for window generation and for the
            per-character shuffle, so the splits are reproducible.
        n_shuffles: forwarded to ``get_triplet_df`` when ``use_triplets`` is set.
        use_triplets: if True, samples are concatenated line windows from
            ``get_triplet_df``; otherwise single encoded lines.

    Returns:
        ``(X_train, y_train, X_test, y_test, X_val, y_val)`` as NumPy arrays.

    Raises:
        ValueError: if there are no characters to load.
        AssertionError: if the test labels contain no non-zero class.
    """
    # Flush the instance state cache
    reset_state()

    df_list = []
    print('Loading encoded lines...')
    for c in tqdm(range(len(characters))):
        character_dir = os.path.join(source_encoded_path, characters[c])
        file_stem = characters[c].lower()
        # Load the preprocessed dataset
        series_df = pd.read_csv(
            os.path.join(character_dir, file_stem + '_classifier.csv'),
            dtype={'line': str, 'character': int})

        # SECURITY NOTE: allow_pickle=True unpickles the file contents, which can
        # execute arbitrary code; only load .npy files produced by this project.
        series_df['encoded_line'] = np.load(
            os.path.join(character_dir, file_stem + '_encoded_lines.npy'),
            allow_pickle=True)
        if use_triplets:
            tmp_df = get_triplet_df(series_df, n_shuffles=n_shuffles, random_state=random_state)
        else:
            tmp_df = series_df.loc[
                series_df['character'] == 1,
                ['encoded_line', 'character']].reset_index(drop=True)
        # Replace the binary label with the index of this character
        tmp_df['character'] = c
        df_list.append(tmp_df)

    if not df_list:
        raise ValueError('no characters available to build the dataset')

    # Balance the classes: every character contributes as many samples as the smallest one
    tot_len = min(len(df) for df in df_list)
    train_len = int(tot_len * train_size)
    test_len = int(tot_len * test_size)
    # Positional [start, stop) bounds of each split; validation takes the remainder
    split_bounds = {
        'train': (0, train_len),
        'test': (train_len, train_len + test_len),
        'val': (train_len + test_len, tot_len),
    }
    features = {name: [] for name in split_bounds}
    labels = {name: [] for name in split_bounds}
    print('Creating merged data...')
    for shuffled_df in tqdm(df_list):
        # Seeded shuffle so repeated calls with the same random_state give the same splits
        shuffled_df = shuffled_df.sample(frac=1, random_state=random_state).iloc[:tot_len]
        for name, (start, stop) in split_bounds.items():
            part = shuffled_df.iloc[start:stop]
            features[name].extend(part['encoded_line'])
            labels[name].extend(part['character'].tolist())

    # One vectorised cast to float64 instead of converting element by element
    X_train = np.array(features['train'], dtype=float)
    y_train = np.array(labels['train'])
    X_test = np.array(features['test'], dtype=float)
    y_test = np.array(labels['test'])
    X_val = np.array(features['val'], dtype=float)
    y_val = np.array(labels['val'])

    assert len(y_test[y_test!=0]) > 0, 'assertion before randomization'

    return X_train, y_train, X_test, y_test, X_val, y_val

In [None]:
# Build the train/test/validation arrays from the per-character files under
# ../Data/Characters, using single encoded lines (use_triplets=False).
X_train, y_train, X_test, y_test, X_val, y_val = get_data(
    source_encoded_path=os.path.join('..', 'Data', 'Characters'),
    use_triplets=False)

In [None]:
import tensorflow as tf
def get_triplet_dataset(X, y):
    """Build a shuffled ``tf.data`` dataset of (anchor, positive, negative) samples.

    Every row of ``X`` is used once as the anchor. Its positive is a random row
    with the same label and its negative a random row with a different label.

    Args:
        X: 2-D array-like of samples, one row per sample.
        y: 1-D array-like of labels, same length as ``X``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(anchor, positive, negative)`` tuples.
        Each component keeps a leading axis of length 1 and the dataset is not
        batched, so callers that want real mini-batches must re-batch it.

    Raises:
        AssertionError: if ``X`` and ``y`` differ in length.
        ValueError: if some label has no sample with a different label.
    """
    assert len(X)==len(y)

    X = np.asarray(X)
    y = np.asarray(y)

    # Candidate indices are computed once per label instead of once per sample.
    # np.flatnonzero always returns a 1-D array, even for a single match
    # (np.squeeze(np.where(...)) collapsed that case to a 0-d array).
    index_cache = {}
    positives = []
    negatives = []
    print('Creating triplets...')
    for i in tqdm(range(len(X))):
        y_ref = y[i]
        if y_ref not in index_cache:
            same_label = np.flatnonzero(y == y_ref)
            other_label = np.flatnonzero(y != y_ref)
            if other_label.size == 0:
                raise ValueError(
                    'cannot build triplets: no sample has a label different from %r' % (y_ref,))
            index_cache[y_ref] = (same_label, other_label)
        pos_idxs, neg_idxs = index_cache[y_ref]

        # NOTE: the anchor itself is among the positive candidates
        positives.append(X[random.choice(pos_idxs)])
        negatives.append(X[random.choice(neg_idxs)])

    anchors = X[:,np.newaxis,:]
    positives = np.array(positives)[:,np.newaxis,:]
    negatives = np.array(negatives)[:,np.newaxis,:]

    anchor_dataset = tf.data.Dataset.from_tensor_slices(anchors)
    positive_dataset = tf.data.Dataset.from_tensor_slices(positives)
    negative_dataset = tf.data.Dataset.from_tensor_slices(negatives)

    print('Zipping to unique dataset...')
    dataset = tf.data.Dataset.zip((anchor_dataset, positive_dataset, negative_dataset))
    print('Shuffling...')
    dataset = dataset.shuffle(buffer_size=1024)
    return dataset

In [None]:
# Turn each split into a dataset of (anchor, positive, negative) samples.
train_dataset = get_triplet_dataset(X_train, y_train)
test_dataset = get_triplet_dataset(X_test, y_test)
val_dataset = get_triplet_dataset(X_val, y_val)

In [None]:
class DistanceLayer(layers.Layer):
    """
    Computes two squared Euclidean distances over the last axis: anchor to
    positive embedding, and anchor to negative embedding.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, anchor, positive, negative):
        def squared_l2(left, right):
            # Sum of squared coordinate differences along the embedding axis
            return tf.reduce_sum(tf.square(left - right), -1)

        return (squared_l2(anchor, positive), squared_l2(anchor, negative))

In [None]:
class SiameseModel(keras.Model):
    """Trains a distance-producing Siamese network with the triplet loss.

    The wrapped network must return the pair
    ``(anchor-positive distance, anchor-negative distance)``. The loss is

       L(A, P, N) = max(‖f(A) - f(P)‖² - ‖f(A) - f(N)‖² + margin, 0)

    and its running mean is reported as the ``loss`` metric in both the
    training and the evaluation step.
    """

    def __init__(self, siamese_network, margin=0.5):
        super().__init__()
        self.siamese_network = siamese_network
        self.margin = margin
        self.loss_tracker = keras.metrics.Mean(name="loss")

    def call(self, inputs):
        return self.siamese_network(inputs)

    def train_step(self, data):
        weights = self.siamese_network.trainable_weights

        # Record the forward pass so the loss can be differentiated
        with tf.GradientTape() as tape:
            loss = self._compute_loss(data)

        # Back-propagate into the wrapped network and apply the optimizer
        # configured in `compile()`
        grads = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        return self._record_loss(loss)

    def test_step(self, data):
        return self._record_loss(self._compute_loss(data))

    def _record_loss(self, loss):
        # Fold this step's loss into the running mean and report it
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

    def _compute_loss(self, data):
        # The wrapped network returns (anchor-positive, anchor-negative) distances
        ap_distance, an_distance = self.siamese_network(data)

        # Hinge on the distance gap: zero once the negative is at least
        # `margin` farther from the anchor than the positive
        return tf.maximum(ap_distance - an_distance + self.margin, 0.0)

    @property
    def metrics(self):
        # Listed here so Keras resets the tracker automatically
        return [self.loss_tracker]

In [None]:
# Shape of one training sample; used below as the per-sample input shape of the Keras models.
input_size = X_train[0].shape

In [None]:
# NOTE(review): this flag is not read anywhere in the visible notebook; confirm it is still needed.
use_benderbot = True

In [None]:
# Dense encoder shared by the anchor, positive and negative branches built below.
embedding = create_embedding_model(input_size)

In [None]:
# embedding = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.path.join('..', "cache")) # not working
# embedding = TFBertModel.from_pretrained("bert-base-cased")

In [None]:

# One symbolic input per triplet role, created in anchor/positive/negative order
anchor_input, positive_input, negative_input = (
    layers.Input(name=role, shape=input_size)
    for role in ("anchor", "positive", "negative")
)

# Run the shared encoder on every branch and feed the three embeddings to the
# distance layer, which returns (anchor-positive, anchor-negative) distances
distances = DistanceLayer()(
    *(embedding(branch)
      for branch in (anchor_input, positive_input, negative_input))
)

siamese_network = keras.Model(
    inputs=[anchor_input, positive_input, negative_input],
    outputs=distances,
)


In [None]:
# Stop on the validation loss rather than the training loss: `fit` is given
# validation data, and monitoring "loss" would ignore it and let
# `restore_best_weights` pick the weights that best fit the training set.
earlystop_callback = callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=5,
    verbose=0,
    mode="min",
    baseline=None,
    restore_best_weights=True,
)

In [None]:
siamese_model = SiameseModel(siamese_network, margin=margin)
siamese_model.compile(optimizer=keras.optimizers.Adam(lr), weighted_metrics=[])

# The triplet datasets yield one sample per element (leading axis of length 1)
# and are never batched, so Keras was stepping on a single triplet at a time,
# which is a degenerate batch for the BatchNormalization layers. Keras also
# documents that `batch_size` must not be passed when the input is a dataset.
# Drop the unit axis and batch explicitly instead.
fit_batch_size = 128
siamese_model.fit(
    train_dataset.unbatch().batch(fit_batch_size),
    epochs=5000,
    validation_data=val_dataset.unbatch().batch(fit_batch_size),
    callbacks=[earlystop_callback]
    )

In [None]:
# Embed the test samples with the encoder and convert the result to a NumPy array.
test_embeddings = embedding(X_test).numpy()

In [None]:
from sklearn.cluster import KMeans, DBSCAN

# Cluster the test embeddings two ways: KMeans with one cluster per character,
# and DBSCAN with its default parameters.
kmeans = KMeans(n_clusters=len(characters), random_state=random_state).fit(test_embeddings)
dbscan = DBSCAN().fit(test_embeddings)

In [None]:
# Number of embedded test samples.
len(test_embeddings)

In [None]:
# Cluster id assigned to each test embedding (DBSCAN uses -1 for noise points).
y_pred_kmeans = kmeans.labels_
y_pred_dbscan = dbscan.labels_

In [None]:
# Inspect the raw DBSCAN assignments.
y_pred_dbscan

In [None]:
import matplotlib.pyplot as plt
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# KMeans numbers its clusters arbitrarily, so cluster id k is not character k.
# Match clusters to characters one-to-one by maximising the overlap counts
# before drawing the matrix, otherwise the axis labels are meaningless.
label_ids = np.arange(len(characters))
overlap = confusion_matrix(y_test, y_pred_kmeans, labels=label_ids)
matched_labels, matched_clusters = linear_sum_assignment(overlap, maximize=True)
cluster_to_label = np.empty_like(label_ids)
cluster_to_label[matched_clusters] = matched_labels

ConfusionMatrixDisplay.from_predictions(
    y_test,
    cluster_to_label[y_pred_kmeans],
    labels=label_ids,
    normalize='true',
    display_labels=characters)
# plt.plot() with no arguments draws nothing; show the figure instead
plt.show()

In [None]:
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

# Embed the training samples and cluster them with one cluster per character
train_embeddings = embedding(X_train).numpy()
kmeans = KMeans(n_clusters=len(characters), random_state=random_state).fit(train_embeddings)
y_pred = kmeans.labels_

# Cluster ids are arbitrary: match each cluster to the character it overlaps
# most (one-to-one) so the matrix rows and columns refer to the same classes
train_label_ids = np.arange(len(characters))
train_overlap = confusion_matrix(y_train, y_pred, labels=train_label_ids)
train_matched_labels, train_matched_clusters = linear_sum_assignment(
    train_overlap, maximize=True)
train_cluster_to_label = np.empty_like(train_label_ids)
train_cluster_to_label[train_matched_clusters] = train_matched_labels

ConfusionMatrixDisplay.from_predictions(
    y_train,
    train_cluster_to_label[y_pred],
    labels=train_label_ids,
    normalize='true',
    display_labels=characters)
# plt.plot() with no arguments draws nothing; show the figure instead
plt.show()

In [None]:
# Density-based clustering of the training embeddings with a tighter radius
dbscan = DBSCAN(eps=0.2)
dbscan.fit(train_embeddings)
labels = dbscan.labels_
# Largest cluster id that was assigned
print(labels.max())