In [1]:
from tensorflow import keras
from keras import layers, callbacks, regularizers
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
from sentence_transformers import SentenceTransformer
import random
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import KNeighborsClassifier
from lib.BBData import character_dict, random_state

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import InputExample
from sentence_transformers import models
from transformers import BertModel
from torch.nn import LeakyReLU, Identity

In [None]:
from torch.utils.data import DataLoader
import torch
from sentence_transformers.losses import TripletDistanceMetric

In [None]:
# --- Optimisation -----------------------------------------------------------
batch_size, lr, epochs = 64, 1e-3, 20
# NOTE(review): patience, the regularizer weights and dropout_rate are not
# referenced in the visible cells - presumably leftovers from a Keras model.
patience = 6
regularizer_weight_r, regularizer_weight_s = 1e-4, 1e-3
dropout_rate = 0.2

# --- Data split (the validation share is whatever train + test leave over) --
train_size, test_size = 0.85, 0.10

# --- Instance state, for caching, in case of repeated usage of this metric --
sentence_transformer = character = embedding_model = None

# --- Embedding / triplet parameters -----------------------------------------
embedding_size = 32
# Triplet-loss margin, scaled with the embedding dimensionality
margin = 10 * embedding_size
n_merged_sentences_x_sample = 5
n_triplets_x_sample = 1
training_steps = 50

# NOTE(review): not read anywhere in the visible cells.
create_classifier_dataset = False

In [None]:
# Function to create a dataset composed of triples from a dataset of single sentences. Used in training only.
def get_triplet_df(series_df, n_shuffles, random_state, n=3):
    """Build merged-sentence samples from the lines labelled ``character == 1``.

    The positive lines are reshuffled ``n_shuffles`` times (cumulatively, each
    time seeded with ``random_state + shuffle number``). After every shuffle a
    sliding window of ``2 * n`` consecutive lines is joined with spaces into a
    single sample.

    Args:
        series_df: DataFrame with a ``character`` column and a ``line`` column.
        n_shuffles: number of shuffle passes over the positive lines.
        random_state: integer base seed for the shuffles and the final shuffle.
        n: half-width of the window; each sample merges ``2 * n`` lines.

    Returns:
        A shuffled DataFrame with columns ``character`` (always 1) and ``line``
        (the merged text), re-indexed from 0.
    """
    # Keep only the lines spoken by the target character
    target_lines = series_df.loc[series_df['character'] == 1].copy()

    merged_texts = []
    for shuffle_no in range(n_shuffles):
        # Each pass reshuffles the output of the previous one
        target_lines = target_lines.sample(
            frac=1, random_state=random_state + shuffle_no
        ).reset_index(drop=True)
        texts = target_lines['line']
        # Slide a window of 2*n consecutive (shuffled) lines over the data
        merged_texts.extend(
            ' '.join(texts[centre - n:centre + n])
            for centre in range(n, len(target_lines) - n + 1)
        )

    merged_df = pd.DataFrame(data={
        'character': [1] * len(merged_texts),
        'line': merged_texts,
    })
    # Final shuffle so samples from different passes are interleaved
    return merged_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
# Character names to model; the 'Default' entry is excluded.
characters = [name for name in character_dict.keys() if name != 'Default']

In [None]:
def reset_state():
    """Clear the module-level cache (sentence transformer, character, embedding model).

    The names must be declared ``global``: without the declaration the
    assignments only create function-local variables and the cached objects
    are never actually dropped.
    """
    global sentence_transformer, character, embedding_model
    sentence_transformer = None
    character = None
    embedding_model = None

In [None]:
def get_data(
    source_encoded_path,
    random_state=random_state,
    n_shuffles=10,
    use_triplets=False,
    n=3
    ):
    """Load every character's lines and split them into train/test/val lists.

    For each name in ``characters`` the file
    ``<source_encoded_path>/<name>/<name.lower()>_classifier.csv`` is read and
    only the rows with ``character == 1`` are used. Each sample is labelled
    with the index of its character in ``characters``. All characters are
    truncated to the size of the smallest one so the classes are balanced.

    Args:
        source_encoded_path: root folder containing one sub-folder per character.
        random_state: seed used for sample merging and for the split shuffle.
        n_shuffles: shuffle passes forwarded to ``get_triplet_df``.
        use_triplets: if True, build merged-sentence samples with
            ``get_triplet_df``; otherwise use the raw lines.
        n: window half-width forwarded to ``get_triplet_df``.

    Returns:
        ``(X_train, y_train, X_test, y_test, X_val, y_val)`` as Python lists;
        X holds text, y holds integer character indices.

    Raises:
        ValueError: if ``characters`` is empty.
        AssertionError: if the training labels contain only class 0.
    """
    # Flush the instance state cache
    reset_state()

    df_list = []
    print('Loading encoded lines...')
    for c in tqdm(range(len(characters))):
        # Load the preprocessed dataset
        series_df = pd.read_csv(
            os.path.join(source_encoded_path, characters[c],
                         characters[c].lower() + '_classifier.csv'),
            dtype={'line': str, 'character': int})

        if use_triplets:
            tmp_df = get_triplet_df(series_df, n_shuffles=n_shuffles, random_state=random_state, n=n)
        else:
            # Copy so the label assignment below does not write into a view of series_df
            tmp_df = series_df[series_df['character']==1].reset_index()[['line', 'character']].copy()
        # Replace the binary flag with the character's index (the class label)
        tmp_df['character'] = c

        df_list.append(tmp_df)

    if not df_list:
        raise ValueError('no characters to load: `characters` is empty')

    # Balance the classes on the smallest character
    tot_len = min(len(df) for df in df_list)
    train_len = int(tot_len * train_size)
    test_len = int(tot_len * test_size)
    # The validation split is whatever remains after train and test
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    X_val = []
    y_val = []
    print('Creating merged data...')
    for shuffled_df in tqdm(df_list):
        # Seed the shuffle so the split is reproducible for a given random_state
        shuffled_df = shuffled_df.sample(frac=1, random_state=random_state)
        shuffled_df = shuffled_df.iloc[:tot_len]
        X_train += shuffled_df['line'].iloc[:train_len].tolist()
        y_train += shuffled_df['character'].iloc[:train_len].tolist()
        X_test += shuffled_df['line'].iloc[train_len:train_len + test_len].tolist()
        y_test += shuffled_df['character'].iloc[train_len:train_len + test_len].tolist()
        X_val += shuffled_df['line'].iloc[train_len + test_len:].tolist()
        y_val += shuffled_df['character'].iloc[train_len + test_len:].tolist()

    # Sanity check: the labels must not have collapsed onto class 0
    assert any(y != 0 for y in y_train), 'training labels contain only class 0'

    return X_train, y_train, X_test, y_test, X_val, y_val

In [None]:
# Build the train/test/validation splits from the per-character CSV files.
# With use_triplets=True every sample is a merge of 2 * n shuffled lines of
# the same character (see get_triplet_df).
X_train, y_train, X_test, y_test, X_val, y_val = get_data(
    source_encoded_path=os.path.join('..', 'Data', 'Characters'),
    use_triplets=True, n=n_merged_sentences_x_sample)

In [None]:
# Pretrained sentence encoder that is fine-tuned with triplet loss further down.
# The commented lines are alternative backbones kept for reference.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
# Switch the model to training mode (torch.nn.Module.train).
# model.training = False
model.train()

In [None]:
# Append a projection layer with Identity activation that maps the encoder's
# sentence embedding down to `embedding_size` dimensions.
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=embedding_size, activation_function=Identity())
model.add_module('dense', dense)

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# test_embeddings = model.encode(X_test)
# kmeans_test = KMeans(n_clusters=len(characters), random_state=random_state).fit(test_embeddings)
# y_pred_kmeans_test = kmeans_test.labels_
# ConfusionMatrixDisplay.from_predictions(
#     y_test, 
#     y_pred_kmeans_test, 
#     normalize='pred',
#     display_labels=characters)
# plt.plot()

In [None]:
def get_triplet_dataset(X, y, n_triplets_x_sample, model=None, margin=None, verbose=False):
    """Build (anchor, positive, negative) triplets, optionally semi-hard mined.

    Every sample in ``X`` is used as an anchor. For each anchor,
    ``n_triplets_x_sample`` positives (same label, never the anchor itself)
    and ``n_triplets_x_sample`` negatives (different label) are drawn at
    random, and every positive/negative combination becomes a candidate.

    When ``model`` is given, candidates are filtered by embedding distance and
    only semi-hard triplets are kept, i.e. those with
    ``d(a, p) < d(a, n) < d(a, p) + margin`` (Euclidean distance).

    Args:
        X: sequence of texts.
        y: sequence of labels aligned with ``X``.
        n_triplets_x_sample: positives and negatives drawn per anchor.
        model: optional encoder exposing ``encode``; enables mining.
        margin: triplet margin; required when ``model`` is given.
        verbose: with a model, print dataset size and rejection counters.

    Returns:
        A shuffled list of ``InputExample`` with ``texts=[anchor, positive, negative]``.

    Raises:
        AssertionError: if ``X`` and ``y`` differ in length, or a label has
            too few positives/negatives to draw from.
        ValueError: if ``model`` is given without ``margin``.
    """
    assert len(X)==len(y)
    if model is not None and margin is None:
        raise ValueError('margin is required when a model is given')

    print('Creating triplets...')

    # Group sample INDICES by label once. (Collecting label values instead of
    # indices would make X[pos] / X[neg] index X by the label.)
    idxs_by_label = {}
    for idx, label in enumerate(y):
        idxs_by_label.setdefault(label, []).append(idx)
    neg_idxs_by_label = {
        label: [idx for idx, other in enumerate(y) if other != label]
        for label in idxs_by_label
    }

    # Encode every text once instead of three encode calls per candidate
    embeddings = None
    if model is not None and len(X) > 0:
        embeddings = np.asarray(model.encode(list(X)))

    examples = []
    hard_negatives_count = 0
    easy_positives_count = 0
    for i in tqdm(range(len(X))):
        y_ref = y[i]
        pos_pool = idxs_by_label[y_ref]      # includes the anchor itself
        neg_pool = neg_idxs_by_label[y_ref]
        assert len(pos_pool)>n_triplets_x_sample, 'not enough positives for label %r' % (y_ref,)
        assert len(neg_pool)>=n_triplets_x_sample, 'not enough negatives for label %r' % (y_ref,)

        # Draw one spare positive so the anchor can be discarded if it is picked
        drawn = random.sample(pos_pool, n_triplets_x_sample + 1)
        pos_idxs = [j for j in drawn if j != i][:n_triplets_x_sample]
        neg_idxs = random.sample(neg_pool, n_triplets_x_sample)

        for pos in pos_idxs:
            for neg in neg_idxs:
                positive = X[pos]
                negative = X[neg]

                if embeddings is not None:
                    dist_ap = np.linalg.norm(embeddings[i] - embeddings[pos])
                    dist_an = np.linalg.norm(embeddings[i] - embeddings[neg])

                    if dist_ap < dist_an:
                        if dist_an < dist_ap + margin:
                            # Semi-hard: negative is farther than the positive but within the margin
                            examples.append(InputExample(texts=[X[i], positive, negative]))
                        else:
                            # Already separated by more than the margin
                            easy_positives_count += 1
                    else:
                        # Negative at least as close as the positive: skipped
                        hard_negatives_count += 1
                else:
                    examples.append(InputExample(texts=[X[i], positive, negative]))

    if model is not None and verbose:
        print('Dataset length:      ', len(examples))
        print('Hard negatives count:', hard_negatives_count)
        print('Easy positives count:', easy_positives_count)

    random.shuffle(examples)

    return examples

In [None]:
# train_examples = [InputExample(texts=[X_train[i]], label=y_train[i]) for i in range(len(X_train))]
# test_examples = [InputExample(texts=[X_test[i]], label=y_test[i]) for i in range(len(X_test))]
# val_examples = [InputExample(texts=[X_val[i]], label=y_val[i]) for i in range(len(X_val))]

# Test/validation triplets are built without a model, so every generated
# triplet is kept (no distance-based filtering).
# NOTE(review): neither list is used in the visible cells - confirm they are needed.
# train_examples = get_triplet_dataset(X_train, y_train, n_triplets_x_sample)
test_examples = get_triplet_dataset(X_test, y_test, n_triplets_x_sample)
val_examples = get_triplet_dataset(X_val, y_val, n_triplets_x_sample)

In [None]:
# train_dataset = SentencesDataset(train_examples, model)
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
# train_loss = losses.TripletLoss(
#     model=model, 
#     triplet_margin=margin,
#     distance_metric=TripletDistanceMetric.EUCLIDEAN
#     )


In [None]:
# model.fit([(train_dataloader, train_loss)], epochs=epochs, optimizer_params={'lr': lr}, show_progress_bar=True)

In [None]:
# Triplet loss on Euclidean distance, shared by every training step.
train_loss = losses.TripletLoss(
    model=model,
    triplet_margin=margin,
    distance_metric=TripletDistanceMetric.EUCLIDEAN
    )
# Each step re-mines triplets with the current model and then fine-tunes on
# them. The loop variable gets its own name: reusing
# `n_merged_sentences_x_sample` here would overwrite the configured merge
# width that later cells pass to get_triplet_df.
for step in range(training_steps):
    print('#'*100)
    print(f'step {step+1}/{training_steps}')

    train_examples = get_triplet_dataset(X_train, y_train, n_triplets_x_sample, model, margin, verbose=True)
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

    model.fit([(train_dataloader, train_loss)], epochs=epochs, optimizer_params={'lr': lr}, show_progress_bar=True)

# Persist the fine-tuned embedder
model.save(os.path.join('..', 'Data', 'Metrics', 'distil_bert_embedder'))
    



In [None]:
# Embed the test samples with the fine-tuned model.
test_embeddings = model.encode(X_test)

In [None]:
# Inspect the test embeddings.
test_embeddings

In [None]:
# Cluster the test embeddings into as many clusters as there are characters.
kmeans = KMeans(n_clusters=len(characters), random_state=random_state).fit(test_embeddings)

In [None]:
from scipy.optimize import linear_sum_assignment

# Map every k-means cluster to exactly one character index.
# Probing with a single training sample per character can send two characters
# to the same cluster and leave other clusters unmapped, which later fails
# with a KeyError. Instead, count how the training samples of each character
# fall into each cluster and pick the one-to-one assignment that maximises
# the number of agreeing samples.
train_clusters = kmeans.predict(model.encode(X_train))
contingency = np.zeros((len(characters), len(characters)), dtype=int)
# Rows are cluster ids, columns are character indices
np.add.at(contingency, (train_clusters, np.asarray(y_train)), 1)
cluster_ids, char_ids = linear_sum_assignment(contingency, maximize=True)
cluster_to_char = {int(k): int(c) for k, c in zip(cluster_ids, char_ids)}

In [None]:
# Inspect the cluster id -> character index mapping.
cluster_to_char

In [None]:
# Translate each test sample's cluster id into its predicted character index.
y_pred_kmeans = [cluster_to_char[cluster_id] for cluster_id in kmeans.labels_]

In [None]:
# Preview the predictions instead of dumping the whole list into the output.
y_pred_kmeans[:20]

In [None]:
# Row-normalised confusion matrix of the clustering on the test split.
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred_kmeans,
    normalize='true',
    display_labels=characters)
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves
# an empty list as the cell output.
plt.show()

In [None]:
from os.path import join
from transformers import DataCollatorForLanguageModeling, AutoTokenizer, TFAutoModelForCausalLM, AdamWeightDecay
from lib.BBData import character_dict, model_name
from lib.BBDataLoad import dialogpt_preprocess_function, load_char_df, get_chatbot_predictions, merge_df_for_metrics
from lib.wip.frequency import sentence_preprocess

# Collect, for every character, the chatbot's sampled replies to the test
# questions; the preprocessed replies end up in `doc_test[character]`.
base_folder = '..'
out_folder = os.path.join(base_folder, 'Data', 'Characters')
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=join("..", "cache"))
# Use '#' as the padding token
tokenizer.pad_token = '#'
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer, return_tensors='tf')


### create dataset
n_tests = 1
# One list of preprocessed reply sets per character (one entry per test run)
doc_test = {c:[] for c in characters}
# NOTE(review): this rebinds the batch_size set in the configuration cell (64).
batch_size = 128
# When False no chatbot model is loaded (model_chatbot is None below);
# presumably the helper then reads previously stored predictions - confirm.
override_predictions = False
# NOTE(review): `predictions` and `raw_predictions` are never filled in this cell.
predictions = {c:[] for c in characters}
raw_predictions = {c:[] for c in characters}
print('Creating dataset...')
# Running several tests is refused unless predictions are regenerated
if n_tests > 1 and not override_predictions:
    raise Exception('must override previous predictions if you need more tests')

# NOTE(review): the loop variable rebinds the module-level `character` cache slot.
for character in characters:
    print('Character: ', character)
    for i in range(n_tests):
        print(f'Test {i+1}/{n_tests}')
        character_checkpoint = join(out_folder, character, character_dict[character]['checkpoint_folder'])
        # Load the fine-tuned chatbot only when predictions must be regenerated
        model_chatbot = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=character_checkpoint) if override_predictions else None
        if model_chatbot:
            model_chatbot.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))

        character_hg = load_char_df(character, base_folder)
        # Tokenize every row of the character dataset
        tokenized_character_hg = character_hg.map(lambda row: dialogpt_preprocess_function(row, tokenizer), batched=False)

        # Define tensorflow datasets
        # NOTE(review): encoded_test_set is not used afterwards in this cell.
        encoded_test_set = tokenized_character_hg["test"].to_tf_dataset(
            columns=["input_ids", "attention_mask", "labels"],
            shuffle=False,
            batch_size=batch_size,
            collate_fn=data_collator,
        )

        # Use the test set's first context column as the sample questions
        sample_questions = character_hg['test']['context/0']

        # Sampling generation method
        predictions_sampling = get_chatbot_predictions(
            sample_questions,
            model_chatbot,
            character_dict[character]['prediction_filename'] + '_sampling.json',
            "Sampling",
            character,
            tokenizer,
            base_folder,
            override_predictions=override_predictions
        )
                                                    
        # Keep the sampled predictions and store the first element that
        # sentence_preprocess returns for each of them
        sentences = merge_df_for_metrics(character_hg['test'], None, None, predictions_sampling, tokenizer)['prd_sampling'].tolist()
        doc_test[character].append([sentence_preprocess(s)[0] for s in sentences])

In [None]:
from sklearn.utils import shuffle

# Turn the chatbot replies into merged-sentence samples, labelled with the
# index of the character that produced them.
df_list = []
X_test_chatbot, y_test_chatbot = [], []
for c in tqdm(range(len(characters))):
    # Replies from the first (index 0) test run of this character
    lines = doc_test[characters[c]][0]
    # Every reply counts as a positive line (character == 1) for get_triplet_df
    series_df = pd.DataFrame({'character': [1] * len(lines), 'line': lines})

    tmp_df = get_triplet_df(
        series_df,
        n_shuffles=10,
        random_state=random_state,
        n=n_merged_sentences_x_sample,
    )
    X_test_chatbot.extend(tmp_df['line'].tolist())
    y_test_chatbot.extend([c] * len(tmp_df))

# Shuffle texts and labels together
X_test_chatbot, y_test_chatbot = shuffle(X_test_chatbot, y_test_chatbot, random_state=random_state)


In [None]:
# Embed the chatbot-generated samples with the fine-tuned model.
test_chat_embeddings = model.encode(X_test_chatbot)

In [None]:
# Assign each chatbot sample to its nearest cluster and translate the cluster
# id into a character index.
y_pred_kmeans_chat = [
    cluster_to_char[cluster_id]
    for cluster_id in kmeans.predict(test_chat_embeddings)
]

In [None]:
# Row-normalised confusion matrix of the clustering on the chatbot-generated samples.
ConfusionMatrixDisplay.from_predictions(
    y_test_chatbot,
    y_pred_kmeans_chat,
    normalize='true',
    display_labels=characters)
# plt.show() renders the figure; a bare plt.plot() draws nothing and leaves
# an empty list as the cell output.
plt.show()

In [None]:
# Powering off the host is destructive, so it is opt-in: set the environment
# variable SHUTDOWN_WHEN_DONE=1 before starting the kernel (e.g. for an
# unattended run on a rented machine). Otherwise "Run All" leaves the machine up.
if os.environ.get('SHUTDOWN_WHEN_DONE') == '1':
    os.system('shutdown -h')