In [None]:
'''
This notebook was run in a Google Colab environment.
This notebook contains the code for the second experiment
'''

'\nThis notebook was run in a Google Colab environement.\nThis notebook contains the code for the first experiment\n'

In [1]:
## Colab cell
# Mount Google Drive so the project folder is reachable from the Colab runtime
from google.colab import drive
import os

drive.mount('/content/drive')
# Work from the project folder: the relative paths used by later cells (e.g. "data/...") resolve against it
os.chdir("/content/drive/MyDrive/Research Project")
# Install the dependencies into the runtime; only gpflow is pinned to a specific version
!pip3 install tensorflow
!pip3 install gpflow==2.9.1
!pip3 install sentence-transformers
!pip3 install langchain_text_splitters

Mounted at /content/drive
Collecting gpflow==2.9.1
  Downloading gpflow-2.9.1-py3-none-any.whl.metadata (13 kB)
Collecting check-shapes>=1.0.0 (from gpflow==2.9.1)
  Downloading check_shapes-1.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting deprecated (from gpflow==2.9.1)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting dropstackframe>=0.1.0 (from check-shapes>=1.0.0->gpflow==2.9.1)
  Downloading dropstackframe-0.1.1-py3-none-any.whl.metadata (4.3 kB)
Collecting lark<2.0.0,>=1.1.0 (from check-shapes>=1.0.0->gpflow==2.9.1)
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Downloading gpflow-2.9.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.6/380.6 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading check_shapes-1.1.1-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Depreca

# Generate embeddings

In [7]:
# read in data
import pandas as pd
import ast
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer
# Load the raw results table; the preprocessing below reads its "true_answer" and "llm_answer" columns
results_df = pd.read_csv("data/mistral_run2_by_topic.csv")
print(results_df.shape)
# preprocessing of true answer
def preprocess_true_answer(row):
    '''
    Recover the reference answer from its stringified form in the CSV.

    The "true_answer" cell holds the text of a list (e.g. "['e.g']"); it is parsed
    back into a Python object and its first element is returned ("['e.g']" -> 'e.g').
    An empty list yields the fixed sentence "Context does not contain the answer.".
    '''
    parsed_answers = ast.literal_eval(row["true_answer"])
    if len(parsed_answers) == 0:
        # no reference answer stored: use the fixed "no answer" sentence
        return "Context does not contain the answer."
    return parsed_answers[0]

def preprocess_llm_answer(row):
    '''
    Normalise the LLM answer of a row.

    Any answer that contains the phrase "Context does not contain the answer" is
    collapsed to exactly "Context does not contain the answer."; every other answer
    is returned unchanged.
    '''
    raw_answer = row["llm_answer"]
    says_unanswerable = "Context does not contain the answer" in raw_answer
    return "Context does not contain the answer." if says_unanswerable else raw_answer

# Replace both answer columns with their cleaned versions (the helpers are applied row by row)
results_df["true_answer"] = results_df.apply(preprocess_true_answer, axis=1)
results_df["llm_answer"] = results_df.apply(preprocess_llm_answer, axis=1)

  from tqdm.autonotebook import tqdm, trange


(11851, 6)


In [8]:
def get_top_topics_df(df, num_topics):
    '''
    Restrict a dataframe to its most frequent topics.

    The (num_topics) values of the "topic" column with the highest row counts are
    selected; only rows belonging to those topics are kept, and the result is
    returned with a fresh 0..n-1 index.
    '''
    largest_topics = df["topic"].value_counts().nlargest(num_topics).index
    keep_row = df["topic"].isin(largest_topics)
    return df[keep_row].reset_index(drop=True)

# Keep only the 11 topics with the most rows; the returned frame has a fresh 0..n-1 index
results_df = get_top_topics_df(results_df, num_topics=11) # get 11 topics
results_df.shape

(7325, 6)

In [None]:
# Remove question+context pairs that are too long for the mpnet sentence transformer

text_splitter = SentenceTransformersTokenTextSplitter(model_name="sentence-transformers/all-mpnet-base-v2") # tokenise according to mpnet-base ST
all_questions = results_df["question"].tolist()
all_contexts = results_df["context"].tolist()
all_question_context_combined = [f"{question} {context}" for question,context in zip(all_questions, all_contexts)] # string strategy

# Token count per combined string, reduced by 2 (presumably the tokenizer's start/end special tokens - confirm against the splitter)
num_tokens_question_context_combined = [text_splitter.count_tokens(text=entry) - 2 for entry in all_question_context_combined] # count tokens for string strategy
# Positions whose reduced count exceeds 382, i.e. whose raw count exceeds 384; these inputs would be truncated
indices_too_long = [index for index, token_count in enumerate(num_tokens_question_context_combined) if token_count > 382] # raw token count > 384

# The positions are used as index labels here, which relies on results_df carrying a default 0..n-1 index
results_df = results_df.drop(indices_too_long).reset_index(drop=True)
results_df.shape

In [None]:
# Same length filter as for mpnet, now with the distilroberta tokenizer and its larger limit
text_splitter = SentenceTransformersTokenTextSplitter(model_name="sentence-transformers/all-distilroberta-v1") # tokenise according to distilroberta ST
all_questions = results_df["question"].tolist()
all_contexts = results_df["context"].tolist()
all_question_context_combined = [f"{question} {context}" for question,context in zip(all_questions, all_contexts)] # string strategy

# Token count per combined string, reduced by 2 (presumably the tokenizer's start/end special tokens - confirm against the splitter)
num_tokens_question_context_combined = [text_splitter.count_tokens(text=entry) - 2 for entry in all_question_context_combined] # count tokens for string strategy
# Positions whose reduced count exceeds 510, i.e. whose raw count exceeds 512; these inputs would be truncated
indices_too_long = [index for index, token_count in enumerate(num_tokens_question_context_combined) if token_count > 510] # raw token count > 512
# The positions are used as index labels here, which relies on results_df carrying a default 0..n-1 index
results_df = results_df.drop(indices_too_long).reset_index(drop=True)

print(results_df.shape)

In [None]:
def get_embedding(sentences_list, model):
    '''
    Embed a list of sentences using the model's multi-process encoding pool.

    Parameters
    ----------
    sentences_list : list of str
        Texts to embed.
    model : object
        Must provide start_multi_process_pool(), encode_multi_process(sentences, pool)
        and stop_multi_process_pool(pool) (a SentenceTransformer in this notebook).

    Returns
    -------
    list
        The encoder output converted with .tolist(), one entry per input sentence.

    The worker pool is stopped even when encoding raises, so a failed run does not
    leave worker processes behind.
    '''
    pool = model.start_multi_process_pool()
    try:
        embeddings = model.encode_multi_process(sentences_list, pool)
    finally:
        # always release the workers, also on error
        model.stop_multi_process_pool(pool)
    return embeddings.tolist()

## Create input embeddings
all_questions = results_df["question"].tolist()
all_contexts = results_df["context"].tolist()
# "string" strategy input: question and context joined by a single space and embedded as one text
all_question_context_combined = [f"{question} {context}" for question,context in zip(all_questions, all_contexts)]

# load sentence transformer model
# (the keys double as the suffix of the embedding column names and are reused when the targets are computed)
model_list = {
    "mpnet" : SentenceTransformer("all-mpnet-base-v2"),
    "distill_roberta" : SentenceTransformer("all-distilroberta-v1")
}

for input_representation in ["mpnet", "distill_roberta"]:
    print("Input ", input_representation)
    # string strat: one embedding for the combined question+context text
    results_df[f"question_and_context_{input_representation}"] = get_embedding(all_question_context_combined, model_list[input_representation])

    # concat strat: separate embeddings for question and context (they are stacked side by side when the GP inputs are built)
    results_df[f"question_{input_representation}"] = get_embedding(all_questions, model_list[input_representation])
    results_df[f"context_{input_representation}"] = get_embedding(all_contexts, model_list[input_representation])

In [None]:
def calc_similarity_score_column(column1, column2, model):
    '''
    Score two equally long lists of strings against each other, entry by entry.

    Both lists (dataset answers and LLM answers in this notebook) are encoded with
    `model`; the model's pairwise similarity of the two embedding sets is returned
    after conversion with .numpy().
    '''
    first_embeddings, second_embeddings = model.encode(column1), model.encode(column2)
    return model.similarity_pairwise(first_embeddings, second_embeddings).numpy()

# calculate targets
all_true_answers_list = results_df["true_answer"].tolist()
all_llm_answers_list = results_df["llm_answer"].tolist()

# One target column per sentence transformer: similarity between the reference answer and the LLM answer
for target_representation in ["mpnet", "distill_roberta"]:
    results_df[f"{target_representation}_score"] = calc_similarity_score_column(all_true_answers_list, all_llm_answers_list, model_list[target_representation])


In [None]:
# Persist inputs and targets. The embedding columns hold Python lists, which CSV stores as text; they are parsed back with ast.literal_eval when the file is read again.
results_df.to_csv("data/selected_topics_embeddings_multiple_inputs.csv", index=False) # save to csv

# Fit GP

In [2]:
# Import
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from gpflow.models import GPR
from gpflow.kernels import RationalQuadratic
from gpflow.mean_functions import Zero
import matplotlib.pyplot as plt
import time
import pickle
import scipy.stats as stats
import gpflow

SEED = 2504

# Read in generated embeddings and targets
results_df = pd.read_csv("data/selected_topics_embeddings_multiple_inputs.csv") # Dataset for experiment 2


## correctly read in saved embeddings
# The CSV stores every embedding as the text of a list, so each embedding column must be
# parsed back into a list of floats before it can be stacked into a numeric array.
# All six columns are parsed because the experiment loops below read the
# question / context / question_and_context embeddings of both sentence transformers.
embedding_columns = [
    f"{prefix}_{representation}"
    for representation in ("mpnet", "distill_roberta")
    for prefix in ("question", "context", "question_and_context")
]
for column in embedding_columns:
    if column in results_df.columns: # tolerate a CSV that was generated with fewer representations
        results_df[column] = [ast.literal_eval(x) for x in results_df[column]]

# create OHE vector (one list of 0/1 ints per row, one position per topic)
ohe_topics = pd.get_dummies(results_df['topic'], prefix='topic', dtype="int")
results_df["topic_OHE"] = ohe_topics.values.tolist()

In [None]:
def preds_vs_truth_plot(train_true_vals, train_pred_vals, test_true_vals, test_pred_vals, file_name=None):
    '''
    Plot predicted against true scores for the train and the test set, side by side.

    Each panel contains the scatter of (true, predicted) points and a red identity
    line ("Perfect predictions") spanning the joint range of the true and predicted
    values of that panel.

    Parameters
    ----------
    train_true_vals, train_pred_vals : array-like
        True and predicted scores of the training set (left panel).
    test_true_vals, test_pred_vals : array-like
        True and predicted scores of the test set (right panel).
    file_name : str, optional
        If given, the figure is saved to this path before it is shown.
    '''
    fig, axs = plt.subplots(1, 2, figsize=(12,8))

    panels = [
        ("Train", train_true_vals, train_pred_vals),
        ("Test", test_true_vals, test_pred_vals),
    ]

    for ax, (split_name, true_vals, pred_vals) in zip(axs, panels):
        # scatter of the data points; the legend entry names the split that is actually shown
        ax.scatter(true_vals, pred_vals, alpha=0.4, label=f"{split_name} data", color="blue")

        # line of perfect predictions for comparison; two end points are enough for a straight line
        lower = min(np.min(true_vals), np.min(pred_vals))
        upper = max(np.max(true_vals), np.max(pred_vals))
        ax.plot([lower, upper], [lower, upper], label="Perfect predictions", color="red")

        ax.set_xlabel("True score")
        ax.set_ylabel("Predicted score")
        ax.legend(fontsize="x-small")
        ax.set_title(split_name, fontsize=10)

    plt.tight_layout()
    if file_name is not None:
        plt.savefig(file_name)

    plt.show()

In [None]:
def gp_model_init(x_train, y_train, lengthscale, alpha, noise):
    '''
    Build a GP regression model on the given training data.

    The model uses a RationalQuadratic kernel initialised with `lengthscale` and
    `alpha`, a zero mean function, and `noise` as the initial noise variance.
    '''
    rq_kernel = RationalQuadratic(lengthscales=lengthscale, alpha=alpha)
    zero_mean = Zero()
    return GPR(data=(x_train, y_train), kernel=rq_kernel, mean_function=zero_mean, noise_variance=noise)

def fit_gp_function(results_df, all_gp_inputs, target, save_folder, data_category, condition=None, num_restarts=20):
    '''
    Fit a GP regressor with random restarts and report its test-set metrics.

    Steps: pick the targets and topics (optionally restricted by `condition`), make a
    topic-stratified 80/20 train-test split, standardise inputs and targets with the
    training statistics, optimise the GP from `num_restarts` random log-uniform
    initialisations, keep the model with the highest log marginal likelihood and
    evaluate MAE / RMSE / R2 on the test set in the original target scale.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the `target` column and a "topic" column.
    all_gp_inputs : numpy.ndarray
        2-D input matrix; its rows must line up with the rows of `results_df`
        (after `condition` has been applied, if given).
    target : str
        Name of the target column.
    save_folder : str
        Path prefix of the output files; only referenced by the saving code that is
        currently commented out.
    data_category : str
        Label of the data subset; not used inside the function.
    condition : boolean mask, optional
        Row selection applied to `results_df` (e.g. answerable questions only).
    num_restarts : int
        Number of random initialisations.

    Returns
    -------
    pandas.DataFrame
        One row with the columns "mae", "rmse" and "r2".

    Raises
    ------
    RuntimeError
        If no restart produced a log marginal likelihood above -inf
        (e.g. `num_restarts` is 0 or every value was NaN).
    '''
    best_model, best_log_marginal_likelihood_val = None, -np.inf

    # Answerable or unanswerable data when a condition is given, full data otherwise
    subset_df = results_df[condition] if condition is not None else results_df
    all_targets = subset_df[target].to_numpy().reshape(-1, 1)
    title_col = subset_df["topic"]

    # 80-20 train-test split, stratified by topic
    x_train, x_test, y_train, y_test= train_test_split(all_gp_inputs, all_targets, test_size=0.20,
                                                       stratify=title_col, random_state=SEED)

    # scale (statistics come from the training part only)
    x_scaler = StandardScaler()
    x_train_scaled = x_scaler.fit_transform(x_train)
    x_test_scaled = x_scaler.transform(x_test)

    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)

    for i in range(num_restarts):
        print(f"Restart {i}")
        # generate random initialisation vals
        # NOTE: no random_state is passed, so the draws come from the global random state
        # and differ between runs unless that state is seeded by the caller.
        lengthscale_init = stats.loguniform.rvs(0.01, 100)
        alpha_init = stats.loguniform.rvs(0.01, 100)
        noise_init = stats.loguniform.rvs(0.01, 100)

        print(lengthscale_init, alpha_init, noise_init)

        # initialise GP
        model = gp_model_init(x_train_scaled, y_train_scaled, lengthscale_init, alpha_init, noise_init)
        opt = gpflow.optimizers.Scipy()
        opt.minimize(model.training_loss, model.trainable_variables) # perform optimization
        log_marginal_likelihood_val = model.log_marginal_likelihood().numpy() # return log marginal likelihood
        print(log_marginal_likelihood_val)
        # update if higher log marginal likelihood found
        if log_marginal_likelihood_val > best_log_marginal_likelihood_val:
            best_log_marginal_likelihood_val = log_marginal_likelihood_val
            best_model = model

    if best_model is None:
        # a NaN likelihood never compares greater than -inf, so best_model can stay unset
        raise RuntimeError("No restart produced a valid log marginal likelihood; cannot select a model.")

    mean_train, _ = best_model.predict_y(x_train_scaled)
    mean_test, _ = best_model.predict_y(x_test_scaled)

    # back to the original target scale
    mean_test_rev = y_scaler.inverse_transform(mean_test.numpy())

    # calculate eval metrics
    mae = mean_absolute_error(y_test.squeeze(), mean_test_rev)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated / removed in recent scikit-learn
    rmse = np.sqrt(mean_squared_error(y_test.squeeze(), mean_test_rev))
    r2 = r2_score(y_test.squeeze(), mean_test_rev)

    # preds_vs_truth_plot(y_scaler.inverse_transform(y_train_scaled), y_scaler.inverse_transform(mean_train.numpy()),
    #                     y_test, mean_test_rev)

    print(f"Best Log Likelihood: {best_log_marginal_likelihood_val}")
    print(f"MAE : {mae}")
    print(f"RMSE : {rmse}")
    print(f"R2 : {r2}")

    # preds and metrics (writing them to disk is currently disabled, see the commented lines)
    train_results_df = pd.DataFrame({
        "true" : y_scaler.inverse_transform(y_train_scaled).squeeze(),
        "prediction" : y_scaler.inverse_transform(mean_train.numpy()).squeeze()
    })

    test_results_df = pd.DataFrame({
        "true" : y_test.squeeze(),
        "prediction" : mean_test_rev.squeeze()
    })

    # train_results_df.to_csv(f"{save_folder}_train_vals.csv", index=False)
    # test_results_df.to_csv(f"{save_folder}_test_vals.csv", index=False)

    test_metrics_df = pd.DataFrame({
        "mae" : [mae],
        "rmse" : [rmse],
        "r2" : [r2]
    })

    # test_metrics_df.to_csv(f"{save_folder}_test_metrics.csv", index=False)
    print(test_metrics_df)

    # save optimized hyperparam of best model
    # param_dict = gpflow.utilities.parameter_dict(best_model)
    # with open(f"{save_folder}_param_dict.pkl", "wb") as f:
    #     pickle.dump(param_dict, f)

    return test_metrics_df


In [None]:
all_input_representation = ["mpnet", "distill_roberta"]
all_targets = ["mpnet_score", "distill_roberta_score"]

# Input variants, in the order they are fitted:
# (strategy name, embedding column templates, append the topic one-hot vector?)
input_variants = [
    ("string", ["question_and_context_{}"], False), # String, no OHE
    ("string", ["question_and_context_{}"], True),  # String, OHE
    ("concat", ["question_{}", "context_{}"], False), # Concat, no ohe
    ("concat", ["question_{}", "context_{}"], True),  # concat OHE
]

# Full
for input_representation in all_input_representation:
    print(input_representation)
    for target_representation in all_targets:
        for strategy, column_templates, with_ohe in input_variants:
            # columns whose vectors are stacked side by side to form the GP input
            columns = [template.format(input_representation) for template in column_templates]
            if with_ohe:
                columns.append("topic_OHE")
            all_gp_inputs = np.hstack([np.array(results_df[column].tolist()) for column in columns])

            # OHE runs are stored in an "ohe/" sub-folder
            ohe_dir = "ohe/" if with_ohe else ""
            fit_gp_function(
                results_df,
                all_gp_inputs,
                target=target_representation,
                save_folder=f"experiment_2/{target_representation}/{ohe_dir}full_input_{input_representation}_{strategy}",
                data_category="full",
                condition=None,
            )

In [None]:
all_input_representation = ["mpnet", "distill_roberta"]
all_targets = ["mpnet_score", "distill_roberta_score"]

# Answerable/Unanswerable, swap out condition
answerable_mask = results_df["true_answer"] != "Context does not contain the answer."
answerable_df = results_df[answerable_mask]

# Input variants, in the order they are fitted:
# (strategy name, embedding column templates, append the topic one-hot vector?)
input_variants = [
    ("string", ["question_and_context_{}"], False), # String, no OHE
    ("string", ["question_and_context_{}"], True),  # String, OHE
    ("concat", ["question_{}", "context_{}"], False), # Concat, no ohe
    ("concat", ["question_{}", "context_{}"], True),  # concat OHE
]

for input_representation in all_input_representation:
    print(input_representation)
    for target_representation in all_targets:
        for strategy, column_templates, with_ohe in input_variants:
            # columns whose vectors are stacked side by side to form the GP input
            columns = [template.format(input_representation) for template in column_templates]
            if with_ohe:
                columns.append("topic_OHE")
            # inputs are built from the selected rows only, so they line up with the masked targets
            all_gp_inputs = np.hstack([np.array(answerable_df[column].tolist()) for column in columns])

            # OHE runs are stored in an "ohe/" sub-folder
            ohe_dir = "ohe/" if with_ohe else ""
            fit_gp_function(
                results_df,
                all_gp_inputs,
                target=target_representation,
                save_folder=f"experiment_2/{target_representation}/{ohe_dir}answerable_input_{input_representation}_{strategy}",
                data_category="answerable",
                condition=answerable_mask,
            )

In [None]:
# dummy regressor (predicts mean) - baseline evaluated on the same split as the GP
from sklearn.dummy import DummyRegressor
input_representation = "mpnet"
target_representation = "distill_roberta_score"
all_targets = results_df[target_representation].to_numpy().reshape(-1, 1)
title_col = results_df["topic"]
all_gp_inputs = np.hstack([
            np.array(results_df[f"question_{input_representation}"].tolist()),
            np.array(results_df[f"context_{input_representation}"].tolist()),
            ])

# 80-20 train-test split
x_train, x_test, y_train, y_test= train_test_split(all_gp_inputs, all_targets, test_size=0.20,
                                                   stratify=title_col, random_state=SEED)

dummy_reg = DummyRegressor(strategy="mean")
dummy_reg.fit(x_train, y_train.ravel())

mean_test = dummy_reg.predict(x_test).reshape(-1,1)

# calculate eval metrics
mae = mean_absolute_error(y_test.squeeze(), mean_test)
# RMSE as sqrt(MSE): the `squared=False` keyword is deprecated / removed in recent scikit-learn
rmse = np.sqrt(mean_squared_error(y_test.squeeze(), mean_test))
r2 = r2_score(y_test.squeeze(), mean_test)

print(f"MAE : {mae}")
print(f"RMSE : {rmse}")
print(f"R2 : {r2}")

# PCA mini study


In [7]:
from sklearn.decomposition import PCA

def gp_model_init(x_train, y_train, lengthscale, alpha, noise):
    '''
    Build a GP regression model on the given training data.

    The model uses a RationalQuadratic kernel initialised with `lengthscale` and
    `alpha`, a zero mean function, and `noise` as the initial noise variance.
    '''
    rq_kernel = RationalQuadratic(lengthscales=lengthscale, alpha=alpha)
    zero_mean = Zero()
    return GPR(data=(x_train, y_train), kernel=rq_kernel, mean_function=zero_mean, noise_variance=noise)

def fit_gp_function(results_df, all_gp_inputs, target, num_restarts=4, pca_study=None):
    '''
    PCA variant of the GP fit: reduce the inputs with PCA, fit a GP with random
    restarts and return the test RMSE.

    Steps: topic-stratified 80/20 split of the full data, standardisation with the
    training statistics, PCA fitted on the scaled training inputs, `num_restarts`
    optimisations from random log-uniform initialisations, selection of the model
    with the highest log marginal likelihood, RMSE on the test set in the original
    target scale.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the `target` column and a "topic" column.
    all_gp_inputs : numpy.ndarray
        2-D input matrix whose rows line up with the rows of `results_df`.
    target : str
        Name of the target column.
    num_restarts : int
        Number of random initialisations.
    pca_study : optional
        Passed to PCA as `n_components`.

    Returns
    -------
    float
        Test-set RMSE.

    Raises
    ------
    RuntimeError
        If no restart produced a log marginal likelihood above -inf
        (e.g. `num_restarts` is 0 or every value was NaN).
    '''
    best_model, best_log_marginal_likelihood_val = None, -np.inf
    print("PCA")
    # full data
    all_targets = results_df[target].to_numpy().reshape(-1, 1)
    title_col = results_df["topic"]

    # 80-20 train-test split, stratified by topic
    x_train, x_test, y_train, y_test= train_test_split(all_gp_inputs, all_targets, test_size=0.20, stratify=title_col, random_state=SEED)

    # scale (statistics come from the training part only)
    x_scaler = StandardScaler()
    x_train_scaled = x_scaler.fit_transform(x_train)
    x_test_scaled = x_scaler.transform(x_test)

    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)

    # dimensionality reduction, fitted on the training inputs and applied to both parts
    pca = PCA(n_components=pca_study)
    x_train_scaled = pca.fit_transform(x_train_scaled)
    x_test_scaled = pca.transform(x_test_scaled)

    for i in range(num_restarts):
        print(f"Restart {i}")
        # generate random initialisation vals
        # NOTE: no random_state is passed, so the draws come from the global random state
        # and differ between runs unless that state is seeded by the caller.
        lengthscale_init = stats.loguniform.rvs(0.01, 100)
        alpha_init = stats.loguniform.rvs(0.01, 100)
        noise_init = stats.loguniform.rvs(0.01, 100)

        # initialise GP
        model = gp_model_init(x_train_scaled, y_train_scaled, lengthscale_init, alpha_init, noise_init)
        opt = gpflow.optimizers.Scipy()
        opt.minimize(model.training_loss, model.trainable_variables) # perform optimization
        log_marginal_likelihood_val = model.log_marginal_likelihood().numpy() # return log marginal likelihood
        print(log_marginal_likelihood_val)

        # update if higher log marginal likelihood found
        if log_marginal_likelihood_val > best_log_marginal_likelihood_val:
            best_log_marginal_likelihood_val = log_marginal_likelihood_val
            best_model = model

    if best_model is None:
        # a NaN likelihood never compares greater than -inf, so best_model can stay unset
        raise RuntimeError("No restart produced a valid log marginal likelihood; cannot select a model.")

    # only the test predictions are needed for the returned metric
    mean_test, _ = best_model.predict_y(x_test_scaled)
    mean_test_rev = y_scaler.inverse_transform(mean_test.numpy())

    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated / removed in recent scikit-learn
    rmse = np.sqrt(mean_squared_error(y_test.squeeze(), mean_test_rev))

    return rmse


In [None]:
# Numbers of principal components to evaluate
pca_dimensions = [1, 200, 400, 600, 800, 1000, 1200, 1400, 1547]
rmse_results = []
input_representation = "mpnet"
# concat strategy with OHE: question embedding | context embedding | topic one-hot
all_gp_inputs = np.hstack([np.array(results_df[f"question_{input_representation}"].tolist()),
                           np.array(results_df[f"context_{input_representation}"].tolist()),
                           np.array(results_df["topic_OHE"].tolist())
])

# one GP fit per PCA dimensionality; the test RMSE of each fit is collected
for dim in pca_dimensions:
    rmse = fit_gp_function(results_df,
                           all_gp_inputs,
                           target="mpnet_score",
                           num_restarts=5,
                           pca_study=dim)
    rmse_results.append(rmse)

# tabulate and save the results (the original referenced the misspelt, undefined name `pca_dimesions`)
df_res = pd.DataFrame({
    "PCA_Dimensions" : pca_dimensions,
    "RMSE" : rmse_results
})
df_res.to_csv("pca_dimensions_vs_rmse.csv", index=False)

plt.figure(figsize=(10, 6))
plt.plot(pca_dimensions, rmse_results, marker='o')
plt.title('RMSE for varying input dimensions using PCA')
plt.xlabel('Number of input dimensions')
plt.ylabel('RMSE')
plt.grid(True)
plt.savefig("PCA_DIM_plot.pdf")
plt.show()

# Error Analysis

In [2]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
SEED = 2504

input_representation = "mpnet"
target_representation = "mpnet_score"

# read in predictions and data
# Only the test-set predictions are used by the error analysis. The original cell also read
# the train CSV, but immediately overwrote it with the test CSV under the same name.
test_preds = pd.read_csv(f"experiment_2/{target_representation}/ohe/full_input_{input_representation}_concat_test_vals.csv")
# Backward-compatible alias: the residual computation further down still reads the
# test predictions through the historical (misleading) name `train_preds`.
train_preds = test_preds
results_df = pd.read_csv("data/selected_topics_embeddings_multiple_inputs.csv") # dataset for experiment 2 (11 topic dataset)

# read in numpy arrays correctly from csv (embeddings are stored as the text of a list)
results_df["question_mpnet"] = [ast.literal_eval(x) for x in results_df["question_mpnet"]]
results_df["context_mpnet"] = [ast.literal_eval(x) for x in results_df["context_mpnet"]]

ohe_topics = pd.get_dummies(results_df["topic"], prefix="topic", dtype="int") # create OHE
results_df["topic_OHE"] = ohe_topics.values.tolist() # OHE vector

In [3]:
# input and target setup: question embedding | context embedding | topic one-hot, stacked per row
embedding_columns = [f"question_{input_representation}", f"context_{input_representation}", "topic_OHE"]
all_gp_inputs = np.hstack([np.array(results_df[column].tolist()) for column in embedding_columns])
all_gp_targets = results_df[target_representation].to_numpy().reshape(-1, 1)
topic_col = results_df["topic"]

# create train-test split to get test set
# the row positions are split alongside the data so each test row can be traced back to results_df
row_positions = np.arange(len(all_gp_targets))
x_train, x_test, y_train, y_test, train_indices, test_indices = train_test_split(
    all_gp_inputs,
    all_gp_targets,
    row_positions,
    stratify=topic_col,
    test_size=0.2,
    random_state=SEED,
)

In [4]:
# NOTE: despite its name, `train_preds` holds the TEST-set predictions at this point
# (the loading cell assigns the *_test_vals.csv file to it last).
test_errors = train_preds["true"].to_numpy() - train_preds["prediction"].to_numpy()

# create dataframe with residuals
# replace commands help provide clean legends
# topics are looked up through test_indices, so the rows of the predictions file must be in the same order as the recreated test split
test_res_df = pd.DataFrame({
    "topic" : [title_entry.replace(",", "").replace("_", " ") for title_entry in topic_col.iloc[test_indices]],
    "error" : test_errors,
    "absolute_error" : np.abs(test_errors),
    "squared_error" : np.square(test_errors),
})


In [5]:
## Study 1
# RMSE per topic: square root of the mean squared residual within each topic
squared_error_by_topic = test_res_df.groupby("topic")["squared_error"]
test_rmse_topic = np.sqrt(squared_error_by_topic.mean())
# number of test datapoints per topic
test_counts = test_res_df.groupby("topic").size()

In [None]:
test_rmse_best = test_rmse_topic.idxmin()
test_rmse_worst = test_rmse_topic.idxmax()

# create plot
plt.figure(figsize=(7, 7))

# worst topic in red, best in green, all others in grey
# (the best entry is listed last so that green wins if both refer to the same topic)
highlight_colours = {test_rmse_worst: "red", test_rmse_best: "green"}
bar_colours = [highlight_colours.get(topic, "grey") for topic in test_rmse_topic.index]
bar_chart = plt.bar(test_rmse_topic.index, test_rmse_topic.values, color=bar_colours)
plt.xlabel("Topic")
plt.ylabel("RMSE")
plt.xticks(test_rmse_topic.index, rotation=90, fontsize=7)
# add topic count on top of each bar
for bar, count in zip(bar_chart, test_counts.values):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height()
    plt.text(label_x, label_y, f"{count}", ha='center', va='bottom', fontsize=7)

plt.title("Test RMSE breakdown by topic")
plt.tight_layout()
# plt.savefig("error_analysis_rmse_topic_breakdown.pdf")

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns
test_topics = test_res_df["topic"]
unique_topics = test_topics.unique()
# assign one colour per topic; the palette size follows the number of topics instead of a fixed 11
topic_colours = {topic:colour for topic,colour in zip(unique_topics, sns.color_palette("tab20", len(unique_topics)))}

# project the test inputs onto their first two principal components (seeded so the figure is reproducible)
test_embeddings_viz = PCA(n_components=2, random_state=SEED).fit_transform(x_test)

plt.figure(figsize=(8, 5))

for topic in unique_topics:
    topic_mask = (test_topics == topic).to_numpy() # rows of the test set that belong to this topic
    plt.scatter(test_embeddings_viz[topic_mask, 0], test_embeddings_viz[topic_mask, 1], label=topic, color=topic_colours[topic]) # plot 2d points coloured by topic

plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("2D PCA visualisation of test embeddings by topic")
plt.legend(loc="best", bbox_to_anchor=(1, 1))
plt.grid(True)
# layout is computed after title and legend exist so that it can account for them
plt.tight_layout()
plt.savefig("2d_pca_visualisation.pdf")
plt.show()