In [None]:
'''
This notebook was run in a Google Colab environment.
This notebook contains the code for the first experiment
'''

'\nThis notebook was run in a Google Colab environement.\nThis notebook contains the code for the first experiment\n'

In [1]:
## Colab cell
# Mount Google Drive and make the project folder the working directory, so the
# relative "data/..." paths used by the later cells resolve inside that folder.
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir("/content/drive/MyDrive/Research Project")
# Install dependencies: gpflow is pinned to 2.9.1, tensorflow is left unpinned.
!pip3 install tensorflow
!pip3 install gpflow==2.9.1
# NOTE(review): the "Generate embeddings" cells import sentence_transformers and
# langchain_text_splitters; re-enable the two installs below if the runtime does
# not already provide them.
# !pip3 install sentence-transformers
# !pip3 install langchain_text_splitters

Mounted at /content/drive
Collecting gpflow==2.9.1
  Downloading gpflow-2.9.1-py3-none-any.whl.metadata (13 kB)
Collecting check-shapes>=1.0.0 (from gpflow==2.9.1)
  Downloading check_shapes-1.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting deprecated (from gpflow==2.9.1)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting dropstackframe>=0.1.0 (from check-shapes>=1.0.0->gpflow==2.9.1)
  Downloading dropstackframe-0.1.1-py3-none-any.whl.metadata (4.3 kB)
Collecting lark<2.0.0,>=1.1.0 (from check-shapes>=1.0.0->gpflow==2.9.1)
  Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)
Downloading gpflow-2.9.1-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.6/380.6 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading check_shapes-1.1.1-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Depreca

# Generate embeddings

In [None]:
# read in data
import pandas as pd
import ast
from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer
# Load the LLM results file and keep only its first 7500 rows (positional slice).
results_df = pd.read_csv("data/mistral_results_run2.csv")[:7500] # only considering 7500 points

# preprocessing of true answer
def preprocess_true_answer(row):
    '''
    Parse the stringified "true_answer" entry of a row and return its first element.

    The CSV stores the answers as the text form of a list, so the value is parsed
    with ast.literal_eval, e.g. "['e.g']" -> 'e.g'.
    When the parsed list is empty, the placeholder
    "Context does not contain the answer." is returned instead.
    '''
    parsed_answers = ast.literal_eval(row["true_answer"])
    if not len(parsed_answers):
        # no reference answer available -> use the fixed placeholder
        return "Context does not contain the answer."
    return parsed_answers[0]

def preprocess_llm_answer(row):
    '''
    Normalise the "llm_answer" entry of a row.

    Any answer that contains the phrase "Context does not contain the answer"
    is collapsed to exactly "Context does not contain the answer."; every other
    answer is returned unchanged.
    '''
    raw_answer = row["llm_answer"]
    no_answer_phrase = "Context does not contain the answer"
    return "Context does not contain the answer." if no_answer_phrase in raw_answer else raw_answer

# Run both clean-up helpers row by row (axis=1) and overwrite the original columns.
results_df["true_answer"] = results_df.apply(preprocess_true_answer, axis=1)
results_df["llm_answer"] = results_df.apply(preprocess_llm_answer, axis=1)

In [None]:
# Remove long question+contexts
# Rows whose combined "question context" string is too long for the mpnet
# tokenizer are dropped from results_df.

text_splitter = SentenceTransformersTokenTextSplitter(model_name="sentence-transformers/all-mpnet-base-v2") # tokenise according to mpnet-base ST
all_questions = results_df["question"].tolist()
all_contexts = results_df["context"].tolist()
all_question_context_combined = [f"{question} {context}" for question,context in zip(all_questions, all_contexts)] # string strategy: question and context joined by one space

num_tokens_question_context_combined = [text_splitter.count_tokens(text=entry) - 2 for entry in all_question_context_combined] # token count per combined string, minus 2 (presumably the start/end special tokens - TODO confirm)
indices_too_long = [index for index, token_count in enumerate(num_tokens_question_context_combined) if token_count > 382] # positions whose adjusted count exceeds 382 (raw count > 384); presumably these would be truncated by the model

# The positions above are used as index labels, so this relies on results_df
# still having a 0..n-1 index at this point.
results_df = results_df.drop(indices_too_long).reset_index(drop=True)
results_df.shape

In [None]:
# Same length filter as for mpnet, now with the distilroberta tokenizer and a
# larger limit; it runs on the rows that survived the previous filter.
text_splitter = SentenceTransformersTokenTextSplitter(model_name="sentence-transformers/all-distilroberta-v1") # tokenise according to distilroberta ST
all_questions = results_df["question"].tolist()
all_contexts = results_df["context"].tolist()
all_question_context_combined = [f"{question} {context}" for question,context in zip(all_questions, all_contexts)] # string strategy: question and context joined by one space

num_tokens_question_context_combined = [text_splitter.count_tokens(text=entry) - 2 for entry in all_question_context_combined] # token count per combined string, minus 2 (presumably the start/end special tokens - TODO confirm)
indices_too_long = [index for index, token_count in enumerate(num_tokens_question_context_combined) if token_count > 510] # positions whose adjusted count exceeds 510 (raw count > 512); presumably these would be truncated by the model
results_df = results_df.drop(indices_too_long).reset_index(drop=True)

print(results_df.shape)

In [None]:
def get_embedding(sentences_list, model):
    '''
    Produce one embedding per sentence using the model's multi-process pool.

    Parameters
    ----------
    sentences_list : list of str
        Sentences to embed.
    model : object
        Must provide start_multi_process_pool(), encode_multi_process(sentences, pool)
        and stop_multi_process_pool(pool) (the notebook passes SentenceTransformer
        instances).

    Returns
    -------
    list
        Result of calling .tolist() on the value returned by encode_multi_process.

    Any exception raised while encoding is propagated, but the pool is always
    stopped first so that no worker processes are left running.
    '''
    pool = model.start_multi_process_pool()
    try:
        embeddings = model.encode_multi_process(sentences_list, pool)
    finally:
        # Release the workers even when encoding fails; otherwise a failed call
        # would leave the pool's processes alive in the notebook runtime.
        model.stop_multi_process_pool(pool)
    return embeddings.tolist()

## Create input embeddings
# Rebuild the text lists from the filtered results_df so that they line up
# row-for-row with the frame the embeddings are written back into.
all_questions = results_df["question"].tolist()
all_contexts = results_df["context"].tolist()
all_question_context_combined = [f"{question} {context}" for question,context in zip(all_questions, all_contexts)]

# load sentence transformer models (a dict keyed by the short name used in column names)
model_list = {
    "mpnet" : SentenceTransformer("all-mpnet-base-v2"),
    "distill_roberta" : SentenceTransformer("all-distilroberta-v1")
}

for input_representation in ["mpnet", "distill_roberta"]:
    print("Input ", input_representation)
    # string strat: a single embedding of the combined "question context" text
    results_df[f"question_and_context_{input_representation}"] = get_embedding(all_question_context_combined, model_list[input_representation])

    # concat strat: question and context are embedded separately (they are stacked later, when the GP inputs are built)
    results_df[f"question_{input_representation}"] = get_embedding(all_questions, model_list[input_representation])
    results_df[f"context_{input_representation}"] = get_embedding(all_contexts, model_list[input_representation])

In [6]:
def calc_similarity_score_column(column1, column2, model):
    '''
    Score each entry of column1 against the entry of column2 at the same position.

    column1 and column2 are lists of strings (dataset answers and LLM answers).
    Both are encoded with model.encode, the pairwise similarity is computed with
    model.similarity_pairwise, and the result is returned via its .numpy() method.
    '''
    # encode column1 first, then column2 (same order as the arguments)
    encoded_columns = [model.encode(texts) for texts in (column1, column2)]
    return model.similarity_pairwise(*encoded_columns).numpy()

# calculate targets
# One score column per sentence-transformer: the pairwise similarity between the
# dataset answer and the LLM answer of each row.
all_true_answers_list = results_df["true_answer"].tolist()
all_llm_answers_list = results_df["llm_answer"].tolist()

for target_representation in ["mpnet", "distill_roberta"]:
    results_df[f"{target_representation}_score"] = calc_similarity_score_column(all_true_answers_list, all_llm_answers_list, model_list[target_representation])


In [7]:
# Persist inputs and targets; the embedding columns are read back in the
# "Fit GP" section, where they are parsed from text with ast.literal_eval.
results_df.to_csv("data/fullsubset_embeddings_multiple_inputs.csv", index=False) # save to csv

# Fit GP

In [2]:
# Imports
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from gpflow.models import GPR
from gpflow.kernels import RationalQuadratic
from gpflow.mean_functions import Zero
import matplotlib.pyplot as plt
import time
import pickle
import scipy.stats as stats
import gpflow

# Seed passed as random_state to train_test_split in the cells below.
SEED = 2504

# read in generated embeddings and targets
results_df = pd.read_csv("data/fullsubset_embeddings_multiple_inputs.csv") # Dataset

## correctly read in saved embeddings
# Each embedding cell is read from the CSV as text; ast.literal_eval turns it
# back into a Python object (a list of numbers, as written by the cell above).
# mpnet embeddings
results_df["question_mpnet"] = [ast.literal_eval(x) for x in results_df["question_mpnet"]]
results_df["context_mpnet"] = [ast.literal_eval(x) for x in results_df["context_mpnet"]]
results_df["question_and_context_mpnet"] = [ast.literal_eval(x) for x in results_df["question_and_context_mpnet"]]

# distill_roberta embeddings
results_df["question_distill_roberta"] = [ast.literal_eval(x) for x in results_df["question_distill_roberta"]]
results_df["context_distill_roberta"] = [ast.literal_eval(x) for x in results_df["context_distill_roberta"]]
results_df["question_and_context_distill_roberta"] = [ast.literal_eval(x) for x in results_df["question_and_context_distill_roberta"]]


In [None]:
def preds_vs_truth_plot(train_true_vals, train_pred_vals, test_true_vals, test_pred_vals, file_name=None):
    '''
    Draw predicted-vs-true scatter plots for the train and the test split side by side.

    Each panel shows the points in blue and, in red, the diagonal on which
    perfect predictions would lie. The diagonal spans the smallest to the
    largest value found in that panel's true and predicted values.

    Parameters
    ----------
    train_true_vals, train_pred_vals : array-like
        True and predicted scores of the training split (left panel).
    test_true_vals, test_pred_vals : array-like
        True and predicted scores of the test split (right panel).
    file_name : str or None
        When not None, the figure is saved to this path before it is shown.
    '''
    fig, axs = plt.subplots(1, 2, figsize=(12,8))

    panels = (
        ("Train", train_true_vals, train_pred_vals),
        ("Test", test_true_vals, test_pred_vals),
    )

    for ax, (split_name, true_vals, pred_vals) in zip(axs, panels):
        # The legend label follows the split (the test panel used to be
        # mislabelled as "Train data").
        ax.scatter(true_vals, pred_vals, alpha=0.4, label=f"{split_name} data", color="blue")

        # Line of perfect predictions for comparison. Two end points are enough
        # for a straight line and, unlike a 0.001-step range, still give a
        # valid line when the value range is narrower than the step.
        min_value = min(np.min(true_vals), np.min(pred_vals))
        max_value = max(np.max(true_vals), np.max(pred_vals))
        ax.plot([min_value, max_value], [min_value, max_value],
                label="Perfect predictions", color="red")

        ax.set_xlabel("True score")
        ax.set_ylabel("Predicted score")
        ax.legend(fontsize="x-small")
        ax.set_title(split_name, fontsize=10)

    plt.tight_layout()
    if file_name is not None:
        plt.savefig(file_name)

    plt.show()

In [3]:
def gp_model_init(x_train, y_train, lengthscale, alpha, noise):
    '''
    Initialise a GPR model from the given training data and hyperparameters.

    The model uses a RationalQuadratic kernel built from `lengthscale` and
    `alpha`, a zero mean function, and `noise` as the noise variance.
    '''
    rq_kernel = RationalQuadratic(lengthscales=lengthscale, alpha=alpha)
    training_data = (x_train, y_train)
    return GPR(data=training_data, kernel=rq_kernel, mean_function=Zero(), noise_variance=noise)

def fit_gp_function(results_df, all_gp_inputs, target, save_folder, num_restarts=20):
    '''
    Fit a GP regressor with random restarts and report metrics on a held-out test split.

    Steps: 80/20 train-test split (random_state=SEED), standardisation of inputs
    and targets with scalers fitted on the training split only, `num_restarts`
    optimisation runs from log-uniformly drawn initial hyperparameters, selection
    of the run with the highest log marginal likelihood, and evaluation of that
    model on the test split in the original target scale.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Frame holding the target column.
    all_gp_inputs : array-like
        GP inputs, one row per row of `results_df`.
    target : str
        Name of the target column in `results_df`.
    save_folder : str
        Path prefix for result files. Currently unused, because all persistence
        calls in this function are commented out.
    num_restarts : int
        Number of random initialisations to optimise from.

    Returns
    -------
    pandas.DataFrame
        One-row frame with the test "mae", "rmse" and "r2" (also printed).

    Raises
    ------
    RuntimeError
        If no restart produced a usable model (e.g. num_restarts == 0 or every
        log marginal likelihood was NaN).
    '''
    all_targets = results_df[target].to_numpy().reshape(-1, 1)

    # 80-20 train-test split
    x_train, x_test, y_train, y_test = train_test_split(all_gp_inputs, all_targets, test_size=0.20, random_state=SEED)

    # scale data; the scalers only ever see the training split
    x_scaler = StandardScaler()
    x_train_scaled = x_scaler.fit_transform(x_train)
    x_test_scaled = x_scaler.transform(x_test)

    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)

    # Seeded generator so the random initialisations (and hence the selected
    # model) are reproducible; every call starts from the same sequence.
    rng = np.random.default_rng(SEED)

    best_model, best_log_marginal_likelihood_val = None, -np.inf

    # multiple restarts
    for i in range(num_restarts):
        print(f"Restart {i}")
        # generate random initialisation vals
        lengthscale_init = stats.loguniform.rvs(0.01, 100, random_state=rng)
        alpha_init = stats.loguniform.rvs(0.01, 100, random_state=rng)
        noise_init = stats.loguniform.rvs(0.01, 100, random_state=rng)

        # initialise GP
        model = gp_model_init(x_train_scaled, y_train_scaled, lengthscale_init, alpha_init, noise_init)
        opt = gpflow.optimizers.Scipy() # create optimizer
        opt.minimize(model.training_loss, model.trainable_variables) # perform optimization

        log_marginal_likelihood_val = model.log_marginal_likelihood().numpy() # get log marginal likelihood

        # update if higher log marginal likelihood found (a NaN value never wins this comparison)
        if log_marginal_likelihood_val > best_log_marginal_likelihood_val:
            best_log_marginal_likelihood_val = log_marginal_likelihood_val
            best_model = model

    if best_model is None:
        raise RuntimeError(
            f"No GP model could be selected after {num_restarts} restart(s)."
        )

    # get predictions
    mean_train, _ = best_model.predict_y(x_train_scaled)
    mean_test, _ = best_model.predict_y(x_test_scaled)

    mean_test_rev = y_scaler.inverse_transform(mean_test.numpy()) # reverse transform, interpretability reason

    # calculate eval metrics on flat vectors in the original target scale
    y_test_flat = y_test.ravel()
    mean_test_flat = mean_test_rev.ravel()
    mae = mean_absolute_error(y_test_flat, mean_test_flat)
    # RMSE is taken as the square root of the MSE instead of passing
    # squared=False, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_test_flat, mean_test_flat))
    r2 = r2_score(y_test_flat, mean_test_flat)

    # preds_vs_truth_plot(y_scaler.inverse_transform(y_train_scaled), y_scaler.inverse_transform(mean_train.numpy()),
    #                     y_test, mean_test_rev)

    # save preds and metrics
    train_results_df = pd.DataFrame({
        "true" : y_scaler.inverse_transform(y_train_scaled).squeeze(),
        "prediction" : y_scaler.inverse_transform(mean_train.numpy()).squeeze()
    })

    test_results_df = pd.DataFrame({
        "true" : y_test_flat,
        "prediction" : mean_test_flat
    })

    # train_results_df.to_csv(f"{save_folder}_train_vals.csv", index=False)
    # test_results_df.to_csv(f"{save_folder}_test_vals.csv", index=False)

    test_metrics_df = pd.DataFrame({
        # "best_marginal_likelihood" : [best_log_marginal_likelihood_val]
        "mae" : [mae],
        "rmse" : [rmse],
        "r2" : [r2]
    })

    # test_metrics_df.to_csv(f"{save_folder}_test_metrics.csv", index=False)
    print(test_metrics_df)

    # save optimized hyperparam of best model
    # param_dict = gpflow.utilities.parameter_dict(best_model)
    # with open(f"{save_folder}_param_dict.pkl", "wb") as f:
    #     pickle.dump(param_dict, f)

    return test_metrics_df


In [None]:
# Experiment grid: every input embedding model is paired with every target
# score, and each pair is fitted twice - once per input strategy.
all_input_representation = ["mpnet", "distill_roberta"]
all_targets = ["mpnet_score", "distill_roberta_score"]

for input_representation in all_input_representation:
    print(input_representation)
    for target_representation in all_targets:
        print(input_representation, target_representation)
        # String: embedding of the combined "question context" text
        all_gp_inputs = np.array(results_df[f"question_and_context_{input_representation}"].tolist())

        fit_gp_function(results_df,
                all_gp_inputs,
                target = target_representation,
                save_folder=f"experiment_1/{target_representation}/input_{input_representation}_string"
                )

        # Concat: question and context embeddings stacked side by side (column-wise)
        all_gp_inputs = np.hstack([
            np.array(results_df[f"question_{input_representation}"].tolist()),
            np.array(results_df[f"context_{input_representation}"].tolist()),
            ])

        fit_gp_function(results_df,
                all_gp_inputs,
                target = target_representation,
                save_folder=f"experiment_1/{target_representation}/input_{input_representation}_concat"
                )

In [None]:
# dummy regressor (predicts mean)
# Baseline for the GP results: always predicts the mean of the training targets,
# evaluated on the same 80-20 split (same SEED) as the GP fits.
from sklearn.dummy import DummyRegressor
input_representation = "mpnet"
target_representation = "mpnet_score"
all_targets = results_df[target_representation].to_numpy().reshape(-1, 1)

# The inputs do not influence a mean predictor; they are only needed so that
# the split matches the one used for the GP.
all_gp_inputs = np.hstack([
            np.array(results_df[f"question_{input_representation}"].tolist()),
            np.array(results_df[f"context_{input_representation}"].tolist()),
            ])

# 80-20 train-test split
x_train, x_test, y_train, y_test= train_test_split(all_gp_inputs, all_targets, test_size=0.20, random_state=SEED)

dummy_reg = DummyRegressor(strategy="mean")
dummy_reg.fit(x_train, y_train.ravel())

mean_test = dummy_reg.predict(x_test).reshape(-1,1)

# calculate eval metrics
mae = mean_absolute_error(y_test.squeeze(), mean_test)
# RMSE is taken as the square root of the MSE instead of passing squared=False,
# which newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test.squeeze(), mean_test))
r2 = r2_score(y_test.squeeze(), mean_test)

print(f"MAE : {mae}")
print(f"RMSE : {rmse}")
print(f"R2 : {r2}")