In [None]:
# Model imports
from transformers import pipeline as pip

# Data handling imports
import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the reviewed NLB 2020 sustainability-report CSV (semicolon-delimited, UTF-8)
REPORT_CSV_PATH = '../data/clean/sustainability-report-2020-NLB-reviewed.csv'
data = pd.read_csv(REPORT_CSV_PATH, sep=';', encoding='utf-8')

In [None]:
# Keep only rows where the answer, the context and the question are all present.
# LABEL is not part of the subset, so rows with a missing LABEL are kept.
# The index is not reset, so the surviving rows keep their original labels.
required_columns = ['answer', 'context', 'question']
data = data.dropna(subset=required_columns, how='any')

In [None]:
# Load the question-answering pipeline.
# No model name is passed, so transformers picks its default checkpoint for this task.
import torch

# Run on the first GPU when CUDA is available; otherwise fall back to CPU (device=-1)
# instead of failing on machines without a GPU.
qa_device = 0 if torch.cuda.is_available() else -1
qa = pip('question-answering', device=qa_device)

In [None]:
# Sanity check the performance on a single row.
# Rows were dropped above without resetting the index, so label 1200 may no longer
# exist; fall back to the first remaining row rather than raising a KeyError.
SANITY_ROW_LABEL = 1200
sanity_row = data.loc[SANITY_ROW_LABEL] if SANITY_ROW_LABEL in data.index else data.iloc[0]
qa(question=sanity_row['question'], context=sanity_row['context'])

In [None]:
# Build one long string: every context value, in row order, separated by ". "
context_passages = data['context'].tolist()
full_context = ". ".join(context_passages)

In [None]:
# Make a prediction for each question on the full context - sliding window approach.
# The answers are collected in a list and assigned as a column once, so the frame is
# not modified while it is being iterated (pandas advises against writing to a
# DataFrame inside an iterrows loop).
data['prediction'] = [
    qa(question=question, context=full_context)['answer']
    for question in data['question']
]

In [None]:
# Sentence transformer imports
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the sentence-embedding model used below to compare predictions with answers
SENTENCE_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(SENTENCE_MODEL_NAME)

In [None]:
# Score each prediction against its reference answer with the cosine similarity of
# their sentence embeddings. This is a sentence-level embedding similarity, not the
# token-level BERTScore metric.
# Each column is encoded in a single batched call instead of one encode call per row.
prediction_embeddings = np.asarray(model.encode(data['prediction'].tolist()))
answer_embeddings = np.asarray(model.encode(data['answer'].tolist()))

# Row-wise cosine: dot product divided by the product of the L2 norms.
# The denominator is floored so an all-zero embedding cannot cause a division by zero.
dot_products = np.sum(prediction_embeddings * answer_embeddings, axis=1)
norm_products = (
    np.linalg.norm(prediction_embeddings, axis=1)
    * np.linalg.norm(answer_embeddings, axis=1)
)
data['similarity_score'] = (dot_products / np.maximum(norm_products, 1e-12)).astype(float)

In [None]:
# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="whitegrid")

In [None]:
# Distribution of similarity scores over all rows: 20-bin histogram with a KDE overlay
fig, ax = plt.subplots()
sns.histplot(data=data, x="similarity_score", bins=20, kde=True, ax=ax)

# Dashed red vertical line at the mean score
mean_score = data['similarity_score'].mean()
ax.axvline(mean_score, color='r', linestyle='--')

ax.set_title('Similarity score distribution')
plt.show()

In [None]:
# Plot only the rows whose LABEL is not NaN
labelled_rows = data[data['LABEL'].notna()]
ax = sns.histplot(data=labelled_rows, x="similarity_score", bins=20, kde=True)

# Mark the mean of the plotted subset (not of the whole frame), so the dashed line
# matches the histogram it is drawn on.
ax.axvline(labelled_rows['similarity_score'].mean(), color='r', linestyle='--')

# NOTE(review): the filter keeps every non-missing LABEL, while the title says
# "LABEL = OK" - confirm that "OK" is the only value LABEL can take.
plt.title('Similarity score distribution - LABEL = OK')
plt.show()