In [1]:
import datetime
import json
import sys
sys.path.insert(0, '../src')

import warnings
from multiprocessing import Pool
from threading import Thread

import jsonlines
import pandas as pd
import seaborn as sns
from fuzzywuzzy import process
from intents import *
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from p_tqdm import p_umap
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             precision_recall_fscore_support)
from tqdm import tqdm
from utils import print_confusion_matrix
from IPython.display import clear_output
import random
from sklearn import preprocessing

# Module-level label encoder shared by the notebook; measure_pref() refits it
# on every call, so it carries no state worth relying on between cells.
le = preprocessing.LabelEncoder()
# Register tqdm's pandas integration (the progress_* variants of apply/map).
# NOTE(review): no progress_* call is visible in this part of the notebook.
tqdm.pandas()

  from pandas import Panel


In [2]:
# Parse ../data/verloop_intents.json. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open until the
# garbage collector gets to it.
with open("../data/verloop_intents.json") as intents_file:
    verloop_intents = json.load(intents_file)

In [3]:
# Read the predictions-with-ground-truth table and drop the single message
# that the original note flags as having FALSE as its intent.
df_preds = pd.read_csv("../data/predictions_with_groundtruth.csv").loc[
    lambda preds: preds["MessageId"].ne("zoJ8RNwzaLxPTzW9x")
]

In [4]:
def load_and_merge(field: str) -> pd.DataFrame:
    """Load the davinci output for ``field`` and attach the reference labels.

    Reads every record from ``../data/davinci_{field}.jsonl``, keeps the
    ``MessageId``, ``Message``, ``predicted`` and ``logprobs`` columns, and
    left-joins ``VerdanPrediction`` and ``Ground Truth`` from the module-level
    ``df_preds`` frame on ``MessageId``.

    Args:
        field: Name interpolated into the JSONL file name.

    Returns:
        The merged frame. Records without a match in ``df_preds`` are kept
        (left join) with NaN in the two joined columns.

    Raises:
        KeyError: If the JSONL records lack one of the four selected columns
            (this includes the case of an empty file).
    """
    # The context manager closes the reader and its underlying file, which a
    # bare `for obj in jsonlines.open(...)` never did.
    with jsonlines.open(f"../data/davinci_{field}.jsonl") as reader:
        records = list(reader)

    df_out = pd.DataFrame(records)[
        ["MessageId", "Message", "predicted", "logprobs"]
    ].merge(
        df_preds[["MessageId", "VerdanPrediction", "Ground Truth"]],
        how="left",
        # MessageId is the only column the two sides share, so this is the
        # same key pandas would infer; spelling it out guards against a
        # silent key change if the selected columns are ever edited.
        on="MessageId",
    )
    # Wipe whatever the cell has printed so far.
    # NOTE(review): presumably to hide noise emitted while loading - confirm.
    clear_output()
    return df_out

In [5]:
def measure_pref(df_out):
    """Print accuracy and micro-averaged P/R/F for Verdan and GPT3.

    Encodes the ``Ground Truth``, ``predicted`` and ``VerdanPrediction``
    columns of ``df_out`` with the module-level ``le`` encoder and prints the
    scores of each classifier against the ground truth via ``print_scores``.

    Args:
        df_out: Frame holding the three label columns named above.

    Returns:
        None. The results are printed.
    """
    # Fit on the union of all three columns. Fitting on "Ground Truth" alone
    # makes le.transform raise ValueError as soon as a classifier emits a
    # label that never occurs in the ground truth; with the union such a
    # prediction is simply scored as wrong.
    label_columns = ["Ground Truth", "predicted", "VerdanPrediction"]
    le.fit(pd.concat([df_out[column] for column in label_columns], ignore_index=True))

    y = le.transform(df_out["Ground Truth"])
    y_pred_gpt = le.transform(df_out["predicted"])
    y_pred_verdan = le.transform(df_out["VerdanPrediction"])

    print(f"Number of samples : {len(df_out)}")
    print("Verdan Scores")
    print_scores(y_pred_verdan, y)
    print("GPT3 Scores")
    print_scores(y_pred_gpt, y)
    
def print_scores(y_pred, y):
    """Print accuracy and micro-averaged precision/recall/F-score.

    Args:
        y_pred: Predicted labels.
        y: Ground-truth labels, aligned element-wise with ``y_pred``.

    Returns:
        None. Two lines are printed: the accuracy, and the tuple returned by
        ``precision_recall_fscore_support`` (its last element, the support,
        is ``None`` because an averaging mode is set).
    """
    # scikit-learn's metric signature is (y_true, y_pred). The arguments used
    # to be passed the other way round, which swaps precision and recall.
    # Under micro averaging both equal the accuracy, so the printed numbers
    # are unchanged, but the call is now correct for any other averaging mode.
    print(f'Accuracy : {accuracy_score(y, y_pred)}')
    print(f'PRF : {precision_recall_fscore_support(y, y_pred, average="micro")}')

In [7]:
# One-vs-rest evaluation for the "Refunds" intent: in all three label columns
# every value other than the target is collapsed into "Not Refunds" before
# the two classifiers are scored.
field = "Refunds"
negative_label = f"Not {field}"
df_out = load_and_merge(field)
for column in ("predicted", "VerdanPrediction", "Ground Truth"):
    # Keep the value where it equals the target, otherwise use the negative
    # label (missing values compare unequal and therefore become negative).
    df_out[column] = df_out[column].where(df_out[column].eq(field), negative_label)
measure_pref(df_out)

Number of samples : 3728
Verdan Scores
Accuracy : 0.9670064377682404
PRF : (0.9670064377682404, 0.9670064377682404, 0.9670064377682404, None)
GPT3 Scores
Accuracy : 0.7288090128755365
PRF : (0.7288090128755365, 0.7288090128755365, 0.7288090128755363, None)


In [9]:
# One-vs-rest evaluation for the "OrderDelivery" intent: in all three label
# columns every value other than the target is collapsed into
# "Not OrderDelivery" before the two classifiers are scored.
field = "OrderDelivery"
negative_label = f"Not {field}"
df_out = load_and_merge(field)
for column in ("predicted", "VerdanPrediction", "Ground Truth"):
    # Keep the value where it equals the target, otherwise use the negative
    # label (missing values compare unequal and therefore become negative).
    df_out[column] = df_out[column].where(df_out[column].eq(field), negative_label)
measure_pref(df_out)

Number of samples : 3732
Verdan Scores
Accuracy : 0.979903536977492
PRF : (0.979903536977492, 0.979903536977492, 0.979903536977492, None)
GPT3 Scores
Accuracy : 0.7130225080385852
PRF : (0.7130225080385852, 0.7130225080385852, 0.7130225080385851, None)


In [10]:
# One-vs-rest evaluation for the "Cancellations" intent: in all three label
# columns every value other than the target is collapsed into
# "Not Cancellations" before the two classifiers are scored.
field = "Cancellations"
negative_label = f"Not {field}"
df_out = load_and_merge(field)
for column in ("predicted", "VerdanPrediction", "Ground Truth"):
    # Keep the value where it equals the target, otherwise use the negative
    # label (missing values compare unequal and therefore become negative).
    df_out[column] = df_out[column].where(df_out[column].eq(field), negative_label)
measure_pref(df_out)

Number of samples : 3730
Verdan Scores
Accuracy : 0.9828418230563003
PRF : (0.9828418230563003, 0.9828418230563003, 0.9828418230563003, None)
GPT3 Scores
Accuracy : 0.9597855227882037
PRF : (0.9597855227882037, 0.9597855227882037, 0.9597855227882037, None)


In [11]:
# One-vs-rest evaluation for the "OrderReceiving" intent: in all three label
# columns every value other than the target is collapsed into
# "Not OrderReceiving" before the two classifiers are scored.
field = "OrderReceiving"
negative_label = f"Not {field}"
df_out = load_and_merge(field)
for column in ("predicted", "VerdanPrediction", "Ground Truth"):
    # Keep the value where it equals the target, otherwise use the negative
    # label (missing values compare unequal and therefore become negative).
    df_out[column] = df_out[column].where(df_out[column].eq(field), negative_label)
measure_pref(df_out)

Number of samples : 3730
Verdan Scores
Accuracy : 0.9780160857908847
PRF : (0.9780160857908847, 0.9780160857908847, 0.9780160857908847, None)
GPT3 Scores
Accuracy : 0.9581769436997319
PRF : (0.9581769436997319, 0.9581769436997319, 0.9581769436997319, None)


In [12]:
# One-vs-rest evaluation for the "OrderPlacement" intent: in all three label
# columns every value other than the target is collapsed into
# "Not OrderPlacement" before the two classifiers are scored.
field = "OrderPlacement"
negative_label = f"Not {field}"
df_out = load_and_merge(field)
for column in ("predicted", "VerdanPrediction", "Ground Truth"):
    # Keep the value where it equals the target, otherwise use the negative
    # label (missing values compare unequal and therefore become negative).
    df_out[column] = df_out[column].where(df_out[column].eq(field), negative_label)
measure_pref(df_out)

Number of samples : 3270
Verdan Scores
Accuracy : 0.9721712538226299
PRF : (0.9721712538226299, 0.9721712538226299, 0.9721712538226299, None)
GPT3 Scores
Accuracy : 0.972782874617737
PRF : (0.972782874617737, 0.972782874617737, 0.972782874617737, None)
