In [1]:
from pathlib import Path
import pandas as pd
import json
from scipy.spatial.distance import cosine, euclidean, cityblock, chebyshev

In [2]:
def string_to_vector(s):
    """Parse a bracketed, comma-separated string into a list of floats.

    Every '[' and ']' character is removed before splitting on commas, so
    "[0.1, 0.2]" becomes [0.1, 0.2]. Whitespace around each number is
    tolerated because float() ignores it.

    Args:
        s: Text such as "[0.1, 0.2, 0.3]".

    Returns:
        A list of floats; an empty list when no numbers are present
        (e.g. "[]" or "").

    Raises:
        ValueError: If any comma-separated piece is not a valid float.
    """
    body = s.replace('[', '').replace(']', '').strip()
    # An empty body would otherwise become [''] and crash on float('').
    if not body:
        return []
    return [float(x) for x in body.split(",")]

# Find inference time of ChatGPT

In [3]:
# Directory searched for ChatGPT '*.log' files (relative to the notebook's working directory).
chatgpt_log_path = Path('../data/chatgpt/logs')

In [4]:
# Sort the matches so the logs are always processed in the same order;
# glob yields paths in whatever order the filesystem reports them.
chatgpt_log_files = sorted(chatgpt_log_path.glob('*.log'))

In [5]:
def extract_time_from_line(line):
    """Return the number that follows 'Time taken:' in a log line.

    The value is the text between the marker and the next 's' character,
    converted with float(). Lines without the marker yield None.
    """
    marker = 'Time taken:'
    if marker not in line:
        return None
    after_marker = line.split(marker)[1]
    # Keep only what precedes the first 's' (the unit suffix in the log line).
    seconds_text = after_marker.split('s', 1)[0]
    return float(seconds_text)

In [6]:
def extract_time_from_file(file):
    """Collect every 'Time taken:' value found in a log file.

    Each line is passed to extract_time_from_line; lines that produce None
    are dropped. Returns the remaining floats in file order.
    """
    with open(file, 'r') as handle:
        parsed = [extract_time_from_line(entry) for entry in handle]
    return [value for value in parsed if value is not None]

In [7]:
# Parse every log file: at this point chatgpt_times is a list holding one list of times per file.
chatgpt_times = [extract_time_from_file(file) for file in chatgpt_log_files]

In [8]:
# Flatten the per-file lists into one flat list of times.
# NOTE: this rebinds chatgpt_times in place, so the cell is not idempotent —
# running it a second time iterates over floats and raises TypeError.
chatgpt_times = [time for times in chatgpt_times for time in times]

In [9]:
# Wrap the flat list in a Series so summary statistics (.describe()) are available.
chatgpt_times = pd.Series(chatgpt_times)

In [10]:
chatgpt_times.describe()

count    124253.000000
mean          1.590521
std           1.465772
min           0.566465
25%           1.001180
50%           1.253083
75%           1.779438
max          65.057091
dtype: float64

In [11]:
# Test set used by analyse(): each row's index selects the '<index>.json' output file,
# and its 'message_embedding' column is parsed as the ground-truth vector.
test_set_path = Path('../data/test_small.csv')
test_df = pd.read_csv(test_set_path)

In [12]:
def analyse(output_path, test_data=None):
    """Load per-row model outputs and gather timing and distance metrics.

    For every row of the test set, the file '<row index>.json' inside
    ``output_path`` is read. Its 'time' value is always recorded. If the
    file carries a non-null 'embedding', the four distances between that
    embedding and the row's parsed 'message_embedding' are computed;
    otherwise the values stored under 'cosine', 'euclidean', 'cityblock'
    and 'chebyshev' in the file are used as-is.

    Args:
        output_path: Directory containing the '<index>.json' files
            (a ``Path`` or a string).
        test_data: DataFrame to iterate over. Defaults to the notebook-level
            ``test_df`` so existing ``analyse(path)`` calls keep working.

    Returns:
        A tuple ``(times, performance)`` where ``times`` is a ``pd.Series``
        of the recorded times and ``performance`` maps each metric name to
        a ``pd.Series`` of distances, one entry per row.

    Raises:
        FileNotFoundError: If a row has no matching JSON file.
        KeyError: If a file lacks 'time' or, without an embedding, one of
            the precomputed metric keys.
    """
    if test_data is None:
        # Fall back to the notebook global, as the original implementation did.
        test_data = test_df
    output_path = Path(output_path)

    # Single table of metrics; insertion order fixes the key order of the result.
    metric_functions = {
        'cosine': cosine,
        'euclidean': euclidean,
        'cityblock': cityblock,
        'chebyshev': chebyshev,
    }

    times = []
    performance = {name: [] for name in metric_functions}
    for index, row in test_data.iterrows():
        with open(output_path / f'{index}.json', 'r') as f:
            generated_message = json.load(f)
        # The file is no longer needed once parsed, so it is closed before computing.
        times.append(generated_message['time'])
        embedding = generated_message.get('embedding')
        if embedding is not None:
            gt_embedding = string_to_vector(row['message_embedding'])
            for name, metric in metric_functions.items():
                performance[name].append(metric(embedding, gt_embedding))
        else:
            # No embedding stored: rely on the distances already saved in the file.
            for name in metric_functions:
                performance[name].append(generated_message[name])

    times = pd.Series(times)
    performance = {name: pd.Series(values) for name, values in performance.items()}
    return times, performance

# Accuracy and Inference time for Vector Database

In [13]:
# Directory of per-row '<index>.json' outputs for the vector database run, read by analyse().
vector_output_path = Path('./outputs/vectordb')

In [14]:
# vector_times: Series of recorded times; vector_performance: dict of metric name -> Series.
vector_times, vector_performance = analyse(vector_output_path)

In [15]:
vector_times.describe()

count    2425.000000
mean        0.387326
std         0.006909
min         0.184687
25%         0.384767
50%         0.386977
75%         0.389801
max         0.412625
dtype: float64

In [16]:
vector_performance['cosine'].describe()

count    2425.000000
mean        0.062036
std         0.035696
min         0.000000
25%         0.037643
50%         0.067893
75%         0.088319
max         0.193673
dtype: float64

In [17]:
vector_performance['euclidean'].describe()

count    2425.000000
mean        0.321653
std         0.143598
min         0.000000
25%         0.274383
50%         0.368491
75%         0.420284
max         0.622372
dtype: float64

In [18]:
vector_performance['cityblock'].describe()

count    2425.000000
mean       10.049913
std         4.489248
min         0.000000
25%         8.524065
50%        11.549316
75%        13.131390
max        19.358547
dtype: float64

In [19]:
vector_performance['chebyshev'].describe()

count    2425.000000
mean        0.029302
std         0.013369
min         0.000000
25%         0.024413
50%         0.033378
75%         0.037812
max         0.072392
dtype: float64

# Accuracy and Inference time for Tiny Vector Database

In [20]:
# Directory of per-row '<index>.json' outputs for the tiny vector database run, read by analyse().
tiny_vector_output_path = Path('./outputs/vectordb_tiny')

In [21]:
# tiny_vector_times: Series of recorded times; tiny_vector_performance: dict of metric name -> Series.
tiny_vector_times, tiny_vector_performance = analyse(tiny_vector_output_path)

In [22]:
tiny_vector_times.describe()

count    2425.000000
mean        0.385622
std         0.004909
min         0.342071
25%         0.382580
50%         0.384990
75%         0.388215
max         0.410138
dtype: float64

In [23]:
tiny_vector_performance['cosine'].describe()

count    2425.000000
mean        0.064629
std         0.034771
min         0.000000
25%         0.041816
50%         0.070616
75%         0.090027
max         0.193673
dtype: float64

In [24]:
tiny_vector_performance['euclidean'].describe()

count    2425.000000
mean        0.332816
std         0.136014
min         0.000000
25%         0.289192
50%         0.375809
75%         0.424328
max         0.622372
dtype: float64

In [25]:
tiny_vector_performance['cityblock'].describe()

count    2425.000000
mean       10.401247
std         4.253315
min         0.000000
25%         9.050577
50%        11.695378
75%        13.297423
max        19.358547
dtype: float64

In [26]:
tiny_vector_performance['chebyshev'].describe()

count    2425.000000
mean        0.030271
std         0.012690
min         0.000000
25%         0.025429
50%         0.033900
75%         0.038155
max         0.072392
dtype: float64

# Accuracy and Inference time for Base Model (without Distillation)

In [27]:
# Directory of per-row '<index>.json' outputs for the flan-t5-base run, read by analyse().
base_output_path = Path('./outputs/embeddings/flan-t5-base')

In [28]:
# base_times: Series of recorded times; base_performance: dict of metric name -> Series.
base_times, base_performance = analyse(base_output_path)

In [29]:
base_times.describe()

count    2425.000000
mean        0.283156
std         0.268420
min         0.225191
25%         0.239736
50%         0.248010
75%         0.267751
max         6.839332
dtype: float64

In [30]:
base_performance['cosine'].describe()

count    2425.000000
mean        0.277940
std         0.027998
min         0.234319
25%         0.255328
50%         0.269972
75%         0.289578
max         0.361683
dtype: float64

In [31]:
base_performance['euclidean'].describe()

count    2425.000000
mean        0.744670
std         0.036702
min         0.684571
25%         0.714602
50%         0.734809
75%         0.761023
max         0.850509
dtype: float64

In [32]:
base_performance['cityblock'].describe()

count    2425.000000
mean       23.101499
std         1.159753
min        21.164160
25%        22.225591
50%        22.803098
75%        23.565779
max        26.280348
dtype: float64

In [33]:
base_performance['chebyshev'].describe()    

count    2425.000000
mean        0.074263
std         0.011513
min         0.056066
25%         0.066694
50%         0.070592
75%         0.075539
max         0.149704
dtype: float64

# Accuracy and Inference time for Distilled Model

In [34]:
# Directory of per-row '<index>.json' outputs for the 'checkpoint-10000' run, read by analyse().
distilled_output_path = Path('./outputs/embeddings/checkpoint-10000')

In [35]:
# distilled_times: Series of recorded times; distilled_performance: dict of metric name -> Series.
distilled_times, distilled_performance = analyse(distilled_output_path)

In [36]:
distilled_times.describe()

count    2425.000000
mean        0.303202
std         0.553354
min         0.223693
25%         0.238294
50%         0.246649
75%         0.262651
max        15.278507
dtype: float64

In [37]:
distilled_performance['cosine'].describe()

count    2425.000000
mean        0.058620
std         0.034661
min         0.000000
25%         0.032801
50%         0.066776
75%         0.082258
max         0.164653
dtype: float64

In [38]:
distilled_performance['euclidean'].describe()

count    2425.000000
mean        0.310189
std         0.145026
min         0.000000
25%         0.256127
50%         0.365448
75%         0.405607
max         0.573852
dtype: float64

In [39]:
distilled_performance['cityblock'].describe()

count    2425.000000
mean        9.699162
std         4.539895
min         0.000000
25%         8.013645
50%        11.500945
75%        12.684517
max        17.832085
dtype: float64

In [40]:
distilled_performance['chebyshev'].describe()

count    2425.000000
mean        0.027921
std         0.013208
min         0.000000
25%         0.021997
50%         0.032793
75%         0.035799
max         0.058922
dtype: float64

# Accuracy and Inference time for Hybrid Approach

In [41]:
# Directory of per-row '<index>.json' outputs for the hybrid run, read by analyse().
hybrid_output_path = Path('./outputs/hybrid')

In [42]:
# hybrid_times: Series of recorded times; hybrid_performance: dict of metric name -> Series.
hybrid_times, hybrid_performance = analyse(hybrid_output_path)

In [43]:
hybrid_times.describe()

count    2425.000000
mean        0.463327
std         0.093013
min         0.257936
25%         0.381647
50%         0.423636
75%         0.549353
max         1.878136
dtype: float64

In [44]:
hybrid_performance['cosine'].describe()

count    2425.000000
mean        0.058262
std         0.035994
min         0.000000
25%         0.030209
50%         0.066739
75%         0.083725
max         0.193673
dtype: float64

In [45]:
hybrid_performance['euclidean'].describe()

count    2425.000000
mean        0.307085
std         0.149107
min         0.000000
25%         0.245800
50%         0.365346
75%         0.409206
max         0.622372
dtype: float64

In [46]:
hybrid_performance['cityblock'].describe()

count    2425.000000
mean        9.598026
std         4.663471
min         0.000000
25%         7.681106
50%        11.439487
75%        12.795822
max        19.358547
dtype: float64

In [47]:
hybrid_performance['chebyshev'].describe()

count    2425.000000
mean        0.027788
std         0.013732
min         0.000000
25%         0.021373
50%         0.032507
75%         0.036307
max         0.072392
dtype: float64

In [48]:
# Combine results into a summarized dataframe.
# Each entry pairs a display label with the (times, performance) pair returned
# by analyse() for that run; list order defines the row order of the summary.
evaluated_runs = [
    ('VectorDB', vector_times, vector_performance),
    ('VectorDB Tiny', tiny_vector_times, tiny_vector_performance),
    ('Base Model', base_times, base_performance),
    ('Distilled Model', distilled_times, distilled_performance),
    ('Hybrid Approach', hybrid_times, hybrid_performance),
]

# ChatGPT is the first row. Only its timings were collected (from the logs),
# so every distance column starts with None for it.
result_obj = {
    'Model': ['ChatGPT'] + [label for label, _, _ in evaluated_runs],
    'Inference Time (Mean)': [chatgpt_times.mean()] + [times.mean() for _, times, _ in evaluated_runs],
    'Inference Time (Std)': [chatgpt_times.std()] + [times.std() for _, times, _ in evaluated_runs],
}

# One "(Mean)" and one "(Std)" column per distance metric, in a fixed order.
# This also corrects the former 'Consine (Std)' header to 'Cosine (Std)'.
for metric_name in ('cosine', 'euclidean', 'cityblock', 'chebyshev'):
    column_title = metric_name.capitalize()
    result_obj[f'{column_title} (Mean)'] = [None] + [
        performance[metric_name].mean() for _, _, performance in evaluated_runs
    ]
    result_obj[f'{column_title} (Std)'] = [None] + [
        performance[metric_name].std() for _, _, performance in evaluated_runs
    ]

In [49]:
# One row per entry in result_obj['Model'], one column per key of result_obj.
result_df = pd.DataFrame(result_obj)

In [50]:
# Persist the summary without the row index column.
# NOTE(review): presumably './outputs' must already exist — confirm before a fresh run.
result_df.to_csv('./outputs/results.csv', index=False)