In [1]:
import json
import pandas as pd



In [2]:
# Load the inference-time test records from disk. JSON text is UTF-8 by
# specification, so the encoding is pinned rather than left to the platform's
# locale-dependent default.
with open('inference_time_tests.json', 'r', encoding='utf-8') as file:
    inf_time_tests = json.load(file)

def dict_to_interpretable_string(d):
    """Render a mapping as one comma-separated line of ``key: value`` pairs.

    Every value is converted with ``str``; values that are ``str`` instances
    are additionally wrapped in single quotes so they stand out from numbers,
    booleans and other types.

    Args:
        d: Mapping whose items are rendered in iteration order.

    Returns:
        The rendered pairs joined by ``', '`` (an empty string for an empty
        mapping).
    """
    def render(value):
        # Only genuine strings get quoted; everything else keeps its str() form.
        text = str(value)
        return f"'{text}'" if isinstance(value, str) else text

    return ', '.join(f"{key}: {render(value)}" for key, value in d.items())

# GPU label assumed for runs whose loading params carry no 'gpu' entry.
DEFAULT_GPU = 't4_2x'

# Column order of the summary frame built below.
SUMMARY_COLUMNS = [
    'model_loading_params',
    'model_checkpoint',
    'gpu',
    'model_dtype',
    'model_device',
    'avg_token_per_sec',
    'model_loading_time',
    'total_gpu_mem_gb',
    'occupied_gpu_mem_gb',
    'total_mem_gb',
    'occupied_mem_gb',
]


def _mean_tokens_per_sec(sample_metrics):
    """Average the per-sample generation rate (output tokens per second).

    Each sample contributes ``output_tokens / (gen_latency / 1000)``; the
    division by 1000 treats ``gen_latency`` as milliseconds.

    Args:
        sample_metrics: Iterable of dicts with 'output_tokens' and
            'gen_latency' entries.

    Returns:
        The arithmetic mean of the per-sample rates, or NaN when there is no
        sample with a non-zero latency (instead of raising ZeroDivisionError).
    """
    total = 0
    counted = 0
    for sample in sample_metrics:
        latency = sample['gen_latency']
        if latency == 0:
            # A zero latency has no defined rate; leave it out of the mean.
            continue
        total += sample['output_tokens'] / (latency / 1000)
        counted += 1
    return total / counted if counted else float('nan')


records = []
for item in inf_time_tests:
    raw_params = item['model_loading_params']
    gpu = raw_params.get('gpu', DEFAULT_GPU)
    # Separate the 'gpu' label from the remaining loading params by building a
    # copy, so the loaded JSON stays untouched.
    loading_params = {key: value for key, value in raw_params.items() if key != 'gpu'}
    model_loading_params_str = dict_to_interpretable_string(loading_params)

    avg_tokens_per_sec = _mean_tokens_per_sec(item['sample_metrics'])

    print(model_loading_params_str)
    print(avg_tokens_per_sec)

    print('\n')

    # Memory figures are read from the snapshot stored under 'post_loading'.
    post_loading = item['hardware_info']['post_loading']
    gpu_info = post_loading['gpu_info']
    ram_info = post_loading['ram_info']
    avl_gpu_mem = gpu_info['total_available_memory_gb']
    occupied_gpu_mem = gpu_info['total_occupied_memory_gb']
    avl_mem = ram_info['total_available_ram_gb']
    total_mem = ram_info['total_ram_gb']

    records.append({
        'model_loading_params': model_loading_params_str,
        # The checkpoint is stored under one of two keys depending on the entry.
        'model_checkpoint': loading_params.get(
            'pretrained_model_name_or_path',
            loading_params.get('model_name_or_path'),
        ),
        'gpu': gpu,
        'model_dtype': item.get('model_dtype'),
        'model_device': item.get('model_device'),
        'avg_token_per_sec': avg_tokens_per_sec,
        'model_loading_time': item.get("model_loading_time"),
        # GPU total is reconstructed as available + occupied.
        'total_gpu_mem_gb': avl_gpu_mem + occupied_gpu_mem,
        'occupied_gpu_mem_gb': occupied_gpu_mem,
        'total_mem_gb': total_mem,
        # Occupied RAM is derived as total - available.
        'occupied_mem_gb': total_mem - avl_mem,
    })

# One row per test entry; `columns` fixes the order and keeps the headers even
# when there are no records.
df = pd.DataFrame(records, columns=SUMMARY_COLUMNS)


pretrained_model_name_or_path: '/kaggle/input/ehartford-dolphin-2-1-mistral-7b', low_cpu_mem_usage: True, torch_dtype: 'torch.bfloat16'
1.1706524221265413


pretrained_model_name_or_path: '/kaggle/input/ehartford-dolphin-2-1-mistral-7b', torch_dtype: 'torch.bfloat16', device_map: 'auto'
13.495654390917164


pretrained_model_name_or_path: 'TheBloke/dolphin-2.1-mistral-7B-AWQ', device_map: 'cuda:0', trust_remote_code: False, torch_dtype: 'torch.bfloat16'
17.252136787429297


pretrained_model_name_or_path: '/kaggle/input/ehartford-dolphin-2-1-mistral-7b', torch_dtype: 'torch.bfloat16', device_map: 'cuda:0'
18.116759082387627


pretrained_model_name_or_path: 'TheBloke/dolphin-2.1-mistral-7B-AWQ', device_map: 'cuda:0', trust_remote_code: False, torch_dtype: 'torch.float16'
20.071419678895033


pretrained_model_name_or_path: 'TheBloke/dolphin-2.1-mistral-7B-AWQ', device_map: 'cuda:0', revision: 'refs/pr/1'
18.905335754174427


pretrained_model_name_or_path: 'TheBloke/dolphin-2.2.1-mistral-7B

In [3]:
# Display the summary rows ordered by avg_token_per_sec, highest first.
df.sort_values('avg_token_per_sec', ascending=False)

Unnamed: 0,model_loading_params,model_checkpoint,gpu,model_dtype,model_device,avg_token_per_sec,model_loading_time,total_gpu_mem_gb,occupied_gpu_mem_gb,total_mem_gb,occupied_mem_gb
11,pretrained_model_name_or_path: '/kaggle/input/...,/kaggle/input/thebloke-dolphin-2-2-1-mistral-7...,t4_2x,,cuda:0,44.280489,3675.498247,30,10,32,4
13,pretrained_model_name_or_path: '/kaggle/input/...,/kaggle/input/ls-dolphin-2-2-1-mistral-7b-4-0b...,t4_2x,,cuda:0,37.620098,56271.10672,30,5,32,3
4,pretrained_model_name_or_path: 'TheBloke/dolph...,TheBloke/dolphin-2.1-mistral-7B-AWQ,t4_2x,torch.float16,cuda:0,20.07142,38718.880415,30,6,32,5
12,model_name_or_path: 'TheBloke/dolphin-2.2.1-mi...,TheBloke/dolphin-2.2.1-mistral-7B-GPTQ,t4_2x,torch.float16,cuda:0,19.144249,8424.47114,30,19,32,14
5,pretrained_model_name_or_path: 'TheBloke/dolph...,TheBloke/dolphin-2.1-mistral-7B-AWQ,t4_2x,torch.float16,cuda:0,18.905336,38744.750738,30,6,32,4
3,pretrained_model_name_or_path: '/kaggle/input/...,/kaggle/input/ehartford-dolphin-2-1-mistral-7b,p100,torch.bfloat16,cuda:0,18.116759,113963.797092,16,16,32,5
9,model_name_or_path: 'TheBloke/dolphin-2.2.1-mi...,TheBloke/dolphin-2.2.1-mistral-7B-GPTQ,t4_2x,torch.float16,cuda:0,17.524174,7646.041632,30,16,32,10
10,model_name_or_path: 'TheBloke/dolphin-2.2.1-mi...,TheBloke/dolphin-2.2.1-mistral-7B-GPTQ,t4_2x,torch.float16,cuda:0,17.322899,8991.247892,30,16,32,12
2,pretrained_model_name_or_path: 'TheBloke/dolph...,TheBloke/dolphin-2.1-mistral-7B-AWQ,t4_2x,torch.bfloat16,cuda:0,17.252137,56350.338459,30,6,32,5
7,pretrained_model_name_or_path: 'TheBloke/dolph...,TheBloke/dolphin-2.2.1-mistral-7B-GPTQ,t4_2x,torch.float16,cuda:0,16.090855,17687.150002,30,9,32,5


In [4]:
# Checkpoint of the row with the highest avg_token_per_sec.
df.sort_values('avg_token_per_sec', ascending=False)['model_checkpoint'].iloc[0]

'/kaggle/input/thebloke-dolphin-2-2-1-mistral-7b-gptq'