In [1]:
import json
import pandas as pd

In [2]:
# Load the recorded benchmark runs. The encoding is pinned to UTF-8 so the
# file decodes the same way regardless of the platform's default locale.
with open('inference_time_tests.json', 'r', encoding='utf-8') as file:
    inf_time_tests = json.load(file)

def dict_to_interpretable_string(d):
    """Render a dict as a single ``key: value, key: value`` line.

    Values are converted with ``str()``; values that are ``str`` instances are
    additionally wrapped in single quotes so they stand out from numbers,
    booleans and other objects. Pairs appear in the dict's iteration order
    and are separated by ``', '``. An empty dict yields an empty string.
    """
    def _format_value(value):
        text = str(value)
        return f"'{text}'" if isinstance(value, str) else text

    return ', '.join(
        f"{name}: {_format_value(value)}" for name, value in d.items()
    )

# Column order of the summary table; one row is added per benchmark run.
SUMMARY_COLUMNS = [
    'model_loading_params',
    'model_checkpoint',
    'gpu',
    'model_dtype',
    'model_device',
    'avg_token_per_sec',
    'model_loading_time',
    'total_gpu_mem_gb',
    'occupied_gpu_mem_gb',
    'total_mem_gb',
    'occupied_mem_gb',
]

# GPU label used for runs whose loading params carry no 'gpu' entry.
DEFAULT_GPU = 't4_2x'


def _mean_tokens_per_sec(sample_metrics):
    """Average the per-sample generation throughput of one benchmark run.

    Each sample contributes ``output_tokens / (gen_latency / 1000)``
    (``gen_latency`` is presumably in milliseconds -- confirm against the
    code that wrote the JSON). Returns NaN when ``sample_metrics`` is empty
    instead of raising ``ZeroDivisionError``.
    """
    rates = [
        sample['output_tokens'] / (sample['gen_latency'] / 1000)
        for sample in sample_metrics
    ]
    if not rates:
        return float('nan')
    return sum(rates) / len(rates)


# Accumulate column-wise lists under a separate name so that `df` only ever
# refers to the finished DataFrame.
summary_columns = {name: [] for name in SUMMARY_COLUMNS}

for item in inf_time_tests:
    # Work on a copy: the 'gpu' entry is reported in its own column and left
    # out of the parameter string, but the loaded JSON itself stays untouched.
    loading_params = dict(item['model_loading_params'])
    gpu = loading_params.pop('gpu', DEFAULT_GPU)
    model_loading_params_str = dict_to_interpretable_string(loading_params)

    avg_tokens_per_sec = _mean_tokens_per_sec(item['sample_metrics'])

    print(model_loading_params_str)
    print(avg_tokens_per_sec)

    print('\n')

    post_loading = item['hardware_info']['post_loading']
    gpu_info = post_loading['gpu_info']
    ram_info = post_loading['ram_info']

    avl_gpu_mem = gpu_info['total_available_memory_gb']
    occupied_gpu_mem = gpu_info['total_occupied_memory_gb']
    avl_mem = ram_info['total_available_ram_gb']
    total_mem = ram_info['total_ram_gb']

    row = {
        'model_loading_params': model_loading_params_str,
        # The checkpoint is stored under one of two keys depending on the run.
        'model_checkpoint': loading_params.get(
            'pretrained_model_name_or_path',
            loading_params.get('model_name_or_path'),
        ),
        'gpu': gpu,
        'model_dtype': item.get('model_dtype'),
        'model_device': item.get('model_device'),
        'avg_token_per_sec': avg_tokens_per_sec,
        'model_loading_time': item.get("model_loading_time"),
        # GPU info reports available and occupied memory; their sum is the total.
        'total_gpu_mem_gb': avl_gpu_mem + occupied_gpu_mem,
        'occupied_gpu_mem_gb': occupied_gpu_mem,
        'total_mem_gb': total_mem,
        # RAM info reports total and available memory; the difference is in use.
        'occupied_mem_gb': total_mem - avl_mem,
    }
    for name, value in row.items():
        summary_columns[name].append(value)


df = pd.DataFrame(summary_columns)


pretrained_model_name_or_path: '/kaggle/input/ehartford-dolphin-2-1-mistral-7b', low_cpu_mem_usage: True, torch_dtype: 'torch.bfloat16'
1.1706524221265413


pretrained_model_name_or_path: '/kaggle/input/ehartford-dolphin-2-1-mistral-7b', torch_dtype: 'torch.bfloat16', device_map: 'auto'
13.495654390917164


pretrained_model_name_or_path: 'TheBloke/dolphin-2.1-mistral-7B-AWQ', device_map: 'cuda:0', trust_remote_code: False, torch_dtype: 'torch.bfloat16'
17.252136787429297


pretrained_model_name_or_path: '/kaggle/input/ehartford-dolphin-2-1-mistral-7b', torch_dtype: 'torch.bfloat16', device_map: 'cuda:0'
18.116759082387627


pretrained_model_name_or_path: 'TheBloke/dolphin-2.1-mistral-7B-AWQ', device_map: 'cuda:0', trust_remote_code: False, torch_dtype: 'torch.float16'
20.071419678895033


pretrained_model_name_or_path: 'TheBloke/dolphin-2.1-mistral-7B-AWQ', device_map: 'cuda:0', revision: 'refs/pr/1'
18.905335754174427


pretrained_model_name_or_path: 'TheBloke/dolphin-2.2.1-mistral-7B

In [5]:
# Rank the benchmark runs from fastest to slowest and print a short report
# for each one, followed by a few blank lines as a separator.
sorted_df = df.sort_values(by='avg_token_per_sec', ascending=False)
for _, run in sorted_df.iterrows():
    report_lines = (
        f"tps - {run.avg_token_per_sec}",
        f"model checkpoint - {run.model_checkpoint}",
        f"model_loading_params - {run.model_loading_params}",
        f"model dtype - {run.model_dtype}",
    )
    print(*report_lines, sep='\n')

    print('\n\n')



tps - 44.28048877512073
model checkpoint - /kaggle/input/thebloke-dolphin-2-2-1-mistral-7b-gptq
model_loading_params - pretrained_model_name_or_path: '/kaggle/input/thebloke-dolphin-2-2-1-mistral-7b-gptq'
model dtype - None



tps - 37.620098225627714
model checkpoint - /kaggle/input/ls-dolphin-2-2-1-mistral-7b-4-0bpw-h6-exl2
model_loading_params - pretrained_model_name_or_path: '/kaggle/input/ls-dolphin-2-2-1-mistral-7b-4-0bpw-h6-exl2'
model dtype - None



tps - 20.071419678895033
model checkpoint - TheBloke/dolphin-2.1-mistral-7B-AWQ
model_loading_params - pretrained_model_name_or_path: 'TheBloke/dolphin-2.1-mistral-7B-AWQ', device_map: 'cuda:0', trust_remote_code: False, torch_dtype: 'torch.float16'
model dtype - torch.float16



tps - 19.14424932230798
model checkpoint - TheBloke/dolphin-2.2.1-mistral-7B-GPTQ
model_loading_params - model_name_or_path: 'TheBloke/dolphin-2.2.1-mistral-7B-GPTQ', use_cuda_fp16: False, use_safetensors: True, inject_fused_attention: True, inject_fused_m

In [4]:
# Checkpoint of the run with the highest average tokens-per-second.
(
    df
    .sort_values(by='avg_token_per_sec', ascending=False)['model_checkpoint']
    .iloc[0]
)

'/kaggle/input/thebloke-dolphin-2-2-1-mistral-7b-gptq'

In [9]:
# Loading parameters of the run with the highest average tokens-per-second.
(
    df
    .sort_values(by='avg_token_per_sec', ascending=False)['model_loading_params']
    .iloc[0]
)

"pretrained_model_name_or_path: '/kaggle/input/thebloke-dolphin-2-2-1-mistral-7b-gptq'"