<a href="https://colab.research.google.com/github/ShaswataJash/PytorchVisionModelComparision/blob/main/Pytorch_vision_model_comparision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the required Python dependencies (torchvision and torchinfo)

In [None]:
!pip install torchvision==0.14.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
!pip install torchinfo==1.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchinfo==1.7.1
  Downloading torchinfo-1.7.1-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.1


In [2]:
import torchvision
from torchvision.models import *
from torchinfo import summary
from tqdm import tqdm
import pandas as pd
import pprint
import numpy as np
import traceback

In [3]:
def my_import(name): #ref: https://stackoverflow.com/questions/547829/how-to-dynamically-load-a-python-class
    """Resolve a dotted path such as ``'pkg.module.Class.MEMBER'`` to the object it names.

    The first component is imported as a module; every following component is
    looked up as an attribute of the object resolved so far. When an attribute
    is missing, the accumulated dotted path is tried as a submodule import, so
    paths that cross a submodule which has not been imported yet still resolve.

    Args:
        name: Dotted path whose first component is an importable module.

    Returns:
        The module, class, function or attribute the path refers to.

    Raises:
        ImportError: If the first component cannot be imported.
        AttributeError: If a later component is neither an attribute of the
            object resolved so far nor an importable submodule.
    """
    import importlib  # local import: keeps this helper self-contained

    components = name.split('.')
    dotted_path = components[0]
    resolved = importlib.import_module(dotted_path)
    for comp in components[1:]:
        dotted_path = dotted_path + '.' + comp
        try:
            resolved = getattr(resolved, comp)
        except AttributeError as missing_attribute:
            # A package does not expose a submodule as an attribute until that
            # submodule has been imported, so try importing it explicitly.
            try:
                resolved = importlib.import_module(dotted_path)
            except ImportError:
                # Not a submodule either: report the original lookup failure.
                raise missing_attribute from None
    return resolved

# Vision model comparison

The models are compared on the following criteria:
* Top-5 accuracy (acc@5)
* Training time, for which the number of trainable parameters and the number of multiplications and additions involved in the model are used as representative metrics
* Memory required on the CPU or GPU for one batch. Unless a square image size is explicitly specified, the batch shape is assumed to be (2, 3, [recommended crop width for the model], [recommended crop height for the model])

The higher the accuracy the better is the model. The lower the training time required, the better is the model. The lower the memory required for the batch, the better is the model.

In [4]:
# Ask torchvision for the names of the models registered under torchvision.models;
# the resulting list of name strings drives the comparison loop further below.
classification_models = list_models(module=torchvision.models)
# Pretty-print the list, one model name per line.
pprint.pprint(classification_models)

['alexnet',
 'convnext_base',
 'convnext_large',
 'convnext_small',
 'convnext_tiny',
 'densenet121',
 'densenet161',
 'densenet169',
 'densenet201',
 'efficientnet_b0',
 'efficientnet_b1',
 'efficientnet_b2',
 'efficientnet_b3',
 'efficientnet_b4',
 'efficientnet_b5',
 'efficientnet_b6',
 'efficientnet_b7',
 'efficientnet_v2_l',
 'efficientnet_v2_m',
 'efficientnet_v2_s',
 'googlenet',
 'inception_v3',
 'maxvit_t',
 'mnasnet0_5',
 'mnasnet0_75',
 'mnasnet1_0',
 'mnasnet1_3',
 'mobilenet_v2',
 'mobilenet_v3_large',
 'mobilenet_v3_small',
 'regnet_x_16gf',
 'regnet_x_1_6gf',
 'regnet_x_32gf',
 'regnet_x_3_2gf',
 'regnet_x_400mf',
 'regnet_x_800mf',
 'regnet_x_8gf',
 'regnet_y_128gf',
 'regnet_y_16gf',
 'regnet_y_1_6gf',
 'regnet_y_32gf',
 'regnet_y_3_2gf',
 'regnet_y_400mf',
 'regnet_y_800mf',
 'regnet_y_8gf',
 'resnet101',
 'resnet152',
 'resnet18',
 'resnet34',
 'resnet50',
 'resnext101_32x8d',
 'resnext101_64x4d',
 'resnext50_32x4d',
 'shufflenet_v2_x0_5',
 'shufflenet_v2_x1_0',
 'sh

In [5]:
# square_image_size: side length used for the summary input; a negative value
# means "use the crop size recommended by each model's best weights".
square_image_size = -1 #@param {type:"integer"}
# batch_size: number of images in the batch used to estimate memory and mult-adds.
batch_size = 2 #@param {type:"integer"}
assert batch_size > 1, "batch_size must be greater than 1, otherwise the model summary in train mode errors out"

In [None]:
# Per-model result columns. Entry i of every list describes the same model, so
# each loop iteration must append exactly one value to every list.
model_name = []
accuracy_1 = []
accuracy_5 = []
best_weight_name = []
crop_for_best_accu = []
total_mul_add = []
total_gpu_mem_mb_reqd = []
param_count_calculated = []
param_count_from_meta = []
image_size_variant = []


def _summarize(model, dim_a, dim_b):
    """Run torchinfo's summary() on `model` in train mode for one input shape.

    The input shape is (batch_size, 3, dim_a, dim_b); `batch_size` is read from
    the notebook's configuration cell at call time.
    """
    return summary(model,
        input_size=(batch_size, 3, dim_a, dim_b), #batch size = 2 (keep it more than 1 or else error)
        col_width=15,
        depth=8,
        mode='train',
        col_names=["input_size", "output_size", "num_params", "kernel_size"])


for c in tqdm(classification_models):

    model_wt = get_model_weights(c)
    print(c, type(model_wt), model_wt.__name__)

    # Among all published weights of this model, keep the one with the highest
    # ImageNet-1K acc@5.
    best_wts = None
    best_acc_5 = 0.0
    for wts in model_wt:
        print(wts.name, wts.value)
        print(wts.name, wts.value.transforms)
        print(wts.value.transforms.keywords['crop_size'])
        imagenet_metrics = wts.value.meta['_metrics']['ImageNet-1K']
        print(imagenet_metrics['acc@1'])
        print(imagenet_metrics['acc@5'])
        if imagenet_metrics['acc@5'] > best_acc_5:
            best_acc_5 = imagenet_metrics['acc@5']
            best_wts = wts

    if best_wts is None:
        # Nothing to load or measure; skip before touching any result list so
        # that all columns stay the same length.
        print('no weights with a positive acc@5 found for model', c)
        print('\n===============================================================\n', flush=True)
        continue

    crop_size_corr_to_best_acc = best_wts.value.transforms.keywords['crop_size']
    weight_name_corr_to_best_acc = model_wt.__name__ + '.' + best_wts.name
    accu1_corr_to_best_acc = best_wts.value.meta['_metrics']['ImageNet-1K']['acc@1']
    param_count_corr_to_best_acc = best_wts.value.meta['num_params']

    model_name.append(c)
    accuracy_5.append(best_acc_5)
    accuracy_1.append(accu1_corr_to_best_acc)
    best_weight_name.append(weight_name_corr_to_best_acc)
    crop_for_best_accu.append(crop_size_corr_to_best_acc)
    param_count_from_meta.append(param_count_corr_to_best_acc)

    # The enum member found above is used directly; there is no need to resolve
    # it again from its dotted name.
    wt_enum = best_wts
    m = get_model(c, weights=wt_enum)
    my_sq_image_size = crop_size_corr_to_best_acc if square_image_size < 0 else square_image_size
    print(c, type(m), type(wt_enum), wt_enum.name, my_sq_image_size)

    try:
        sum_obj = _summarize(m, my_sq_image_size, my_sq_image_size)
        # Despite its name this value is in megabytes (result of to_megabytes()).
        gpu_bytes_required = sum_obj.to_megabytes(sum_obj.total_input + sum_obj.total_output_bytes + sum_obj.total_param_bytes)
        mult_adds = sum_obj.total_mult_adds
        default_parameter_count = sum_obj.total_params
        print(mult_adds,  gpu_bytes_required)
        del sum_obj
    except Exception:
        # Exception (not a bare except) so that interrupting the kernel still
        # stops the loop instead of being recorded as a model failure.
        print('failure observed for model', c)
        print(traceback.format_exc())

        total_mul_add.append(np.nan)
        total_gpu_mem_mb_reqd.append(np.nan)
        param_count_calculated.append(np.nan)
        image_size_variant.append(np.nan)
    else:
        # Append only after every value was computed, so a failure half-way
        # through cannot leave the result lists with different lengths.
        total_mul_add.append(mult_adds)
        total_gpu_mem_mb_reqd.append(gpu_bytes_required)
        param_count_calculated.append(default_parameter_count)

        # A deliberately non-square size to probe whether the model accepts
        # inputs other than its recommended crop. NOTE(review): the values are
        # passed to summary() as (..., non_standard_w, non_standard_h); under
        # PyTorch's usual NCHW layout the first one acts as the height.
        non_standard_w = my_sq_image_size*2
        non_standard_h = int(my_sq_image_size/2)

        try:
            #now let us check if param_count is changing if we change the size of the image
            sum_obj_non_default = _summarize(m, non_standard_w, non_standard_h)
            param_count_changed = sum_obj_non_default.total_params != default_parameter_count
            del sum_obj_non_default
        except Exception:
            print('non-standard image size not supported for model', c, 'non-standard w=', non_standard_w, 'non_standard h=', non_standard_h)
            print(traceback.format_exc())
            image_size_variant.append(1)
        else:
            if param_count_changed:
                print("MODEL ", c , "WILL NOT WORK FOR DIFFERENT IMAGE SIZE FOR TRANSFER LEARNING")
                image_size_variant.append(1) #actually code does not enter into this block as we will get always exception due to size check assertion within pytorch lib
            else:
                image_size_variant.append(0)

    print('\n===============================================================\n', flush=True)
    # Drop the references so the model can be freed before the next one is loaded.
    del model_wt
    del m

model_comparision_data = {'model': model_name, 'weight': best_weight_name, 
                          'accuracy_5': accuracy_5, 'accuracy_1': accuracy_1, 
                          'total_mul_add': total_mul_add, 'param_meta':param_count_from_meta,
                          'param_calcu': param_count_calculated,
                          'gpu_mem_reqd': total_gpu_mem_mb_reqd, 'crop_size': crop_for_best_accu, 
                          'image_size_variant': image_size_variant}
comp_df = pd.DataFrame.from_dict(model_comparision_data)

In [7]:
# Let pandas render every row of the comparison table instead of truncating it.
pd.set_option('display.max_rows', len(comp_df) + 1)

In [8]:
# Show the rows that contain at least one missing value
comp_df.loc[comp_df.isna().any(axis='columns')]

Unnamed: 0,model,weight,accuracy_5,accuracy_1,total_mul_add,param_meta,param_calcu,gpu_mem_reqd,crop_size,image_size_variant


In [None]:
# Models ranked from highest to lowest top-5 accuracy
comp_df.sort_values('accuracy_5', ascending=False)

In [None]:
# Models whose summary failed carry NaN in the measured columns, and NaN cannot
# be cast to an integer dtype, so those rows are dropped first. Explicit 64-bit
# integers are used because the mult-add counts do not fit into 32 bits.
_integer_dtypes = {'param_meta': 'int64', 'total_mul_add': 'int64', 'param_calcu': 'int64', 'image_size_variant': 'int8'}
comp_df_filtered = comp_df.dropna(subset=list(_integer_dtypes)).astype(_integer_dtypes)
comp_df_filtered

We have to normalize the values of the different metrics so that the models can be compared effectively on the same scale.

In [None]:
def _minmax(values):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    When every value is identical the range is zero; a Series of 0.0 is returned
    in that case instead of dividing by zero.
    """
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        return pd.Series(0.0, index=values.index)
    return (values - lowest) / span


df_z_scaled = comp_df_filtered.copy()

df_z_scaled['z_accuracy_5'] = _minmax(df_z_scaled['accuracy_5'])
df_z_scaled['z_total_mul_add'] = _minmax(df_z_scaled['total_mul_add'])
df_z_scaled['z_param_meta'] = _minmax(df_z_scaled['param_meta'])
df_z_scaled['z_gpu_mem_reqd'] = _minmax(df_z_scaled['gpu_mem_reqd'])
df_z_scaled['inverse_z_crop_size'] = 1.0 / (_minmax(df_z_scaled['crop_size']) + 1e-3) #adding a small number 1e-3 to avoid divide by zero
df_z_scaled

We can configure relative weights for each of the metric that will contribute in overall score against each model. Following pointers to be noted - 
* z_accuracy_5 : minmax-normalized metric corresponding to the top-5 accuracy (acc@5) of the model
* z_param_meta : minmax-normalized metric corresponding to total number of parameters to be trained (the higher is the parameter count, the slower will be to train the model)
* z_gpu_mem_reqd: minmax-normalized metric corresponding to amount of gpu memory required for the batch_size and image_size (wxh) combination
* z_total_mul_add: minmax-normalized metric corresponding to the number of multiplications and additions involved while training the model (the higher the multiplication-addition count, the slower the model is to train, although its effect is much more subtle than the effect of the trainable parameter count)
* inverse_z_crop_size: inverse of the minmax-normalized approved input image size. However, its effect applies only if the model's trainable parameter count varies according to the input image size. In other words, where ['image_size_variant'] = 0, this metric has no effect because transfer learning can be done easily irrespective of the input size. On the other hand, where the input image size has to be kept fixed, the input image must first be resized to the approved image size for the model before transfer learning can be done. Further, the bigger the input image size supported by the model, the better, as a bigger image can help the model extract more features. Thus, we take the inverse of the normalized value when computing this metric.

Of the metrics mentioned above, only z_accuracy_5 contributes positively to the overall score; all the others reduce the overall score because they are detrimental to model training (slowness, large hardware requirements in terms of GPU RAM, inability to use transfer learning for an arbitrary image size, etc.)

In [22]:
# Relative weight of each metric in the overall score; the weights must sum to 1.
accuracy_weight = 0.6 #@param {type:"number"}
param_meta_weight = 0.3 #@param {type:"number"}
gpu_mem_weight = 0.05 #@param {type:"number"}
total_mul_add_weight = 0.049975 #@param {type:"number"}
image_size_invariant_weight = 0.000025 #@param {type:"number"}
# Compare with a small tolerance: a sum of binary floats is frequently not
# exactly 1.0 even when the decimal values entered add up to 1.
_weight_total = accuracy_weight + param_meta_weight + gpu_mem_weight + total_mul_add_weight + image_size_invariant_weight
assert abs(_weight_total - 1.0) < 1e-9, "the metric weights must sum to 1.0, got " + repr(_weight_total)


In [23]:
# Overall score: the weighted accuracy is the only positive term; every cost
# metric is subtracted. The crop-size penalty is multiplied by
# image_size_variant, so it is zero for rows where that flag is 0.
df_z_scaled['score'] = (
    (accuracy_weight * df_z_scaled['z_accuracy_5'])
    - (param_meta_weight * df_z_scaled['z_param_meta'])
    - (gpu_mem_weight * df_z_scaled['z_gpu_mem_reqd'])
    - (total_mul_add_weight * df_z_scaled['z_total_mul_add'])
    - (df_z_scaled['image_size_variant'] * image_size_invariant_weight * df_z_scaled['inverse_z_crop_size'])
)

# Best-scoring models first
df_z_scaled.sort_values('score', ascending=False)

Unnamed: 0,model,weight,accuracy_5,accuracy_1,total_mul_add,param_meta,param_calcu,gpu_mem_reqd,crop_size,image_size_variant,z_accuracy_5,z_total_mul_add,z_param_meta,z_gpu_mem_reqd,inverse_z_crop_size,score
19,efficientnet_v2_s,EfficientNet_V2_S_Weights.IMAGENET1K_V1,96.878,84.228,16731614432,21458488,21458488,1233.32584,384,0,0.907479,0.022245,0.031423,0.12552,2.34449,0.527673
73,vit_b_16,ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1,97.65,85.304,795315152,86859496,86859496,845.566544,384,1,0.946811,0.00097,0.133044,0.084752,2.34449,0.523829
38,regnet_y_16gf,RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_E2E_V1,98.054,86.012,93470087480,83590140,83590140,2502.860256,384,0,0.967394,0.124689,0.127964,0.258994,2.34449,0.522866
22,maxvit_t,MaxVit_T_Weights.IMAGENET1K_V1,96.722,83.7,4027148096,30919624,30919624,904.518704,224,0,0.899531,0.005284,0.046124,0.09095,1000.0,0.52107
13,efficientnet_b4,EfficientNet_B4_Weights.IMAGENET1K_V1,96.594,83.384,8787852528,19341616,19341616,1664.000496,380,0,0.89301,0.01164,0.028134,0.170799,2.404461,0.518244
63,swin_v2_s,Swin_V2_S_Weights.IMAGENET1K_V1,96.816,83.712,106338512,49737442,49737442,525.893792,256,0,0.90432,5e-05,0.075363,0.051143,11.613541,0.517424
3,convnext_small,ConvNeXt_Small_Weights.IMAGENET1K_V1,96.65,83.616,822782480,50223688,50223688,616.326224,224,0,0.895863,0.001007,0.076119,0.06065,1000.0,0.511599
12,efficientnet_b3,EfficientNet_B3_Weights.IMAGENET1K_V1,96.054,82.008,3654499536,12233232,12233232,813.357168,300,0,0.865498,0.004787,0.017088,0.081366,4.923012,0.509865
18,efficientnet_v2_m,EfficientNet_V2_M_Weights.IMAGENET1K_V1,97.156,85.112,49165642472,54139356,54139356,3095.815392,480,0,0.921643,0.065544,0.082203,0.321335,1.466596,0.508982
4,convnext_tiny,ConvNeXt_Tiny_Weights.IMAGENET1K_V1,96.146,82.52,644743184,28589128,28589128,378.083408,224,0,0.870185,0.000769,0.042502,0.035602,1000.0,0.507542
