# CLIP Model Evaluation

Models evaluated include:<br>
1. <b>Anti-Expert Model:</b>
    - Image Embeddings - ResNet18
    - Text Embeddings - Uncased DistilBERT
1. <b>Expert Model:</b>
    - Image Embeddings - ResNet18
    - Text Embeddings - Uncased DistilBERT
1. <b>Base Target Model:</b>
    - Image Embeddings - ResNet50
    - Text Embeddings - Uncased DistilBERT
1. <b>Target Model:</b>
    - Image Embeddings - ResNet50
    - Text Embeddings - Uncased DistilBERT

## Loading Libs and Dataset

In [44]:
from utils import make_train_valid_dfs, build_loaders
from model import CLIPModel
from config import TrainingCFG, TextEncCFG

from transformers import DistilBertTokenizer
import torch
import torch.nn.functional as F
from tqdm import tqdm

import pandas as pd

In [45]:
# Directory holding the saved model checkpoints; checkpoint file names are
# appended to this prefix when loading.
model_dir = 'models/'

In [46]:
# Keep only the third frame returned by make_train_valid_dfs (used as the test
# split throughout this notebook) and preview its first rows.
_, _, test_df = make_train_valid_dfs()
test_df.head()

Unnamed: 0.1,Unnamed: 0,Filename,Label,ClassName
0,2438,AnnualCrop/AnnualCrop_1275.jpg,0,AnnualCrop
1,1018,Residential/Residential_504.jpg,7,Residential
2,3,Pasture/Pasture_787.jpg,5,Pasture
3,1011,Residential/Residential_1575.jpg,7,Residential
4,47,Pasture/Pasture_473.jpg,5,Pasture


### Text Embeddings from CLIP Projection Layer

In [47]:
def get_text_embeddings(df, model):
    """Embed every class name with the model's text encoder and projection.

    The unique ``ClassName`` values of ``df`` are arranged in a list so that
    each name sits at the index equal to its integer ``Label``. The names are
    tokenized together (sharing one padded length) and then passed one at a
    time through ``model.text_encoder`` and ``model.text_projection``.

    Args:
        df: DataFrame with ``ClassName`` and ``Label`` columns. Labels are
            used directly as list indices, so they must be integers in
            ``range(number_of_unique_class_names)``.
        model: Object exposing ``text_encoder(input_ids=..., attention_mask=...)``
            and ``text_projection(...)``.

    Returns:
        Tuple ``(embeddings, arranged_captions)``: the concatenation (along
        dim 0) of the projected embedding of each caption, and the list of
        class names ordered by label.
    """
    tokenizer = DistilBertTokenizer.from_pretrained(TextEncCFG.tokenizer)

    unique_captions = df['ClassName'].unique()
    arranged_captions = ['' for _ in unique_captions]

    # Slot each class name at the position given by its label so that row i
    # of the returned embeddings corresponds to label i.
    for class_name in unique_captions:
        label = int(df.loc[df['ClassName'] == class_name, 'Label'].iloc[0])
        arranged_captions[label] = class_name

    encoded_captions = tokenizer(
                list(arranged_captions),
                padding=True,
                truncation=True,
                max_length=TextEncCFG.max_length
            )
    # Diagnostic output of the token ids / attention masks (kept so the cell
    # output stays the same as before).
    print(encoded_captions)

    final_caption_embeddings = []
    # Evaluation only: disable autograd so the returned embeddings do not keep
    # a computation graph alive (mirrors get_image_embeddings).
    with torch.no_grad():
        for ids, mask in zip(encoded_captions["input_ids"],
                             encoded_captions["attention_mask"]):
            # Build integer tensors directly instead of going through a
            # float32 torch.Tensor and casting back to long.
            input_ids = torch.tensor(
                ids, dtype=torch.long, device=TrainingCFG.device).view(1, -1)
            attention_mask = torch.tensor(
                mask, dtype=torch.long, device=TrainingCFG.device).view(1, -1)

            text_features = model.text_encoder(
                            input_ids=input_ids,
                            attention_mask=attention_mask
                        )
            final_caption_embeddings.append(model.text_projection(text_features))

    return torch.cat(final_caption_embeddings), arranged_captions

### Image Embedding from CLIP Projection Layer

In [48]:
def get_image_embeddings(df, model):
    """Return the projected image embeddings for every image listed in ``df``.

    A "valid"-mode loader is built over ``df``; each batch's ``"image"``
    entry is moved to ``TrainingCFG.device``, run through
    ``model.image_encoder`` and ``model.image_projection`` with autograd
    disabled, and the per-batch results are concatenated along dim 0.

    Args:
        df: DataFrame accepted by ``build_loaders``.
        model: Object exposing ``image_encoder`` and ``image_projection``.
            It is used as passed in; no weights are loaded here.

    Returns:
        Tensor containing the concatenated image embeddings of all batches.
    """
    loader = build_loaders(
        df,
        DistilBertTokenizer.from_pretrained(TextEncCFG.tokenizer),
        mode="valid",
    )

    def _embed(batch):
        # Encode one batch of images and project it into the shared space.
        pixels = batch["image"].to(TrainingCFG.device)
        return model.image_projection(model.image_encoder(pixels))

    with torch.no_grad():
        per_batch = [_embed(batch) for batch in tqdm(loader)]
    return torch.cat(per_batch)

## 1. Anti Expert Model Evaluation

In [49]:
# Checkpoint file name of the anti-expert model, resolved relative to model_dir.
model_name = "anti-expert_resnet18.pt"

In [50]:
# Build a fresh CLIPModel, restore the checkpoint weights onto the configured
# device and switch the model to evaluation mode.
# NOTE(review): `tokenizer` and `valid_loader` are not read by any later cell
# visible in this notebook (get_image_embeddings builds its own loader) —
# confirm before removing them.
tokenizer = DistilBertTokenizer.from_pretrained(TextEncCFG.tokenizer)
valid_loader = build_loaders(test_df, tokenizer, mode="valid")
model = CLIPModel().to(TrainingCFG.device)
model.load_state_dict(torch.load(model_dir+model_name, map_location=TrainingCFG.device))
model.eval()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CLIPModel(
  (image_encoder): ImageEncoder(
    (model): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (drop_block): Identity()
          (act1): ReLU(inplace=True)
          (aa): Identity()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act2): ReLU(inplace=True)
        )
        (1): BasicBlock(
          (co

In [51]:
# Projected text embeddings for the class names, plus the class names ordered
# by label; the last line displays the embedding shape.
caption_embeddings, captions = get_text_embeddings(test_df, model)
caption_embeddings.shape

{'input_ids': [[101, 3296, 26775, 7361, 102, 0, 0], [101, 3224, 102, 0, 0, 0, 0], [101, 12810, 25560, 3726, 18150, 3370, 102], [101, 3307, 102, 0, 0, 0, 0], [101, 3919, 102, 0, 0, 0, 0], [101, 20787, 102, 0, 0, 0, 0], [101, 4568, 26775, 7361, 102, 0, 0], [101, 5647, 102, 0, 0, 0, 0], [101, 2314, 102, 0, 0, 0, 0], [101, 7744, 13808, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0]]}


torch.Size([10, 256])

In [52]:
# Projected image embeddings for the images in the test split; the last line
# displays their shape.
image_embeddings =  get_image_embeddings(test_df, model)
image_embeddings.shape

100%|██████████| 85/85 [00:05<00:00, 14.66it/s]


torch.Size([2700, 256])

In [53]:
# Fuse the two modalities: L2-normalise the image and text embeddings, then
# take their dot product, i.e. the cosine similarity between every image and
# every class caption.
image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
text_embeddings_n = F.normalize(caption_embeddings, p=2, dim=-1)
dot_similarity = image_embeddings_n @ text_embeddings_n.T
dot_similarity.shape

torch.Size([2700, 10])

In [54]:
# Logits-based prediction: for each image pick the class whose caption is most
# similar. Softmax is monotonic, so the argmax is the same as on the raw
# similarities; the result is converted to a plain list for scikit-learn.
preds = F.softmax(dot_similarity, dim =1)
pred_classes = torch.argmax(preds, axis=1)
print(pred_classes)
y_preds = pred_classes.tolist()

tensor([7, 2, 7,  ..., 2, 2, 2], device='cuda:4')


In [55]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the test
# labels; `captions` (class names ordered by label) supplies the row names.
print(classification_report(test_df['Label'], y_preds, target_names=captions))

                      precision    recall  f1-score   support

          AnnualCrop       0.00      0.00      0.00       300
              Forest       0.00      0.00      0.00       300
HerbaceousVegetation       0.08      0.56      0.14       300
             Highway       0.40      0.01      0.02       250
          Industrial       0.00      0.00      0.00       250
             Pasture       0.00      0.00      0.00       200
       PermanentCrop       0.00      0.00      0.00       250
         Residential       0.02      0.03      0.02       300
               River       0.00      0.00      0.00       250
             SeaLake       0.03      0.00      0.01       300

            accuracy                           0.07      2700
           macro avg       0.05      0.06      0.02      2700
        weighted avg       0.05      0.07      0.02      2700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 2. Expert Model Evaluation

In [57]:
# Checkpoint file name of the tuned (expert) ResNet18 model, resolved relative
# to model_dir.
model_name = "tuned_CLIP_resnet18_8.pt"

In [58]:
# Build a fresh CLIPModel, restore the expert checkpoint onto the configured
# device and switch the model to evaluation mode.
# NOTE(review): `tokenizer` and `valid_loader` are not read by any later cell
# visible in this notebook — confirm before removing them.
tokenizer = DistilBertTokenizer.from_pretrained(TextEncCFG.tokenizer)
valid_loader = build_loaders(test_df, tokenizer, mode="valid")
model = CLIPModel().to(TrainingCFG.device)
model.load_state_dict(torch.load(model_dir+model_name, map_location=TrainingCFG.device))
model.eval()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CLIPModel(
  (image_encoder): ImageEncoder(
    (model): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (drop_block): Identity()
          (act1): ReLU(inplace=True)
          (aa): Identity()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act2): ReLU(inplace=True)
        )
        (1): BasicBlock(
          (co

In [59]:
# Projected text embeddings for the class names (expert model), plus the class
# names ordered by label; the last line displays the embedding shape.
caption_embeddings, captions = get_text_embeddings(test_df, model)
caption_embeddings.shape

{'input_ids': [[101, 3296, 26775, 7361, 102, 0, 0], [101, 3224, 102, 0, 0, 0, 0], [101, 12810, 25560, 3726, 18150, 3370, 102], [101, 3307, 102, 0, 0, 0, 0], [101, 3919, 102, 0, 0, 0, 0], [101, 20787, 102, 0, 0, 0, 0], [101, 4568, 26775, 7361, 102, 0, 0], [101, 5647, 102, 0, 0, 0, 0], [101, 2314, 102, 0, 0, 0, 0], [101, 7744, 13808, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0]]}


torch.Size([10, 256])

In [66]:
# Projected image embeddings for the images in the test split (expert model);
# the last line displays their shape.
image_embeddings =  get_image_embeddings(test_df, model)
image_embeddings.shape

100%|██████████| 85/85 [00:04<00:00, 18.78it/s]


torch.Size([2700, 256])

In [67]:
# Fuse the two modalities: L2-normalise the image and text embeddings, then
# take their dot product, i.e. the cosine similarity between every image and
# every class caption.
image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
text_embeddings_n = F.normalize(caption_embeddings, p=2, dim=-1)
dot_similarity = image_embeddings_n @ text_embeddings_n.T
dot_similarity.shape

torch.Size([2700, 10])

In [68]:
# Logits-based prediction: for each image pick the class whose caption is most
# similar. Softmax is monotonic, so the argmax is the same as on the raw
# similarities; the result is converted to a plain list for scikit-learn.
preds = F.softmax(dot_similarity, dim =1)
pred_classes = torch.argmax(preds, axis=1)
print(pred_classes)
y_preds = pred_classes.tolist()

tensor([0, 7, 5,  ..., 9, 8, 0], device='cuda:4')


In [69]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the expert model's predictions against
# the test labels; `captions` (class names ordered by label) names the rows.
print(classification_report(test_df['Label'], y_preds, target_names=captions))

                      precision    recall  f1-score   support

          AnnualCrop       1.00      0.96      0.98       300
              Forest       0.92      1.00      0.96       300
HerbaceousVegetation       1.00      0.82      0.90       300
             Highway       0.97      0.96      0.97       250
          Industrial       0.98      1.00      0.99       250
             Pasture       0.95      0.99      0.97       200
       PermanentCrop       0.88      1.00      0.93       250
         Residential       0.99      1.00      1.00       300
               River       0.98      0.96      0.97       250
             SeaLake       1.00      0.99      1.00       300

            accuracy                           0.97      2700
           macro avg       0.97      0.97      0.97      2700
        weighted avg       0.97      0.97      0.97      2700



## 3. Base Target Model

In [77]:
# Checkpoint file name of the base (not fine-tuned) ResNet50 target model,
# resolved relative to model_dir.
model_name = "base_target_model_resnet50.pt"

In [78]:
# Build a fresh CLIPModel, restore the base target checkpoint onto the
# configured device and switch the model to evaluation mode.
# NOTE(review): CLIPModel() takes no arguments here, so the image backbone is
# whatever the project config currently selects — confirm it matches the
# ResNet50 checkpoint being loaded.
# NOTE(review): `tokenizer` and `valid_loader` are not read by any later cell
# visible in this notebook — confirm before removing them.
tokenizer = DistilBertTokenizer.from_pretrained(TextEncCFG.tokenizer)
valid_loader = build_loaders(test_df, tokenizer, mode="valid")
model = CLIPModel().to(TrainingCFG.device)
model.load_state_dict(torch.load(model_dir+model_name, map_location=TrainingCFG.device))
model.eval()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CLIPModel(
  (image_encoder): ImageEncoder(
    (model): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (drop_block): Identity()
          (act1): ReLU(inplace=True)
          (aa): Identity()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act2): ReLU(inplace=True)
        )
        (1): BasicBlock(
          (co

In [79]:
# Projected text embeddings for the class names (base target model), plus the
# class names ordered by label; the last line displays the embedding shape.
caption_embeddings, captions = get_text_embeddings(test_df, model)
caption_embeddings.shape

{'input_ids': [[101, 3296, 26775, 7361, 102, 0, 0], [101, 3224, 102, 0, 0, 0, 0], [101, 12810, 25560, 3726, 18150, 3370, 102], [101, 3307, 102, 0, 0, 0, 0], [101, 3919, 102, 0, 0, 0, 0], [101, 20787, 102, 0, 0, 0, 0], [101, 4568, 26775, 7361, 102, 0, 0], [101, 5647, 102, 0, 0, 0, 0], [101, 2314, 102, 0, 0, 0, 0], [101, 7744, 13808, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0]]}


torch.Size([10, 256])

In [80]:
# Projected image embeddings for the images in the test split (base target
# model); the last line displays their shape.
image_embeddings =  get_image_embeddings(test_df, model)
image_embeddings.shape

100%|██████████| 85/85 [00:09<00:00,  8.75it/s]


torch.Size([2700, 256])

In [81]:
# Fuse the two modalities: L2-normalise the image and text embeddings, then
# take their dot product, i.e. the cosine similarity between every image and
# every class caption.
image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
text_embeddings_n = F.normalize(caption_embeddings, p=2, dim=-1)
dot_similarity = image_embeddings_n @ text_embeddings_n.T
dot_similarity.shape

torch.Size([2700, 10])

In [82]:
# Logits-based prediction: for each image pick the class whose caption is most
# similar. Softmax is monotonic, so the argmax is the same as on the raw
# similarities; the result is converted to a plain list for scikit-learn.
preds = F.softmax(dot_similarity, dim =1)
pred_classes = torch.argmax(preds, axis=1)
print(pred_classes)
y_preds = pred_classes.tolist()

tensor([7, 6, 4,  ..., 9, 4, 7], device='cuda:4')


In [83]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the base target model's predictions
# against the test labels; `captions` (ordered by label) names the rows.
print(classification_report(test_df['Label'], y_preds, target_names=captions))

                      precision    recall  f1-score   support

          AnnualCrop       0.00      0.00      0.00       300
              Forest       0.22      0.07      0.11       300
HerbaceousVegetation       0.00      0.00      0.00       300
             Highway       0.04      0.04      0.04       250
          Industrial       0.29      0.69      0.41       250
             Pasture       0.00      0.00      0.00       200
       PermanentCrop       0.08      0.05      0.06       250
         Residential       0.07      0.19      0.10       300
               River       0.00      0.00      0.00       250
             SeaLake       0.42      0.86      0.56       300

            accuracy                           0.20      2700
           macro avg       0.11      0.19      0.13      2700
        weighted avg       0.12      0.20      0.13      2700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



## 4. Tuned Target Model

In [84]:
# Checkpoint file name of the tuned ResNet50 target model, resolved relative
# to model_dir.
model_name = "tuned_CLIP_resnet50_8.pt"

In [85]:
# Build a fresh CLIPModel, restore the tuned target checkpoint onto the
# configured device and switch the model to evaluation mode.
# NOTE(review): CLIPModel() takes no arguments here, so the image backbone is
# whatever the project config currently selects — confirm it matches the
# ResNet50 checkpoint being loaded.
# NOTE(review): `tokenizer` and `valid_loader` are not read by any later cell
# visible in this notebook — confirm before removing them.
tokenizer = DistilBertTokenizer.from_pretrained(TextEncCFG.tokenizer)
valid_loader = build_loaders(test_df, tokenizer, mode="valid")
model = CLIPModel().to(TrainingCFG.device)
model.load_state_dict(torch.load(model_dir+model_name, map_location=TrainingCFG.device))
model.eval()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CLIPModel(
  (image_encoder): ImageEncoder(
    (model): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (drop_block): Identity()
          (act1): ReLU(inplace=True)
          (aa): Identity()
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act2): ReLU(inplace=True)
        )
        (1): BasicBlock(
          (co

In [86]:
# Projected text embeddings for the class names (tuned target model), plus the
# class names ordered by label; the last line displays the embedding shape.
# `captions` is also read later by the proxy-tuning report cell.
caption_embeddings, captions = get_text_embeddings(test_df, model)
caption_embeddings.shape

{'input_ids': [[101, 3296, 26775, 7361, 102, 0, 0], [101, 3224, 102, 0, 0, 0, 0], [101, 12810, 25560, 3726, 18150, 3370, 102], [101, 3307, 102, 0, 0, 0, 0], [101, 3919, 102, 0, 0, 0, 0], [101, 20787, 102, 0, 0, 0, 0], [101, 4568, 26775, 7361, 102, 0, 0], [101, 5647, 102, 0, 0, 0, 0], [101, 2314, 102, 0, 0, 0, 0], [101, 7744, 13808, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0]]}


torch.Size([10, 256])

In [87]:
# Projected image embeddings for the images in the test split (tuned target
# model); the last line displays their shape.
image_embeddings =  get_image_embeddings(test_df, model)
image_embeddings.shape

100%|██████████| 85/85 [00:05<00:00, 14.71it/s]


torch.Size([2700, 256])

In [88]:
# Fuse the two modalities: L2-normalise the image and text embeddings, then
# take their dot product, i.e. the cosine similarity between every image and
# every class caption.
image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
text_embeddings_n = F.normalize(caption_embeddings, p=2, dim=-1)
dot_similarity = image_embeddings_n @ text_embeddings_n.T
dot_similarity.shape

torch.Size([2700, 10])

In [89]:
# Logits-based prediction: for each image pick the class whose caption is most
# similar. Softmax is monotonic, so the argmax is the same as on the raw
# similarities; the result is converted to a plain list for scikit-learn.
preds = F.softmax(dot_similarity, dim =1)
pred_classes = torch.argmax(preds, axis=1)
print(pred_classes)
y_preds = pred_classes.tolist()

tensor([0, 7, 5,  ..., 9, 8, 0], device='cuda:4')


In [90]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the tuned target model's predictions
# against the test labels; `captions` (ordered by label) names the rows.
print(classification_report(test_df['Label'], y_preds, target_names=captions))

                      precision    recall  f1-score   support

          AnnualCrop       0.98      0.99      0.99       300
              Forest       0.98      0.99      0.98       300
HerbaceousVegetation       0.98      0.94      0.96       300
             Highway       0.95      0.98      0.96       250
          Industrial       1.00      0.99      0.99       250
             Pasture       0.97      0.98      0.98       200
       PermanentCrop       0.95      0.98      0.97       250
         Residential       1.00      0.99      0.99       300
               River       0.98      0.95      0.96       250
             SeaLake       1.00      0.99      1.00       300

            accuracy                           0.98      2700
           macro avg       0.98      0.98      0.98      2700
        weighted avg       0.98      0.98      0.98      2700



# Results 

| Model Name | Trained | Classification Accuracy |
|--------|--------|--------|
|Anti-Expert CLIP small| ❌ | 7%|
|Expert CLIP small| ✅ | 97%|
|Base Target CLIP | ❌ | 20%|
|Tuned Target CLIP | ✅ | 98%|

# Proxy Tuning

Adjust the logits of the base target model by adding the difference between the logits of the expert and the anti-expert models:
`proxy_tuned_logits = target_logits + (expert_logits - anti_expert_logits)`

In [91]:
# Reload the splits, this time also keeping the second returned frame as
# val_df alongside test_df.
_, val_df, test_df = make_train_valid_dfs()

In [92]:
def get_logits(model_name:str, valid_df:pd.DataFrame) -> torch.Tensor:
    """Load a CLIP checkpoint and score every image in ``valid_df`` against every class.

    Args:
        model_name: Path of the checkpoint file passed to ``torch.load``
            (despite the parameter name, callers pass the full path).
        valid_df: DataFrame handed to ``get_text_embeddings`` and
            ``get_image_embeddings``.

    Returns:
        Matrix of dot products between the L2-normalised image embeddings
        (rows) and the L2-normalised class-caption embeddings (columns),
        i.e. cosine similarities used as logits.
    """
    model = CLIPModel().to(TrainingCFG.device)
    model.load_state_dict(torch.load(model_name, map_location=TrainingCFG.device))
    model.eval()

    # Pure inference: without no_grad the returned logits would hold an
    # autograd graph that keeps the whole loaded model alive after this
    # function returns.
    with torch.no_grad():
        # The ordered class names returned alongside the embeddings are not
        # needed here.
        caption_embeddings, _ = get_text_embeddings(valid_df, model)
        image_embeddings = get_image_embeddings(valid_df, model)
        image_embeddings_n = F.normalize(image_embeddings, p=2, dim=-1)
        text_embeddings_n = F.normalize(caption_embeddings, p=2, dim=-1)
        dot_similarity = image_embeddings_n @ text_embeddings_n.T
    return dot_similarity

In [93]:

# Proxy tuning on the test split: compute the similarity logits of the
# anti-expert, the expert and the base target checkpoints.
# NOTE(review): val_df is loaded in the previous cell but only test_df is
# scored here.

anti_expert_logits = get_logits(model_dir+'anti-expert_resnet18.pt', test_df)
expert_model_logits = get_logits(model_dir+'tuned_CLIP_resnet18_8.pt', test_df)
target_model_logits = get_logits(model_dir+'base_target_model_resnet50.pt', test_df)

# Shift the base target logits by the expert/anti-expert difference.
# From this point on `target_model_logits` holds the proxy-tuned logits.
target_model_logits = target_model_logits + (
    expert_model_logits - anti_expert_logits)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': [[101, 3296, 26775, 7361, 102, 0, 0], [101, 3224, 102, 0, 0, 0, 0], [101, 12810, 25560, 3726, 18150, 3370, 102], [101, 3307, 102, 0, 0, 0, 0], [101, 3919, 102, 0, 0, 0, 0], [101, 20787, 102, 0, 0, 0, 0], [101, 4568, 26775, 7361, 102, 0, 0], [101, 5647, 102, 0, 0, 0, 0], [101, 2314, 102, 0, 0, 0, 0], [101, 7744, 13808, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0]]}


100%|██████████| 85/85 [00:10<00:00,  8.12it/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': [[101, 3296, 26775, 7361, 102, 0, 0], [101, 3224, 102, 0, 0, 0, 0], [101, 12810, 25560, 3726, 18150, 3370, 102], [101, 3307, 102, 0, 0, 0, 0], [101, 3919, 102, 0, 0, 0, 0], [101, 20787, 102, 0, 0, 0, 0], [101, 4568, 26775, 7361, 102, 0, 0], [101, 5647, 102, 0, 0, 0, 0], [101, 2314, 102, 0, 0, 0, 0], [101, 7744, 13808, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0]]}


100%|██████████| 85/85 [00:11<00:00,  7.67it/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': [[101, 3296, 26775, 7361, 102, 0, 0], [101, 3224, 102, 0, 0, 0, 0], [101, 12810, 25560, 3726, 18150, 3370, 102], [101, 3307, 102, 0, 0, 0, 0], [101, 3919, 102, 0, 0, 0, 0], [101, 20787, 102, 0, 0, 0, 0], [101, 4568, 26775, 7361, 102, 0, 0], [101, 5647, 102, 0, 0, 0, 0], [101, 2314, 102, 0, 0, 0, 0], [101, 7744, 13808, 102, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0]]}


100%|██████████| 85/85 [00:10<00:00,  7.74it/s]


In [94]:
# Predicted class per image from the proxy-tuned logits (softmax is monotonic,
# so the argmax equals that of the logits), converted to a plain list.
preds = F.softmax(target_model_logits, dim =1)
pred_classes = torch.argmax(preds, axis=1)
y_preds = pred_classes.tolist()

In [95]:
# Ground-truth labels of the test split as a plain Python list.
y = test_df['Label'].to_list()

In [96]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the proxy-tuned predictions.
# NOTE(review): `captions` is the notebook-global left by an earlier
# get_text_embeddings cell (get_logits does not return it), so this cell
# depends on that cell having been run.
print(classification_report(y, y_preds, target_names=captions))

                      precision    recall  f1-score   support

          AnnualCrop       0.94      0.67      0.79       300
              Forest       0.83      0.98      0.90       300
HerbaceousVegetation       0.00      0.00      0.00       300
             Highway       0.69      0.30      0.41       250
          Industrial       0.63      0.99      0.77       250
             Pasture       0.96      0.39      0.55       200
       PermanentCrop       0.63      0.89      0.74       250
         Residential       0.99      0.78      0.88       300
               River       0.36      0.92      0.52       250
             SeaLake       0.92      0.97      0.94       300

            accuracy                           0.69      2700
           macro avg       0.69      0.69      0.65      2700
        weighted avg       0.69      0.69      0.66      2700



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Proxy tuning works!! 🎉🎉

The large base target model gains 49 percentage points of accuracy after proxy tuning (20% → 69%), without any of its own weights being updated.