<a href="https://colab.research.google.com/github/Pengyu-gis/RemoteCLIP/blob/main/open_clip_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 准备数据与环境
使用的数据是Flickr 8k Dataset
数据集地址: https://www.kaggle.com/datasets/adityajn105/flickr8k/code

In [None]:
from google.colab import files

# 上传 kaggle.json
uploaded = files.upload()

# 确保 kaggle.json 被正确上传
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# 创建 kaggle 目录并移动 kaggle.json 到该目录
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

# 更改权限
!chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 68 bytes


In [None]:
!kaggle datasets download -d adityajn105/flickr8k
!unzip flickr8k.zip

In [3]:
!pip install open_clip_torch

Installing collected packages: ftfy, timm, open_clip_torch
Successfully installed ftfy-6.1.3 open_clip_torch-2.24.0 timm-0.9.16


## Test your settings
使用open_clip提供的接口来加载预训练的CLIP模型。可以选择一个适合您任务的模型版本

In [None]:
import torch
from PIL import Image
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
tokenizer = open_clip.get_tokenizer('ViT-B-32')

img_path = "/content/Images/1000268201_693b08cb0e.jpg"
image = preprocess(Image.open(img_path)).unsqueeze(0)
text = tokenizer(["a diagram", "a dog", "a cat"])

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


open_clip_pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Label probs: tensor([[0.3707, 0.2372, 0.3920]])


## 定义数据加载器
为了能够加载TIFF图像和对应的文本描述, 需要定义一个自定义的torch.utils.data.Dataset。以下是一个示例实现:

In [None]:
import os
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

class ImageTextDataset(Dataset):
    def __init__(self, annotations_file, img_dir, transform=None):
        # 使用pandas读取文本文件，假设字段之间是由逗号分隔的
        self.img_labels = pd.read_csv(annotations_file, delimiter=',')
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        image = Image.open(img_path).convert("RGB")  # 读取JPG文件
        caption = self.img_labels.iloc[idx, 1]
        if self.transform:
            image = self.transform(image)
        return image, caption

# 设置数据转换
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# 创建数据集和数据加载器
dataset = ImageTextDataset(annotations_file='/content/captions.txt',
                           img_dir='/content/Images',
                           transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


## 微调模型
一旦定义了数据加载器，就可以开始微调模型。这涉及到迭代数据加载器，将每批图像和文本送入模型，计算损失，并更新模型的权重。以下是微调过程的一个简化示例:

In [None]:
from torch import nn, optim, from_numpy
import numpy as np
from open_clip import tokenize

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"

# If the model isn't automatically moved to the correct device, explicitly do so
model = model.to(device)


# 假设已经定义了optimizer和loss function
optimizer = optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 3  # Example number of epochs

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (images, captions) in enumerate(dataloader):
        images = images.to("cuda")
        text_tokens = tokenize(captions).to("cuda")  # Ensure captions are properly processed if needed

        # Zero the parameter gradients
        optimizer.zero_grad()
        # Temporarily capture the entire output
        output = model(images, text_tokens)

        image_features, text_features, _ = output

        # Example: Calculating a simple similarity-based loss
        similarity = torch.nn.functional.cosine_similarity(image_features, text_features, dim=1)

        # Compute loss
        # loss = criterion(image_features, text_features)  # Placeholder, adjust as necessary
        loss = 1 - similarity.mean()  # Aiming to maximize similarity

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i % 100 == 99:  # Print every 100 mini-batches
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 100}")
            running_loss = 0.0

    print(f"Epoch {epoch+1}, Loss: {running_loss/len(dataloader)}")


[1, 100] loss: 0.024316540360450743
[1, 200] loss: 9.479224681854248e-05
[1, 300] loss: 5.9485435485839844e-05
[1, 400] loss: 3.934681415557861e-05
[1, 500] loss: 2.875804901123047e-05
[1, 600] loss: 2.2185444831848143e-05
[1, 700] loss: 1.7600059509277342e-05
[1, 800] loss: 1.4042258262634278e-05
[1, 900] loss: 1.1723041534423829e-05
[1, 1000] loss: 1.0088682174682617e-05
[1, 1100] loss: 8.603930473327636e-06
[1, 1200] loss: 7.09235668182373e-06
Epoch 1, Loss: 3.2417387830410077e-07
[2, 100] loss: 5.652308464050293e-06
[2, 200] loss: 5.199909210205078e-06
[2, 300] loss: 4.56392765045166e-06
[2, 400] loss: 4.236102104187011e-06
[2, 500] loss: 4.425644874572754e-06
[2, 600] loss: 4.649758338928223e-06
[2, 700] loss: 5.393624305725097e-06
[2, 800] loss: 4.380345344543457e-06
[2, 900] loss: 5.040168762207031e-06
[2, 1000] loss: 9.267926216125489e-06
[2, 1100] loss: 4.772543907165527e-06
[2, 1200] loss: 5.10871410369873e-06
Epoch 2, Loss: 2.3997348287831184e-07
[3, 100] loss: 8.63671302795

In [None]:
# Save the model's state dictionary
torch.save(model.state_dict(), '/content/model_save/model_state_dict.pth')

In [None]:
# Load the entire model
model = torch.save(model, '/content/model_save/complete_model.pth')
model.eval()  # Set the model to evaluation mode

In [None]:
# Load the model state
model.load_state_dict(torch.load('/content/model_save/model_state_dict.pth'))

# Move the model to evaluation mode
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [1]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
!cp /content/model_save/model_state_dict.pth /content/drive/MyDrive/


## Test model for zero-shot

In [4]:
import torch
import open_clip

# Define the model architecture (should be the same as used for training)
device = "cuda" if torch.cuda.is_available() else "cpu"
# model, preprocess = clip.load("ViT-B/32", device=device)  # Assuming you used ViT-B/32 for training
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')

# Load the state dictionary
model.load_state_dict(torch.load('/content/gdrive/MyDrive/my_clip/model_state_dict.pth'))

# Move model to evaluation mode
model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


open_clip_pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine

In [5]:
from PIL import Image

# Load and preprocess an image
image_path = '/content/test_Image.jpg'
image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)


In [9]:
if image.device != next(model.parameters()).device:
    model.to(image.device)

In [7]:
text_descriptions = ["a photo of a cat", "a photo of a dog", "a photo of a car"]  # Example labels
text_tokens = open_clip.tokenize(text_descriptions).to(device)


In [10]:
with torch.no_grad():  # No gradients needed for inference
    # Forward pass
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_tokens)

    # Calculate similarities
    similarities = (image_features @ text_features.T).softmax(dim=-1)
    predicted_indices = similarities.argmax(dim=-1)

    # Print the most similar text description
    for index in predicted_indices:
        print(f"Predicted description: {text_descriptions[index]}")


Predicted description: a photo of a cat
