In [50]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np

# 设置图片序号
myidx = 204

# 加载 CIFAR-10 数据集
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()

# 数据预处理
train_images, test_images = train_images / 255.0, test_images / 255.0

# 构建 ResNet50 模型
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(32, 32, 3))
base_model.trainable = False  # 冻结模型的权重

model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(10, activation='softmax')
])

# 编译模型
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# 设置训练周期
epochs = 10

# 在测试集上评估模型
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print(f'\nTest accuracy: {test_acc}')

# 使用模型进行预测
predictions = model.predict(test_images)

# 随机选择一张图片进行展示
index = myidx
sample_image = test_images[index]
sample_label = test_labels[index]

# 训练模型
model.fit(train_images, train_labels, epochs=epochs, validation_data=(test_images, test_labels), verbose=2)

# 在测试集上再次评估模型
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print(f'\nTest accuracy after {epochs} epochs: {test_acc}')

# 使用模型进行预测
predictions = model.predict(test_images)


313/313 - 7s - loss: 4.0168 - accuracy: 0.1001 - 7s/epoch - 23ms/step

Test accuracy: 0.10010000318288803
Epoch 1/10


2023-12-02 09:24:54.726281: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fa588645820 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-02 09:24:54.726306: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A30 MIG 1g.6gb, Compute Capability 8.0
2023-12-02 09:24:54.731718: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1701509094.846146    8844 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


1563/1563 - 34s - loss: 2.0820 - accuracy: 0.2407 - val_loss: 1.9780 - val_accuracy: 0.2742 - 34s/epoch - 22ms/step
Epoch 2/10
1563/1563 - 28s - loss: 1.9198 - accuracy: 0.3084 - val_loss: 1.8727 - val_accuracy: 0.3387 - 28s/epoch - 18ms/step
Epoch 3/10
1563/1563 - 28s - loss: 1.8654 - accuracy: 0.3325 - val_loss: 1.8127 - val_accuracy: 0.3570 - 28s/epoch - 18ms/step
Epoch 4/10
1563/1563 - 28s - loss: 1.8288 - accuracy: 0.3463 - val_loss: 1.7952 - val_accuracy: 0.3469 - 28s/epoch - 18ms/step
Epoch 5/10
1563/1563 - 24s - loss: 1.8043 - accuracy: 0.3550 - val_loss: 1.8068 - val_accuracy: 0.3450 - 24s/epoch - 15ms/step
Epoch 6/10
1563/1563 - 24s - loss: 1.7834 - accuracy: 0.3658 - val_loss: 1.7916 - val_accuracy: 0.3576 - 24s/epoch - 15ms/step
Epoch 7/10
1563/1563 - 24s - loss: 1.7677 - accuracy: 0.3704 - val_loss: 1.7854 - val_accuracy: 0.3571 - 24s/epoch - 15ms/step
Epoch 8/10
1563/1563 - 23s - loss: 1.7539 - accuracy: 0.3786 - val_loss: 1.7421 - val_accuracy: 0.3865 - 23s/epoch - 15ms/

In [48]:
import os
import clip
import torch
from torchvision.datasets import CIFAR10
import random

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the dataset
cifar10 = CIFAR10(root=os.path.expanduser("~/.cache"), download=True, train=False)


#设置计数器，看能预测对几个
count = 0
time=0
# 重复十次
for _ in range(10):
    # 选择随机的十个索引
    random_indices = random.sample(range(len(cifar10)), 10)
    
    # 在一个循环中遍历这些索引
    for myidx in random_indices:
        # Prepare the inputs
        image, class_id = cifar10[myidx]
        image_input = preprocess(image).unsqueeze(0).to(device)
        text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar10.classes]).to(device)

        # Calculate features
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            text_features = model.encode_text(text_inputs)

        # Pick the top 5 most similar labels for the image
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].topk(1)

        # Print the result
        print("\nCLIP_Top predictions:\n")
        CLIP_pdt_label = ""
        for value, index in zip(values, indices):
            print(f"{cifar10.classes[index]:>16s}: {100 * value.item():.2f}%")
            CLIP_pdt_label = cifar10.classes[index]
        #print(CLIP_pdt_label)
        # 打印真实标签和RN50模型预测结果
        print(f"True label: {class_id}")
        print(f"True class: {cifar10.classes[class_id]}")
        
        #predicted_class = indices[0].item()
        #print(f"Predicted class: {cifar10.classes[predicted_class]}")
        time+=1
        if CLIP_pdt_label==cifar10.classes[class_id]:
            count += 1
        # 显示图片
        import matplotlib.pyplot as plt
        #plt.imshow(image)
        #plt.show()
print(count)
print(time)

Files already downloaded and verified

CLIP_Top predictions:

           horse: 98.83%
True label: 7
True class: horse

CLIP_Top predictions:

            bird: 98.73%
True label: 2
True class: bird

CLIP_Top predictions:

           horse: 98.05%
True label: 7
True class: horse

CLIP_Top predictions:

            deer: 99.12%
True label: 4
True class: deer

CLIP_Top predictions:

      automobile: 63.77%
True label: 9
True class: truck

CLIP_Top predictions:

            ship: 99.17%
True label: 8
True class: ship

CLIP_Top predictions:

            bird: 99.17%
True label: 2
True class: bird

CLIP_Top predictions:

           horse: 97.51%
True label: 7
True class: horse

CLIP_Top predictions:

            ship: 99.27%
True label: 8
True class: ship

CLIP_Top predictions:

            ship: 98.97%
True label: 8
True class: ship

CLIP_Top predictions:

           horse: 32.69%
True label: 7
True class: horse

CLIP_Top predictions:

           horse: 93.90%
True label: 7
True class: ho

In [52]:
#设计思路：通过设计实验，让clip对于cifar10进行zero-shot测试，随机测试集的样本进行测试
#可以发现，clip对于cifar10的准确率在86%左右
#容易出现错误的情况一般为horse和deer的混淆，以及对frog的预测不准。
#对于其中预测错误的样例，尝试进行人眼观察，发现均难以辨别，基本可以认为cifar10的预测水平堪比人类。
#以ResNet50作为基准模型，在十次训练后rn50的准确率在40%，远远不如clip10，并且训练消耗时间比较多

In [51]:
#CIFAR-10 is a widely used dataset for image classification tasks. 
#It consists of 60,000 32x32 color images in 10 different classes,
#with 6,000 images per class. The classes include common objects 
#such as airplanes, automobiles, birds, cats, and more.

In [53]:
#本次实验聚焦于clip在计算机视觉领域的应用，通过cifar10测试集评估clip的性能。我们发现，尽管
#clip注重于文本和图片之间的联系，但是他在计算机视觉领域的应用同样优秀。这表示泛化的模型有着非常光明的应用前景
#在训练成本高，训练集质量难以保证的现在，再加上美国对于我国的封锁，优秀的泛化模型能够为中国大模型的发展
#带来一丝曙光