In [None]:
!pip install deepspeed --quiet
!pip install mpi4py --quiet

In [None]:
import gc
import json
import os
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.jit import trace
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm

import deepspeed

[2025-04-25 12:39:31,260] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [None]:
# Kaggle API credentials are read from the environment (KAGGLE_USERNAME and
# KAGGLE_KEY) so that no secret is ever stored inside the notebook.
KAGGLE_DIR = "/root/.kaggle"
KAGGLE_JSON = os.path.join(KAGGLE_DIR, "kaggle.json")

_missing_vars = [name for name in ("KAGGLE_USERNAME", "KAGGLE_KEY") if not os.environ.get(name)]
if _missing_vars:
    raise RuntimeError(
        "Set " + " and ".join(_missing_vars) + " in the environment before running this cell."
    )

os.makedirs(KAGGLE_DIR, exist_ok=True)

# Create the file with owner-only permissions from the very first byte instead
# of writing it world-readable and tightening the mode afterwards.
_kaggle_fd = os.open(KAGGLE_JSON, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(_kaggle_fd, "w") as f:
    json.dump(
        {"username": os.environ["KAGGLE_USERNAME"], "key": os.environ["KAGGLE_KEY"]},
        f,
    )

# A file that already existed keeps its previous mode, so enforce 600 explicitly.
os.chmod(KAGGLE_JSON, 0o600)

In [None]:
import kagglehub

# Fetch the latest published version of the dataset; the returned value is
# the local directory that holds the downloaded files.
ISIC_DATASET_HANDLE = "andrewmvd/isic-2019"
path = kagglehub.dataset_download(ISIC_DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/isic-2019


In [None]:
# ---- Global configuration: device, seed and training hyper-parameters ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch's global RNG (same value as the random_state used for the
# train/validation split) so that everything drawing from it is repeatable
# between runs.
SEED = 42
torch.manual_seed(SEED)

BATCH_SIZE = 32        # samples per batch for both data loaders
NUM_WORKERS = 4        # DataLoader worker processes
LR_HEAD = 1e-3         # learning rate of the classification head
LR_BACKBONE = 1e-5     # learning rate of the unfrozen backbone blocks
WEIGHT_DECAY = 1e-4    # AdamW weight decay
EPOCHS_LP = 10         # epochs of linear probing
EPOCHS_FT = 8          # epochs of partial fine-tuning
UNFREEZE_BLOCKS = 2    # number of trailing backbone blocks that get unfrozen

In [None]:
# --- Step 2: Organize Data ---
# Every location is resolved from the directory returned by
# kagglehub.dataset_download() (`path`) instead of a hard-coded cache path,
# because the download location differs between environments.
DATA_ROOT = path
img_dir = os.path.join(DATA_ROOT, "ISIC_2019_Training_Input", "ISIC_2019_Training_Input")

# Load the ground-truth table and derive the image file name of every row.
df = pd.read_csv(os.path.join(DATA_ROOT, "ISIC_2019_Training_GroundTruth.csv"))
df['image_id'] = df['image'] + '.jpg'

# Convert one-hot encoding to class labels: `dx` is the name of the class
# column holding the largest value in each row.
classes = ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC']
df['dx'] = df[classes].idxmax(axis=1)
label_to_idx = {cls: idx for idx, cls in enumerate(classes)}

In [None]:
# Hold out 20% of the rows for validation, stratified on the class label `dx`
# and with a fixed random_state so the split is the same on every run.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['dx'],
)

# --- Step 3: Define Dataset Class ---
class ISIC2019Dataset(Dataset):
    """Image/label dataset backed by the ISIC 2019 ground-truth DataFrame.

    Every row of ``df`` must provide the columns created in the data
    organisation cell: ``image_id`` (file name including the ``.jpg``
    extension) and ``dx`` (class name). Class names are mapped to integer
    indices through the module-level ``label_to_idx`` dictionary.

    Args:
        df: DataFrame with ``image_id`` and ``dx`` columns.
        img_dir: directory that contains the image files.
        transform: optional callable applied to the loaded PIL image.
    """
    def __init__(self, df, img_dir='ISIC_2019_Training_Input/ISIC_2019_Training_Input', transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.label_to_idx = label_to_idx

    def __len__(self):
        """Number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label_index)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        # `image` holds the bare id without extension and there is no `label`
        # column; the file name lives in `image_id` and the class in `dx`.
        img_path = os.path.join(self.img_dir, row['image_id'])
        image = Image.open(img_path).convert('RGB')
        label = self.label_to_idx[row['dx']]

        if self.transform:
            image = self.transform(image)

        return image, label

In [None]:
# meta_csv = "/root/.cache/kagglehub/datasets/andrewmvd/isic-2019/versions/1/ISIC_2019_Training_Metadata.csv"
# img_dir  = "/root/.cache/kagglehub/datasets/andrewmvd/isic-2019/versions/1"

# df = pd.read_csv(meta_csv)
# df['image_id'] = df['image_id'].apply(lambda x: f"{x}.jpg")
# df = df[df['dx'].notna()]

# train_df, val_df = train_test_split(
#     df, test_size=0.2, stratify=df['dx'], random_state=42
# )

In [None]:
# Per-channel normalisation constants shared by both pipelines (the widely
# used ImageNet mean / std values).
_norm_stats = dict(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])

# Training pipeline: random crop resized to 224, random horizontal flip,
# tensor conversion, normalisation.
train_tfms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(**_norm_stats),
])

# Validation pipeline: deterministic resize to 256 followed by a 224 centre
# crop, tensor conversion, normalisation.
val_tfms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(**_norm_stats),
])

In [None]:
class HAM10000Dataset(Dataset):
    """Image classification dataset driven by a DataFrame.

    Despite its name, this notebook instantiates it with the ISIC 2019
    train/validation DataFrames.

    Args:
        df: DataFrame with an ``image_id`` column (image file name) and a
            ``dx`` column (class name). Its index is reset internally.
        img_dir: directory that contains the image files.
        transform: optional callable applied to the loaded PIL image.
        classes: optional explicit, ordered list of class names. When omitted
            the sorted unique values of ``df['dx']`` are used (the original
            behaviour). Pass the same list to every split to guarantee that
            all splits share one label-index mapping even if a split happens
            to miss a class.

    Raises:
        ValueError: if ``classes`` is given and ``df['dx']`` contains a label
            that is not part of it.
    """
    def __init__(self, df, img_dir, transform=None, classes=None):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        if classes is None:
            self.classes = sorted(self.df['dx'].unique())
        else:
            self.classes = list(classes)
            unknown = set(self.df['dx'].unique()) - set(self.classes)
            if unknown:
                raise ValueError(f"labels not present in `classes`: {sorted(unknown)}")
        self.class_to_idx = {c:i for i,c in enumerate(self.classes)}

    def __len__(self):
        """Number of rows (samples) in the DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label_index)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        path = os.path.join(self.img_dir, row['image_id'])
        img = Image.open(path).convert("RGB")
        if self.transform:
            img = self.transform(img)
        label = self.class_to_idx[row['dx']]
        return img, label

In [None]:
t_train = HAM10000Dataset(train_df, img_dir, transform=train_tfms)
t_val   = HAM10000Dataset(val_df,   img_dir, transform=val_tfms)

# Each dataset derives its label indices from the classes found in its own
# split. If the two lists differed, the same index would denote different
# classes in training and validation, so fail loudly instead of silently
# reporting a meaningless accuracy.
if t_train.classes != t_val.classes:
    raise ValueError(
        "train/val splits expose different class lists "
        f"({t_train.classes} vs {t_val.classes}); label indices would not match"
    )

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(t_train, batch_size=BATCH_SIZE, shuffle=True,  num_workers=NUM_WORKERS)
val_loader   = DataLoader(t_val,   batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
NUM_CLASSES = len(t_train.classes)



In [None]:
# Load the DINOv2 ViT-B/14 backbone through torch.hub and move it to the
# target device.
backbone = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")
backbone = backbone.to(device)
feat_dim, num_blocks = backbone.embed_dim, len(backbone.blocks)

# Linear classification head mapping backbone features to class logits.
head = nn.Linear(feat_dim, NUM_CLASSES).to(device)

class DinoClassifier(nn.Module):
    """Backbone followed by a classification head.

    ``forward`` feeds the input through ``backbone``, passes the result
    through ``backbone.norm`` and finally through ``head``.

    NOTE(review): if the backbone's own forward already returns normalised
    features, ``backbone.norm`` is applied a second time here — confirm that
    this is intended.
    """
    def __init__(self, backbone, head):
        super().__init__()
        self.backbone, self.head = backbone, head

    def forward(self, x):
        normed = self.backbone.norm(self.backbone(x))
        return self.head(normed)


Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main


In [None]:
def train_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader``.

    Puts ``model`` in train mode, performs one optimizer step per batch and
    returns the accumulated ``loss * batch_size`` divided by the number of
    samples in ``loader.dataset``.
    """
    model.train()
    running_loss = 0
    progress = tqdm(loader, desc="Train", leave=False)
    for inputs, targets in progress:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()
        # Weight each batch's loss value by its size before accumulating.
        running_loss += batch_loss.item() * inputs.size(0)
    return running_loss / len(loader.dataset)

@torch.no_grad()
def evaluate(model, loader, device):
    """Return the top-1 accuracy of ``model`` over ``loader``.

    The model is switched to eval mode; the number of correct argmax
    predictions is divided by ``len(loader.dataset)``.
    """
    model.eval()
    hits = 0
    for inputs, targets in tqdm(loader, desc="Eval", leave=False):
        inputs = inputs.to(device)
        targets = targets.to(device)
        predicted = model(inputs).argmax(dim=1)
        hits += predicted.eq(targets).sum().item()
    return hits / len(loader.dataset)

def measure_inference_time(model, device, runs=100):
    """Measure the mean forward latency for a single 1x3x224x224 input.

    The model is switched to eval mode and run under ``torch.no_grad()`` so
    that no autograd graph is built while timing inference. Ten warm-up
    forward passes precede the timed ones. On CUDA devices timing uses CUDA
    events; on any other device it falls back to ``time.perf_counter``.

    Args:
        model: callable module taking a ``(1, 3, 224, 224)`` float tensor.
        device: torch device (or device string) the example is placed on.
        runs: number of timed forward passes; must be positive.

    Returns:
        Mean latency per forward pass in milliseconds.

    Raises:
        ValueError: if ``runs`` is not positive.
    """
    if runs <= 0:
        raise ValueError("runs must be a positive integer")

    model.eval()
    example = torch.randn(1,3,224,224).to(device)
    on_cuda = torch.device(device).type == "cuda"

    with torch.no_grad():
        # Warm-up so one-off initialisation costs are not part of the timing.
        for _ in range(10):
            model(example)

        if on_cuda:
            torch.cuda.synchronize()
            start = torch.cuda.Event(enable_timing=True)
            end   = torch.cuda.Event(enable_timing=True)
            start.record()
            for _ in range(runs):
                model(example)
            end.record()
            # Wait for the queued kernels so elapsed_time() is valid.
            torch.cuda.synchronize()
            return start.elapsed_time(end) / runs

        # CPU (or other non-CUDA) fallback: wall-clock timing.
        t0 = time.perf_counter()
        for _ in range(runs):
            model(example)
        return (time.perf_counter() - t0) * 1000.0 / runs

In [None]:
# ========== Linear Probing ==========
# Stage 1: train only the linear head on top of a completely frozen backbone.
for p in backbone.parameters(): p.requires_grad = False
for p in head.parameters():    p.requires_grad = True
# model_lp wraps the very same `backbone` and `head` objects (no copy is made),
# so any later training of those modules also changes model_lp.
model_lp = DinoClassifier(backbone, head).to(device)
# Only the head's parameters are handed to the optimizer.
optimizer_lp = optim.AdamW(head.parameters(), lr=LR_HEAD, weight_decay=WEIGHT_DECAY)
criterion = nn.CrossEntropyLoss()
print("=== Linear Probing Training ===")
for epoch in range(EPOCHS_LP):
    loss = train_epoch(model_lp, train_loader, optimizer_lp, criterion, device)
    acc  = evaluate(model_lp, val_loader, device)
    print(f"Epoch {epoch+1}/{EPOCHS_LP} — loss: {loss:.4f}, val_acc: {acc:.4%}")
# Numbers recorded for the final comparison table: validation accuracy, mean
# single-image latency and the on-disk size of the traced TorchScript export
# (bytes / 1e6).
acc_lp  = evaluate(model_lp, val_loader, device)
time_lp = measure_inference_time(model_lp, device)
torch.jit.trace(model_lp.eval(), torch.randn(1,3,224,224).to(device)).save("model_lp.ts")
size_lp = os.path.getsize("model_lp.ts")/1e6

=== Linear Probing Training ===




Epoch 1/10 — loss: 0.6867, val_acc: 77.1660%




Epoch 2/10 — loss: 0.6676, val_acc: 74.5214%




Epoch 3/10 — loss: 0.6583, val_acc: 76.5739%




Epoch 4/10 — loss: 0.6584, val_acc: 76.8305%




Epoch 5/10 — loss: 0.6540, val_acc: 76.8502%




Epoch 6/10 — loss: 0.6519, val_acc: 76.6923%




Epoch 7/10 — loss: 0.6386, val_acc: 77.2647%




Epoch 8/10 — loss: 0.6395, val_acc: 77.5015%




Epoch 9/10 — loss: 0.6427, val_acc: 76.6923%




Epoch 10/10 — loss: 0.6276, val_acc: 77.5607%




In [None]:
# ========== Partial Fine-tuning ==========
# Stage 2: keep the backbone frozen except for its last UNFREEZE_BLOCKS blocks,
# and keep training the head.
for p in backbone.parameters():  p.requires_grad = False
for blk in backbone.blocks[-UNFREEZE_BLOCKS:]:
    for p in blk.parameters(): p.requires_grad = True
for p in head.parameters(): p.requires_grad = True
# Two parameter groups so the head and the unfrozen backbone blocks can use
# different learning rates.
params_ft = [
    {"params": head.parameters(), "lr": LR_HEAD},
    {"params": [p for p in backbone.parameters() if p.requires_grad], "lr": LR_BACKBONE},
]
optimizer_ft = optim.AdamW(params_ft, weight_decay=WEIGHT_DECAY)
print("=== Partial Fine-tuning ===")
# Training continues on `model_lp` (the object created for linear probing), so
# from here on model_lp holds the fine-tuned weights.
for epoch in range(EPOCHS_FT):
    loss = train_epoch(model_lp, train_loader, optimizer_ft, criterion, device)
    acc  = evaluate(model_lp, val_loader, device)
    print(f"Epoch {epoch+1}/{EPOCHS_FT} — loss: {loss:.4f}, val_acc: {acc:.4%}")
# Numbers recorded for the final comparison table: validation accuracy, mean
# single-image latency and the on-disk size of the traced TorchScript export
# (bytes / 1e6).
acc_ft  = evaluate(model_lp, val_loader, device)
time_ft = measure_inference_time(model_lp, device)
ts_mod = torch.jit.trace(model_lp.eval(), torch.randn(1,3,224,224).to(device))
ts_mod.save("model_ft.ts")
size_ft = os.path.getsize("model_ft.ts")/1e6

=== Partial Fine-tuning ===




Epoch 1/8 — loss: 0.7303, val_acc: 78.1330%




Epoch 2/8 — loss: 0.6468, val_acc: 77.3239%




Epoch 3/8 — loss: 0.5717, val_acc: 77.3041%




Epoch 4/8 — loss: 0.5396, val_acc: 81.2710%




Epoch 5/8 — loss: 0.4784, val_acc: 81.8038%




Epoch 6/8 — loss: 0.4516, val_acc: 77.6791%




Epoch 7/8 — loss: 0.4162, val_acc: 83.5800%




Epoch 8/8 — loss: 0.3763, val_acc: 83.3827%




In [None]:
@torch.no_grad()
def evaluate_half(model, loader, device):
    """Return the top-1 accuracy of a half-precision ``model`` over ``loader``.

    Mirrors ``evaluate`` but casts every input batch to float16 before the
    forward pass. Runs without gradient tracking, so no autograd graph is
    built or kept alive during evaluation.

    Args:
        model: module that accepts float16 inputs.
        loader: iterable of ``(inputs, labels)`` batches exposing ``.dataset``.
        device: device the batches are moved to.

    Returns:
        Number of correct argmax predictions divided by ``len(loader.dataset)``.
    """
    model.eval()
    correct = 0
    for x, y in loader:
        x = x.to(device).half()       # key point: the inputs must be cast to half as well
        y = y.to(device)
        pred = model(x).argmax(dim=1)
        correct += (pred == y).sum().item()
    return correct / len(loader.dataset)

def measure_inference_time_half(model, device, runs=100):
    """Measure the mean forward latency for one float16 1x3x224x224 input.

    Half-precision counterpart of ``measure_inference_time``. The model is
    switched to eval mode and run under ``torch.no_grad()`` so that no
    autograd graph is built while timing inference. Ten warm-up forward
    passes precede the timed ones. On CUDA devices timing uses CUDA events;
    on any other device it falls back to ``time.perf_counter``.

    Args:
        model: callable module taking a ``(1, 3, 224, 224)`` float16 tensor.
        device: torch device (or device string) the example is created on.
        runs: number of timed forward passes; must be positive.

    Returns:
        Mean latency per forward pass in milliseconds.

    Raises:
        ValueError: if ``runs`` is not positive.
    """
    if runs <= 0:
        raise ValueError("runs must be a positive integer")

    model.eval()
    # Sample in float32 and cast afterwards, so the example can be created on
    # devices where random sampling is not implemented for float16.
    example = torch.randn(1,3,224,224, device=device).half()
    on_cuda = torch.device(device).type == "cuda"

    with torch.no_grad():
        # Warm-up so one-off initialisation costs are not part of the timing.
        for _ in range(10):
            model(example)

        if on_cuda:
            torch.cuda.synchronize()
            start = torch.cuda.Event(enable_timing=True)
            end   = torch.cuda.Event(enable_timing=True)
            start.record()
            for _ in range(runs):
                model(example)
            end.record()
            # Wait for the queued kernels so elapsed_time() is valid.
            torch.cuda.synchronize()
            return start.elapsed_time(end) / runs

        # CPU (or other non-CUDA) fallback: wall-clock timing.
        t0 = time.perf_counter()
        for _ in range(runs):
            model(example)
        return (time.perf_counter() - t0) * 1000.0 / runs

In [None]:
# 1-2) Wrap the fine-tuned model in a DeepSpeed inference engine running in float16.
ds_model_16qt = deepspeed.init_inference(
    model_lp,
    dtype=torch.float16,             # convert to float16
    replace_method='auto',           # automatically locate the layers that can be replaced
    replace_with_kernel_inject=True  # inject DeepSpeed's high-performance kernels
)

# 3) Move to the GPU and switch to eval mode
ds_model_16qt.cuda().eval()

# 4) Trace the engine with a float16 example and save the TorchScript export
example = torch.randn(1, 3, 224, 224, device=device, dtype=torch.half)
ds_mod_16qt_trt = torch.jit.trace(ds_model_16qt, example)

torch.jit.save(ds_mod_16qt_trt, "model_ds_float16.ts")


ds_acc_16qt = evaluate_half(ds_model_16qt, val_loader, device)

# measure_inference_time_half is defined once, in the helper cell right before
# this one; the verbatim second definition that used to live here was removed
# so that a single implementation is in effect.
ds_time_16qt = measure_inference_time_half(ds_model_16qt, device)
ds_size_16qt = os.path.getsize("model_ds_float16.ts")/1e6

print("\n=== Deepspeed float16 Quantization Results ===")
print(f"Accuracy: {ds_acc_16qt:.4%}, Inference Time: {ds_time_16qt:.2f}ms, Model Size: {ds_size_16qt:.2f}MB")

[2025-04-25 15:22:52,430] [INFO] [logging.py:107:log_dist] [Rank -1] DeepSpeed info: version=0.16.7, git-hash=unknown, git-branch=unknown
[2025-04-25 15:22:52,467] [INFO] [logging.py:107:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1


Tensor-likes are not close!

Mismatched elements: 5 / 8 (62.5%)
Greatest absolute difference: 0.0078125 at index (0, 3) (up to 1e-05 allowed)
Greatest relative difference: 0.0015552099533437014 at index (0, 3) (up to 1e-05 allowed)
  _check_trace(



=== Deepspeed float16 Quantization Results ===
Accuracy: 83.3432%, Inference Time: 9.84ms, Model Size: 173.63MB


In [None]:
# ========== DeepSpeed fp16 partial fine-tuning ==========
# Release cached GPU memory and unreachable Python objects left over from the
# previous stages before the training engine is built.
torch.cuda.empty_cache()
gc.collect()

# 1) Freeze entire backbone
for p in backbone.parameters():
    p.requires_grad = False

# 2) Unfreeze last UNFREEZE_BLOCKS transformer blocks
for blk in backbone.blocks[-UNFREEZE_BLOCKS:]:
    for p in blk.parameters():
        p.requires_grad = True

# 3) Unfreeze the classification head
for p in head.parameters():
    p.requires_grad = True

# Parameter groups with per-group learning rates. DeepSpeed creates the actual
# optimizer itself from the "optimizer" entry of the config below, so no
# torch optimizer is built here. The head parameters are materialised as a
# list so the group can be iterated more than once.
params_ft = [
    {"params": list(head.parameters()),                              "lr": LR_HEAD},
    {"params": [p for p in backbone.parameters() if p.requires_grad], "lr": LR_BACKBONE},
]

# Separate file for this stage so the fp32 export "model_ft.ts" written by the
# plain partial fine-tuning cell is not overwritten.
DS_FT_EXPORT = "model_ds_ft_fp16.ts"

# Drop the reference to an engine from a previous run of this cell (if any)
# before a new one is created.
model_engine = None
model_engine, optimizer_engine, _, _ = deepspeed.initialize(
    args=None,
    model= model_lp,                             # your nn.Module wrapper of backbone+head
    model_parameters=params_ft,
    config_params={
        "train_batch_size": BATCH_SIZE,
        "fp16": {"enabled": True},               # if you want mixed-precision
        "optimizer": {
            "type": "AdamW",
            "params": {"weight_decay": WEIGHT_DECAY}
        },
        "timers": {"enabled": False},
    }
)
for epoch in range(EPOCHS_FT):
    model_engine.train()
    total_loss = 0.0

    for x, y in train_loader:
        x, y = x.to(device).half(), y.to(device)
        model_engine.zero_grad()
        logits = model_engine(x)            # same as model_lp(x) under the hood
        loss = criterion(logits, y)
        model_engine.backward(loss)         # DeepSpeed-aware backward
        model_engine.step()                 # DeepSpeed step()
        total_loss += loss.item() * x.size(0)

    val_acc = evaluate_half(model_engine, val_loader, device)
    print(f"Epoch {epoch+1}/{EPOCHS_FT} — loss: {total_loss/len(train_loader.dataset):.4f}, val_acc: {val_acc:.4%}")

model_engine.eval()
# Example input must match the dtype you trained with (e.g., fp16 if fp16 enabled)
example = torch.randn(1, 3, 224, 224, device=device, dtype=torch.half)
ts_mod = torch.jit.trace(model_engine, example)  # TorchScript module generation
ts_mod.save(DS_FT_EXPORT)

# Numbers recorded for the final comparison table.
ds_acc_16qt_ft = evaluate_half(model_engine, val_loader, device)
ds_time_16qt_ft = measure_inference_time_half(model_engine, device)
ds_size_16qt_ft = os.path.getsize(DS_FT_EXPORT) / 1e6

print("\n=== Partial Fine-Tuning Results ===")
print(f"Accuracy:      {ds_acc_16qt_ft:.4%}")
print(f"Latency:       {ds_time_16qt_ft:.2f} ms")
print(f"Model Size:    {ds_size_16qt_ft:.2f} MB")

[2025-04-25 15:24:19,518] [INFO] [logging.py:107:log_dist] [Rank -1] DeepSpeed info: version=0.16.7, git-hash=unknown, git-branch=unknown
[2025-04-25 15:24:19,519] [INFO] [comm.py:669:init_distributed] cdb=None
[2025-04-25 15:24:19,520] [INFO] [comm.py:684:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-04-25 15:24:20,253] [INFO] [comm.py:739:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.28.0.12, master_port=29500
[2025-04-25 15:24:20,254] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-04-25 15:24:20,311] [INFO] [config.py:735:__init__] Config mesh_device None world_size = 1
[2025-04-25 15:24:20,724] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False


Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


Time to load fused_adam op: 0.15436458587646484 seconds
[2025-04-25 15:24:20,886] [INFO] [logging.py:107:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
[2025-04-25 15:24:20,889] [INFO] [logging.py:107:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2025-04-25 15:24:20,891] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
[2025-04-25 15:24:20,892] [INFO] [logging.py:107:log_dist] [Rank 0] Creating fp16 optimizer with dynamic loss scale
[2025-04-25 15:24:20,903] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed Final Optimizer = FP16_Optimizer
[2025-04-25 15:24:20,903] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = None
[2025-04-25 15:24:20,905] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
[2025-04-25 15:24:20,906] [INFO] [logging.py:107:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001, 1e-05], mom=[(0.9, 0.999), (0.9, 0.99

Loading extension module fused_adam...


[2025-04-25 15:24:20,936] [INFO] [config.py:1007:print]   eigenvalue_verbose ........... False
[2025-04-25 15:24:20,937] [INFO] [config.py:1007:print]   elasticity_enabled ........... False
[2025-04-25 15:24:20,937] [INFO] [config.py:1007:print]   flops_profiler_config ........ {
    "enabled": false, 
    "recompute_fwd_factor": 0.0, 
    "profile_step": 1, 
    "module_depth": -1, 
    "top_modules": 1, 
    "detailed": true, 
    "output_file": null
}
[2025-04-25 15:24:20,938] [INFO] [config.py:1007:print]   fp16_auto_cast ............... False
[2025-04-25 15:24:20,938] [INFO] [config.py:1007:print]   fp16_enabled ................. True
[2025-04-25 15:24:20,939] [INFO] [config.py:1007:print]   fp16_master_weights_and_gradients  False
[2025-04-25 15:24:20,939] [INFO] [config.py:1007:print]   global_rank .................. 0
[2025-04-25 15:24:20,940] [INFO] [config.py:1007:print]   grad_accum_dtype ............. None
[2025-04-25 15:24:20,940] [INFO] [config.py:1007:print]   gradient_a



[2025-04-25 15:24:23,288] [INFO] [fused_optimizer.py:392:_update_scale] 
Grad overflow on iteration 0
[2025-04-25 15:24:23,291] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0
[2025-04-25 15:24:23,299] [INFO] [logging.py:107:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 65536, reducing to 32768.0
[2025-04-25 15:24:23,512] [INFO] [fused_optimizer.py:392:_update_scale] 
Grad overflow on iteration 1
[2025-04-25 15:24:23,518] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0
[2025-04-25 15:24:23,521] [INFO] [logging.py:107:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0
Epoch 1/8 — loss: 0.3417, val_acc: 82.6722%
[2025-04-25 15:33:53,222] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 1000 iterations
[2025-04-25 15:33:53,240] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic lo

  assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
  assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
  if npatch == N and w == h:
  M = int(math.sqrt(N))  # Recover the number of patches in each dimension
  assert N == M * M
  sx = float(w0 + self.interpolate_offset) / M
  sy = float(h0 + self.interpolate_offset) / M
  assert (w0, h0) == patch_pos_embed.shape[-2:]
Tensor-likes are not close!

Mismatched elements: 6 / 8 (75.0%)
Greatest absolute difference: 0.0078125 at index (0, 3) (up to 1e-05 allowed)
Greatest relative difference: 0.0019120458891013384 at index (0, 2) (up to 1e-05 allowed)
  _check_trace(



=== Partial Fine-Tuning Results ===
Accuracy:      84.2510%
Latency:       11.96 ms
Model Size:    173.62 MB


In [None]:
def _print_result_row(scheme, top1, latency_ms, size_mb):
    """Print one line of the comparison table (accuracy shown as a percentage)."""
    print(f"{scheme:<40} {top1*100:>9.2f}%   {latency_ms:>10.2f}   {size_mb:>8.2f}")

# Table header
print("\n=== Results Comparison ===")
print(f"{'Scheme':<40} {'Top1 Acc':>10}   {'Infer(ms)':>10}   {'Size(MB)':>8}")
print("-"*68)

# One row per training / deployment scheme
_print_result_row('1. Linear Probing', acc_lp, time_lp, size_lp)
_print_result_row('2. Linear + Partial FT', acc_ft, time_ft, size_ft)
# _print_result_row('3. + GPU INT8 Quantization', ds_acc_8qt, ds_time_8qt, ds_size_8qt)
_print_result_row('3. + GPU FLOAT16 Quantization', ds_acc_16qt, ds_time_16qt, ds_size_16qt)
_print_result_row('4. + GPU FLOAT16 Quantization FT', ds_acc_16qt_ft, ds_time_16qt_ft, ds_size_16qt_ft)


=== Results Comparison ===
Scheme                                     Top1 Acc    Infer(ms)   Size(MB)
--------------------------------------------------------------------
1. Linear Probing                            77.56%        20.40     346.79
2. Linear + Partial FT                       83.38%        20.35     346.79
3. + GPU FLOAT16 Quantization                83.34%         9.84     173.63
4. + GPU FLOAT16 Quantization FT             84.25%        11.96     173.62


In [None]:
model_lp.eval()

# The recorded run of this cell failed with
# "ValueError: Data type torch.int8 is not supported by cuda accelerator".
# Catch that case so the stage is skipped with a message instead of aborting
# a full "Run All" of the notebook.
try:
    ds_model_8qt = deepspeed.init_inference(
        model_lp,
        mp_size=1,                       # single GPU
        dtype=torch.int8,                # quantize to INT8
        replace_method='auto',           # automatically locate the layers that can be quantized
        replace_with_kernel_inject=True  # inject DeepSpeed's high-performance quantized kernels
    )
except ValueError as err:
    ds_model_8qt = None
    print(f"Skipping DeepSpeed int8 quantization: {err}")

if ds_model_8qt is not None:
    # 3) Move to the GPU
    ds_model_8qt.cuda()

    # 4) Trace the engine and save the TorchScript export
    example = torch.randn(1, 3, 224, 224, device=device)
    ds_mod_8qt_trt = torch.jit.trace(ds_model_8qt, example)

    torch.jit.save(ds_mod_8qt_trt, "model_ds_int8.ts")

    ds_acc_8qt = evaluate(ds_model_8qt, val_loader, device)
    ds_time_8qt = measure_inference_time(ds_model_8qt, device)
    ds_size_8qt = os.path.getsize("model_ds_int8.ts")/1e6

    print("\n=== Deepspeed int8 Quantization Results ===")
    print(f"Accuracy: {ds_acc_8qt:.4%}, Inference Time: {ds_time_8qt:.2f}ms, Model Size: {ds_size_8qt:.2f}MB")

[2025-04-25 16:18:26,555] [INFO] [logging.py:107:log_dist] [Rank 0] DeepSpeed info: version=0.16.7, git-hash=unknown, git-branch=unknown


ValueError: Data type torch.int8 is not supported by cuda accelerator

In [None]:
# ========== TensorRT INT8 ==========
class CalibrationDataLoader(object):
    """Iterator over at most ``max_samples`` items taken from ``dataloader``.

    Construction eagerly walks ``dataloader`` (label part of each batch is
    ignored), collecting the elements obtained by iterating every batch until
    ``max_samples`` of them are available. Iterating the object then yields
    each stored item, moved to the module-level ``device``, wrapped in a
    one-element list.

    NOTE(review): when batches are tensors, the stored items are the
    individual rows of each batch, i.e. presumably single samples without a
    batch dimension — confirm this is what the calibrator expects.
    """
    def __init__(self, dataloader, max_samples=100):
        self.dataloader = dataloader
        self.max_samples = max_samples
        self.current = 0

        collected = []
        for batch, _ in dataloader:
            collected.extend(batch)
            # Stop reading further batches once enough items are gathered.
            if len(collected) >= max_samples:
                break
        self.images = collected[:max_samples]

    def __iter__(self):
        # Restart from the first stored item on every new iteration.
        self.current = 0
        return self

    def __next__(self):
        if self.current >= len(self.images):
            raise StopIteration
        data = self.images[self.current].to(device)
        self.current += 1
        return [data]

# Collect up to 100 calibration items from the training loader.
calib_dataloader = CalibrationDataLoader(train_loader, max_samples=100)

# NOTE(review): `torch_tensorrt` is not imported in any cell visible in this
# notebook; this cell raises NameError unless it is imported beforehand.
# NOTE(review): `ts_mod` was last assigned in the DeepSpeed fine-tuning cell by
# tracing the engine with a float16 example, while the input spec below
# declares float32 — confirm which TorchScript module is meant to be compiled.
ts_mod_trt = torch_tensorrt.compile(ts_mod,
    inputs = [torch_tensorrt.Input(
        min_shape=[1, 3, 224, 224],
        opt_shape=[BATCH_SIZE, 3, 224, 224],
        max_shape=[BATCH_SIZE*2, 3, 224, 224],
        dtype=torch.float32
    )],
    enabled_precisions={torch.int8}, # enable INT8 precision
    calibrator=torch_tensorrt.ptq.DataLoaderCalibrator(
        calib_dataloader,
        cache_file="./calibration.cache",
        use_cache=False,
        algo_type=torch_tensorrt.ptq.CalibrationAlgo.ENTROPY_CALIBRATION_2,
    ),
    workspace_size=1 << 22
)

# Save the compiled module so its on-disk size can be reported.
torch.jit.save(ts_mod_trt, "model_trt_int8.ts")

# Accuracy, mean single-image latency and file size (bytes / 1e6) of the
# compiled module.
acc_qt = evaluate(ts_mod_trt, val_loader, device)
time_qt = measure_inference_time(ts_mod_trt, device)
size_qt = os.path.getsize("model_trt_int8.ts")/1e6

print("\n=== Quantization Results ===")
print(f"Accuracy: {acc_qt:.4%}, Inference Time: {time_qt:.2f}ms, Model Size: {size_qt:.2f}MB")
