基础处理

In [1]:
# Import necessary packages.
import numpy  as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision
from PIL import Image
# "ConcatDataset" and "Subset" are possibly useful when doing semi-supervised learning.
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, VisionDataset
# This is for the progress bar.
from tqdm.auto import tqdm
import random
print(torch.__version__)
# Seed every RNG this notebook can draw from so that a fresh
# "Restart Kernel -> Run All" reproduces the same results.
myseed = 6666  # set a random seed for reproducibility
# Force deterministic cuDNN kernels and disable the autotuner, which may pick
# different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# The stdlib `random` module is imported above, so seed it as well; without
# this any code path relying on it would differ between runs.
random.seed(myseed)
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

# Sanity-check print kept so the recorded cell output stays accurate.
print([0 for i in range(5)])

2.5.1+cu124
[0, 0, 0, 0, 0]


Data Augmentation

In [3]:
# Evaluation pipeline: a fixed resize with no randomness, then conversion to a
# tensor and per-channel normalisation with mean 0.5 / std 0.5.
test_tfm = transforms.Compose(
    [
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

# Training pipeline: random crop (80-100% of the image area, resized to
# 128x128), random horizontal/vertical flips and a rotation of up to 15 degrees,
# followed by the same tensor conversion and normalisation as at evaluation.
train_tfm = transforms.Compose(
    [
        transforms.RandomResizedCrop(128, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.RandomRotation(degrees=15),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

DataSet

In [4]:
class FoodDataset(Dataset):
    """Image dataset whose labels are encoded in the file names.

    Each sample is ``(transformed_image, label)``. The label is the integer
    before the first underscore of the file's base name (e.g. ``3_17.jpg``
    gives 3). Files whose prefix is not an integer get label ``-1``.

    Args:
        path: Directory scanned for ``.jpg`` files when ``files`` is None.
        files: Optional explicit list of image paths; when given, ``path`` is
            only stored and the directory is not listed.
        tfm: Callable applied to the loaded PIL image.
    """

    def __init__(self,path,files = None,tfm=test_tfm):
        super().__init__()
        self.path = path
        if files is not None:
            # An explicit file list wins; skip the directory scan entirely so
            # that `path` does not even have to exist in that case.
            self.files = files
        else:
            self.files = sorted(
                os.path.join(path, x) for x in os.listdir(path) if x.endswith(".jpg")
            )
        self.transform = tfm

    def __len__(self):
        return len(self.files)

    def __getitem__(self, index):
        """Load, transform and label the image at ``index`` (called by DataLoader)."""
        fname = self.files[index]
        # The context manager closes the file handle deterministically.
        # Converting to RGB guarantees three channels, which the 3-channel
        # Normalize in the transforms (and the model input) require even for
        # grayscale / RGBA / CMYK files.
        with Image.open(fname) as img:
            img = self.transform(img.convert("RGB"))

        # basename() instead of split("/") so the parsing is OS-independent.
        prefix = os.path.basename(fname).split("_")[0]
        try:
            label = int(prefix)
        except ValueError:
            # No numeric class prefix in the file name: mark as unlabeled.
            label = -1

        return img,label

Model

In [5]:
class Classifier(nn.Module):
    """Five-stage CNN feature extractor followed by a three-layer MLP head.

    Every stage is Conv2d(3x3, stride 1, padding 1) -> BatchNorm2d -> ReLU ->
    MaxPool2d(2, stride 2), so the channel count follows
    3 -> 64 -> 128 -> 256 -> 512 -> 512 while height and width are halved at
    each stage. The head is sized for a [batch, 3, 128, 128] input, i.e. a
    [batch, 512, 4, 4] feature map flattened to 8192 values, and produces 11
    logits.

    No softmax is applied here because ``nn.CrossEntropyLoss`` already
    includes it.
    """

    def __init__(self):
        super().__init__()

        # Consecutive pairs of this plan are the (in, out) channels per stage.
        channel_plan = (3, 64, 128, 256, 512, 512)
        stages = []
        for in_ch, out_ch in zip(channel_plan, channel_plan[1:]):
            stages.extend((
                nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ))
        # Flat Sequential keeps the layer indices cnn.0 ... cnn.19.
        self.cnn = nn.Sequential(*stages)

        head = [
            nn.Linear(512 * 4 * 4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 11),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits of shape [batch, 11] for an image batch ``x``."""
        feature_map = self.cnn(x)
        # Collapse everything except the batch dimension before the MLP head.
        flat = feature_map.view(feature_map.size(0), -1)
        return self.fc(flat)

config

In [6]:
# Mini-batch size shared by every DataLoader built in this notebook.
batch_size = 64
# Dataset root; the loader cell reads its "training", "validation" and "test"
# sub-directories.
dataset_dir = "/root/autodl-tmp/food11"
# Presumably the checkpoint file for the best model (a file path despite the
# "_dir" suffix) - not referenced by the cells shown here; TODO confirm usage.
model_save_dir = "/root/HW3/best_param.ckpt"
# Presumably the output path of the submission CSV.
# NOTE(review): "dit" looks like a typo for "dir"; renaming it requires updating
# every cell that references this name.
result_save_dit = "/root/HW3/submission.csv"
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Uncomment to force CPU execution (e.g. to get a readable stack trace when
# debugging CUDA errors).
#device = "cpu"
# Maximum number of epochs for each call to trainer().
n_epochs = 60
# Number of consecutive non-improving epochs tolerated before trainer() stops.
# NOTE(review): 300 is larger than n_epochs (60), so early stopping can never
# trigger with these values.
patience = 300
# Number of cross-validation folds iterated by the k-fold cell.
k_fold = 10

data loader

In [18]:

# Datasets and loaders for the fixed train / validation / test split.
# Only the training set gets random augmentation.
train_set = FoodDataset(os.path.join(dataset_dir, "training"), tfm=train_tfm)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
valid_set = FoodDataset(os.path.join(dataset_dir, "validation"), tfm=test_tfm)
# Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
test_set = FoodDataset(os.path.join(dataset_dir, "test"), tfm=test_tfm)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

# Training + validation images pooled together for k-fold cross-validation.
cross_dataset = ConcatDataset([train_set, valid_set])
cross_loader = DataLoader(cross_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)

# Fixed random permutation of the pooled sample indices; the k-fold cell slices
# it into folds. Re-seeding here makes the permutation independent of how much
# NumPy randomness earlier cells consumed.
np.random.seed(myseed)
dataloader_index = np.arange(len(cross_dataset))
np.random.shuffle(dataloader_index)
print(dataloader_index,len(dataloader_index))

# torch.cuda.init() raises on machines without CUDA, which would defeat the CPU
# fallback selected in the config cell, so only call it when a GPU is present.
# It is a no-op when CUDA is already initialised and does NOT reset a context
# that has hit "illegal memory access" - that requires restarting the kernel.
if torch.cuda.is_available():
    torch.cuda.init()
print(torch.cuda.is_available())

# ResNet-18 trained from scratch with its final layer resized to 11 classes.
model = torchvision.models.resnet18(weights=None)
model.fc = nn.Linear(model.fc.in_features, 11)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)
# Module-level early-stopping counters, kept for any cell that reads them.
stale, best_acc = 0, 0


[ 3617  2521  3855 ... 10335  3857 12209] 13296
True


RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [13]:
def trainer(train_loader,valid_loader,k):
    """Train the module-level ``model`` on one fold and checkpoint its best epoch.

    Reads the notebook globals ``model``, ``criterion``, ``optimizer``,
    ``device``, ``n_epochs`` and ``patience``. After each epoch the model is
    evaluated on ``valid_loader``; whenever validation accuracy improves, the
    weights are saved to ``/root/HW3/KF-SUB/parm_fold{k}.ckpt``. Training stops
    early once more than ``patience`` consecutive epochs bring no improvement.

    Args:
        train_loader: Iterable of ``(images, labels)`` training batches.
        valid_loader: Iterable of ``(images, labels)`` validation batches.
        k: Fold number, used in progress-bar labels and the checkpoint name.

    Returns:
        The best validation accuracy reached during this call (0 if no epoch
        scored above zero).
    """
    # Early-stopping state must be local: assigning to these names inside the
    # function without initialising them here raises UnboundLocalError, and
    # each fold has to start from a clean slate anyway.
    best_acc, stale = 0, 0
    ckpt_path = f"/root/HW3/KF-SUB/parm_fold{k}.ckpt"

    for epoch in range(n_epochs):
        # ---------------- training ----------------
        model.train()
        train_loss_sum, train_hits, train_seen = 0.0, 0, 0
        train_pbar = tqdm(train_loader, position=0, leave=True, desc=f"Train Pro|Fold{k}| Epoch {epoch+1}/{n_epochs}")

        for imgs, labels in train_pbar:
            imgs, labels = imgs.to(device), labels.to(device)

            logits = model(imgs)
            loss = criterion(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm to stabilise training.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
            optimizer.step()

            n = labels.size(0)
            hits = (logits.argmax(dim=-1) == labels).sum().item()
            # Accumulate per-sample totals so a short final batch is not
            # over-weighted (assumes `criterion` returns the batch mean, as
            # nn.CrossEntropyLoss does by default).
            train_loss_sum += loss.item() * n
            train_hits += hits
            train_seen += n

            train_pbar.set_postfix({'train loss': loss.item(), 'train acc': hits / n})

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        train_loss_mean = train_loss_sum / max(train_seen, 1)
        train_accs_mean = train_hits / max(train_seen, 1)

        # ---------------- validation ----------------
        model.eval()
        valid_loss_sum, valid_hits, valid_seen = 0.0, 0, 0
        valid_pbar = tqdm(valid_loader, position=0, leave=True, desc=f"Valid Pro|Fold{k}| Epoch {epoch+1}/{n_epochs}")

        for imgs, labels in valid_pbar:
            imgs, labels = imgs.to(device), labels.to(device)
            with torch.no_grad():
                logits = model(imgs)
                loss = criterion(logits, labels)

            n = labels.size(0)
            hits = (logits.argmax(dim=-1) == labels).sum().item()
            valid_loss_sum += loss.item() * n
            valid_hits += hits
            valid_seen += n

            valid_pbar.set_postfix({'valid loss': loss.item(), 'valid acc': hits / n})

        valid_loss_mean = valid_loss_sum / max(valid_seen, 1)
        valid_accs_mean = valid_hits / max(valid_seen, 1)

        print(f'[Epoch:{epoch+1:03d}/{n_epochs:03d}] train:loss={train_loss_mean:.5f},acc={train_accs_mean:.5f}|valid:loss={valid_loss_mean:.5f},acc={valid_accs_mean:.5f}')

        # ---------------- checkpoint / early stop ----------------
        if valid_accs_mean > best_acc:
            best_acc = valid_accs_mean
            print(f'Best model found at epoch:{epoch+1}, saving model')
            # torch.save does not create missing parent directories.
            os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
            torch.save(model.state_dict(), ckpt_path)
            stale = 0
        else:
            stale += 1
            if stale > patience:
                print("Training convergence")
                break

    return best_acc

In [14]:
# K-fold cross-validation over the pooled training + validation images.
#
# Two views over the SAME file list, in the same order as `cross_dataset`
# (training files first, then validation files), so `dataloader_index` addresses
# identical images in both. Choosing the transform by fold role fixes the
# original behaviour where a validation fold was scored on randomly augmented
# images (and validation-folder images were never augmented during training).
fold_files = list(train_set.files) + list(valid_set.files)
assert len(fold_files) == len(dataloader_index), "index permutation does not match the pooled file list"
fold_root = os.path.join(dataset_dir, "training")
cross_train_view = FoodDataset(fold_root, files=fold_files, tfm=train_tfm)
cross_valid_view = FoodDataset(fold_root, files=fold_files, tfm=test_tfm)

# Fold size by ceiling division instead of the hard-coded 1330 / 13296; for
# 13296 samples and 10 folds this yields the same 1330-sample folds as before.
n_samples = len(dataloader_index)
fold_size = -(-n_samples // k_fold)

for k in range(k_fold):
    # Slice [start, end) of the permutation is this fold's validation set.
    start = k * fold_size
    end = min(start + fold_size, n_samples)
    valid_index = dataloader_index[start:end]
    train_index = np.concatenate((dataloader_index[:start], dataloader_index[end:]))
    cross_valid_set = Subset(cross_valid_view, valid_index)
    cross_train_set = Subset(cross_train_view, train_index)
    cross_train_loader = DataLoader(cross_train_set, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
    cross_valid_loader = DataLoader(cross_valid_set, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

    # Start every fold from a freshly initialised network and optimizer.
    # Reusing the previous fold's weights would leak that fold's training data
    # (which contains this fold's validation samples) into the evaluation.
    # trainer() reads these module-level names at call time. Keep the
    # architecture and hyper-parameters in sync with the loader cell.
    model = torchvision.models.resnet18(weights=None)
    model.fc = nn.Linear(model.fc.in_features, 11)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)

    trainer(cross_train_loader,cross_valid_loader,k)



Train Pro|Fold0| Epoch 1/60:   0%|          | 0/187 [00:00<?, ?it/s]

RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
