# 导入文件名


In [None]:
import os
import sys
import numpy as np
import time
import copy
import matplotlib.pyplot as plt
%matplotlib inline 

#from skimage.io import imread
from PIL import Image
import torch, torchvision
from torch.utils.data import Dataset,DataLoader
from torchvision.transforms import Compose
from torchvision import transforms
from torchvision.models import resnet50,resnet18
from torchvision.datasets import ImageFolder
from torch.nn import functional as F

# Use the GPU when CUDA is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Dataset locations: only the root folders of the two splits are needed.
_MONKEY_ROOT = '/kaggle/input/10-monkey-species'
training_data_path = f'{_MONKEY_ROOT}/training/training/'
valid_data_path = f'{_MONKEY_ROOT}/validation/validation/'

In [None]:
# Write the annotation text files used by the custom Dataset variant.
def _write_annotation_file(data_root, class_names, out_path):
    """Write one ``<class_dir>/<filename> <class_index>`` line per image.

    Args:
        data_root: Directory containing one sub-directory per class.
        class_names: Class directory names; the position in this sequence
            is the integer label written to the file.
        out_path: Path of the text file to create (overwritten if present).
    """
    with open(out_path, 'w') as f:
        for class_index, class_name in enumerate(class_names):
            class_dir = os.path.join(data_root, class_name)
            # os.listdir returns entries in arbitrary order; sort them so the
            # generated file is reproducible between runs.
            for filename in sorted(os.listdir(class_dir)):
                f.write(f'{class_name}/{filename} {class_index}\n')


# Class names are the sub-directory names of the training split; sorting
# them fixes the class -> index mapping shared by both files.
label = sorted(os.listdir(training_data_path))
_write_annotation_file(training_data_path, label, 'train.txt')
_write_annotation_file(valid_data_path, label, 'valid.txt')
# Next cell: a custom Dataset class (class names may optionally be included).

In [None]:
# Disabled alternative data pipeline. Everything between the triple quotes is
# a plain string literal, so none of it is executed: the Monkey dataset class,
# the `mean` / `std` lists and the two DataLoaders below are NOT defined for
# the cells that follow.
# The text sketches a custom Dataset that reads "<file> <label>" lines from an
# annotation file (train.txt / valid.txt) and builds train/valid loaders.
# NOTE(review): before re-enabling, check that
#   * `data_root` names a "garbage-classification" folder while every other
#     path in this notebook points at the monkey-species dataset;
#   * `mean` / `std` hold values far above 1 although the pipelines apply
#     ToTensor() before Normalize() -- presumably they need rescaling; confirm;
#   * the annotation file opened with open() is never closed.
'''
data_root='/kaggle/input/garbage-classification'
class Monkey(Dataset):
    CLASSES={'n0': 0, 'n1': 1,'n2': 2,'n3': 3, 'n4': 4,'n5': 5,'n6': 6,'n7': 7, 'n8': 8,'n9': 9}
    def __init__(self,data_root,ann_file,pipeline=None,classes=None):
        #数据名列表
        super().__init__()
        self.data_infos=[]
        self.labels=[]
        self.data_root=data_root
        self.pipeline=Compose(pipeline)
        #用到文本文件
        f=open(ann_file,'r')
        for line in f:
            filename,label=line.strip().split(' ')
            item = {'file':filename,'label':label}
            self.data_infos.append(item)
        self._len=len(self.data_infos)
    def __getitem__(self,index):
        info=self.data_infos[index]
        img_path=os.path.join(self.data_root,info['file'])
        img=Image.open(img_path)
        if self.pipeline:
            img=self.pipeline(img)     
        return img,int(info['label'])
    def __len__(self):
        return self._len  
#或者直接调用简单的分类数据集函数
mean = [110.508858, 109.552668,84.623747]
std = [67.212821,66.229520, 66.544232]

#训练集，使用ImageFolder可以更快速的构建分类数据集。数据集的分布只要按要求写好，再给定路径和增益方法即可
traindata=Monkey('/kaggle/input/10-monkey-species/training/training/',
              '/kaggle/working/train.txt',pipeline=[transforms.Resize(256),
      transforms.RandomCrop(224), transforms.RandomHorizontalFlip(),
         transforms.ToTensor(),transforms.Normalize(mean=mean,std=std)])#增益方法
trainloader= torch.utils.data.DataLoader(traindata, shuffle=True,
           num_workers=2,batch_size=64,pin_memory=True)
#验证集
validdata=Monkey('/kaggle/input/10-monkey-species/validation/validation/',
              '/kaggle/working/valid.txt',pipeline=[transforms.Resize((224,224),),
         transforms.ToTensor(),transforms.Normalize(mean=mean,std=std) ])
validloader=torch.utils.data.DataLoader(validdata,batch_size=64,shuffle=False)
'''  

In [None]:
# Custom augmentation transform
class Mytransform:
    """Resize an image so that its shorter side equals a fixed length.

    The aspect ratio is kept: the longer side is divided by the same scale
    factor and truncated to an integer.

    The original author measured roughly 2 s per 128 images for this
    transform. Image dimensions are now read from ``PIL.Image.size`` instead
    of converting every image to a NumPy array first.

    Args:
        shot_size: Target length of the shorter side, in pixels (> 0).

    Raises:
        ValueError: If ``shot_size`` is not positive.
    """

    def __init__(self, shot_size):
        if shot_size <= 0:
            raise ValueError(f'shot_size must be positive, got {shot_size!r}')
        self.short = shot_size

    def _target_size(self, h, w):
        """Return the ``(height, width)`` to resize an ``h`` x ``w`` image to."""
        if h <= w:
            # Height is the shorter (or equal) side.
            scale = h / self.short
            return self.short, int(w / scale)
        scale = w / self.short
        return int(h / scale), self.short

    def __call__(self, data):
        """Resize ``data`` (a single image) and return the resized image."""
        if isinstance(data, Image.Image):
            # PIL reports (width, height); no pixel data is copied.
            w, h = data.size
        else:
            # Fallback for array-like input; assumes an (H, W[, C]) layout.
            h, w = np.array(data).shape[:2]
        # Resize uses its default interpolation mode.
        return transforms.Resize(self._target_size(h, w))(data)

# Per-channel RGB normalisation statistics. The raw figures are on the 0-255
# pixel scale (presumably measured on this dataset -- verify). ToTensor()
# rescales pixels to [0, 1] before Normalize() runs, so the statistics are
# divided by 255 to match. They are defined here because the only other
# assignment in this notebook sits inside a disabled (string-literal) cell.
mean = [110.508858 / 255.0, 109.552668 / 255.0, 84.623747 / 255.0]
std = [67.212821 / 255.0, 66.229520 / 255.0, 66.544232 / 255.0]

# Training-time augmentation pipeline.
Augment = Compose([
    Mytransform(256),
    # Everything up to here works on PIL images; mixup is done in training.
    transforms.RandomCrop(224),
    # Scales pixel values to [0, 1] and moves channels first.
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# Training set. ImageFolder builds a classification dataset quickly: with one
# sub-folder per class, only the root path and the transform are needed.
traindata = ImageFolder(training_data_path,  # dataset root
                        transform=Augment)   # augmentation pipeline
trainloader = torch.utils.data.DataLoader(traindata,
                                          shuffle=True,
                                          num_workers=4,
                                          batch_size=64,
                                          pin_memory=True)

# Validation set: deterministic resize + centre crop, same normalisation.
validdata = ImageFolder(valid_data_path,
                        transform=Compose([
                            transforms.Resize(256),
                            transforms.CenterCrop(224),
                            transforms.ToTensor(),
                            transforms.Normalize(mean=mean, std=std),
                        ]))
validloader = torch.utils.data.DataLoader(validdata,
                                          shuffle=False,
                                          batch_size=64)

In [None]:
# MODEL
# Enable cuDNN benchmark mode (original note: for inputs whose size does not
# change; commonly used).
torch.backends.cudnn.benchmark = True

# Run on the first GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Pretrained backbone (original note: its nn.Conv2d layers use bias=False).
model = resnet18(pretrained=True)

# Swap the classification head for a 10-way linear layer.
num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(in_features=num_ftrs, out_features=10)

model = model.to(device)

In [None]:
# Training setup
# Loss function
criterion = torch.nn.CrossEntropyLoss()

# Optimizer and learning-rate schedule.
# SGD + StepLR alternative, kept for reference:
#optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
#exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# "Super acceleration" (original author's label): AdamW with cosine
# annealing and warm restarts.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=0.001,
    weight_decay=1e-4,
    amsgrad=False,
)
exp_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer=optimizer,
    T_0=10,
)


In [8]:
# Manual training loop: trains for `epochs` epochs and evaluates on the
# validation set every third epoch. Losses are reported as per-sample
# averages, the same convention train_model() uses.
epochs = 30
for epoch in range(epochs):
    epoch_loss = 0.0   # sum of per-sample training losses
    epoch_count = 0    # number of correctly classified training samples
    model.train()
    print(f'Epoch {epoch}/{epochs}')
    print('----------------------------')

    start_time = time.time()

    # `target` (not `label`) so the class-name list `label` is not clobbered.
    for img, target in trainloader:
        img = img.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        prediction = model(img)
        loss = criterion(prediction, target)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight by batch size so the last
        # (possibly smaller) batch is accounted for correctly.
        epoch_loss += loss.item() * img.size(0)
        epoch_count += (prediction.argmax(dim=1) == target).sum().item()

    # Step the schedule once per epoch so that T_0=10 means "restart every
    # 10 epochs"; stepping per batch would restart every 10 batches.
    exp_lr_scheduler.step()

    one_epoch_time = time.time() - start_time

    if (epoch + 1) % 3 == 0:
        model.eval()

        eval_loss = 0.0      # sum of per-sample validation losses
        eval_accuracy = 0.0  # number of correctly classified samples

        # No gradients are needed anywhere in evaluation.
        with torch.no_grad():
            for image, target in validloader:
                image = image.to(device)
                target = target.to(device)

                pred = model(image)
                loss = criterion(pred, target)

                eval_accuracy += (pred.argmax(dim=1) == target).sum().item()
                eval_loss += loss.item() * image.size(0)
        print(f'valid Loss: {eval_loss / len(validdata):.4f} Acc:{eval_accuracy /len(validdata):.4f}')

        # Leave the model in training mode, also after the final epoch.
        model.train()

    print(f'one_epoch_time :{one_epoch_time:.4f},lr : {optimizer.param_groups[0]["lr"]},train Loss: {epoch_loss / len(traindata):.4f} Acc:{epoch_count/len(traindata):.4f} ')

IndentationError: expected an indented block (<ipython-input-8-e7cd90eb8294>, line 4)

In [None]:
# Alternative: a reusable training function.
import time

# Per-phase lookup tables read by train_model(). Note that `dataset_sizes`
# holds the dataset objects themselves; train_model() calls len() on them.
dataloaders = dict(train=trainloader, val=validloader)
dataset_sizes = dict(train=traindata, val=validdata)
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a training phase followed by a validation phase. The
    function reads the module-level ``dataloaders`` and ``dataset_sizes``
    dicts (keys ``'train'`` / ``'val'``) and the module-level ``device``.

    Args:
        model: Network to optimise; modified in place.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch after the
            training phase.
        num_epochs: Number of epochs to run.

    Returns:
        ``model`` with the weights of the epoch that reached the highest
        validation accuracy (the initial weights if none improved on 0).
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # training mode
            else:
                model.eval()   # evaluation mode

            running_loss = 0.0
            running_corrects = 0  # plain int, so accuracies are plain floats

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Gradients only need resetting when they are computed.
                if is_train:
                    optimizer.zero_grad()

                # Track history only in the training phase.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # criterion yields the batch mean; weight by batch size.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            if is_train:
                scheduler.step()

            num_samples = len(dataset_sizes[phase])
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects / num_samples

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Keep a copy of the best-performing weights.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # '{:.4f}' (precision 4) replaces the original '{:4f}' (width 4), in line
    # with the per-phase accuracy print above.
    print('Best val Acc: {:.4f}'.format(best_acc))

    # Load the best model weights before returning.
    model.load_state_dict(best_model_wts)
    return model



# Train for 25 epochs and keep the returned model.
model = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=exp_lr_scheduler,
    num_epochs=25,
)

In [None]:
# Save the model.
# Option 1: only the parameters (state_dict).
torch.save(obj=model.state_dict(), f='mymodel.pth')
# Option 2: the whole model object.
torch.save(obj=model, f='allmodel')

# APEX

In [None]:
# Notebook shell/magic commands: clone NVIDIA apex, change into the checkout
# and pip-install it with the --cpp_ext and --cuda_ext build options.
!git clone https://github.com/NVIDIA/apex
%cd apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

In [None]:
# Illustrative snippet, not runnable as written: `flags...` and the bare
# `...` are placeholders for the user's own arguments and training code.
from apex import amp
# Only three lines of code are needed.

# Added after model and optimizer construction
model, optimizer = amp.initialize(model, optimizer, flags...)
...
# loss.backward() changed to:
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()

In [None]:
# Template for apex mixed-precision training with checkpointing. The bare
# `...` lines stand in for the user's own code.

# Initialization
# Opt levels, as described in the original notes:
# O0: pure FP32 training; can serve as the accuracy baseline.
# O1: mixed-precision training (recommended); automatically decides between
#     FP16 (GEMM, convolution) and FP32 (Softmax) according to
#     whitelists/blacklists.
# O2: "almost FP16" mixed precision; no whitelists/blacklists -- apart from
#     batch norm, nearly everything is computed in FP16.
# O3: pure FP16 training; very unstable, but can serve as the speed baseline.

opt_level = 'O1'
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)

# Train your model
...
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
...

# Save checkpoint
# The amp state is stored alongside the model and optimizer state.
checkpoint = {
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'amp': amp.state_dict()
}
torch.save(checkpoint, 'amp_checkpoint.pt')
...

# Restore
model = ...
optimizer = ...
checkpoint = torch.load('amp_checkpoint.pt')

# amp.initialize is called with the same opt_level before the saved states
# are loaded.
model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
amp.load_state_dict(checkpoint['amp'])

# Continue training
...

# 或者使用torch.cuda.amp

In [None]:
# Template for mixed-precision training with torch.cuda.amp. `Net`, `optim`,
# `data` and `loss_fn` are not defined in this notebook, and the `...`
# argument is a placeholder.
# NOTE(review): `for epoch in epochs` needs an iterable, but `epochs` is an
# int elsewhere in this notebook -- adapt before running.
from torch.cuda.amp import autocast, GradScaler

# Original note: amp relies on the Tensor Core architecture, so the model
# parameters must be CUDA tensors.
model = Net().cuda()
optimizer = optim.SGD(model.parameters(), ...)
# The GradScaler object performs gradient scaling automatically.
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()
        # Run the forward pass inside the autocast-enabled region.
        with autocast():
            # Original note: the forward pass runs on an FP16 copy of the
            # model -- NOTE(review): confirm against the torch.cuda.amp docs.
            output = model(input)
            loss = loss_fn(output, target)
        # Scale the loss with the scaler, then backward() produces scaled
        # gradients (original note: in FP16).
        scaler.scale(loss).backward()
        # The scaler updates the parameters; per the original note it first
        # unscales the gradients automatically and skips the step when they
        # contain nan or inf.
        scaler.step(optimizer)
        # Update the scale factor.
        scaler.update()