In [4]:
import numpy as np
import torch
from torch import nn
from torchvision import transforms
import cv2 as cv
from torch.utils.data import Dataset, DataLoader
from typing import *
import sys
import os
import random
from utils import *
from net import resnet18
import pandas as pd
import math
from torch.nn import functional as F



# Directory every relative path in this notebook ("./train.txt",
# "./trainLabels.csv", "./logs", ...) is resolved against.
# sys.path[0] may be the empty string in some interpreters/kernels, and
# os.chdir("") raises, so fall back to the current working directory.
runtime_path = sys.path[0] or os.getcwd()

os.chdir(runtime_path)

In [5]:
def try_all_GPUS() -> List[torch.device]:
    """Return every visible CUDA device, or ``[cpu]`` when there is none.

    The result is always a non-empty list so callers can index it
    (``devices[0]``) regardless of the hardware available.

    Returns:
        A list with one ``torch.device("cuda:i")`` per CUDA device reported by
        ``torch.cuda.device_count()``; ``[torch.device("cpu")]`` if that count
        is zero.
    """
    devices = [torch.device(f"cuda:{i}") for i in range(torch.cuda.device_count())]
    # Wrap the CPU fallback in a list too: the previous bare torch.device was
    # not subscriptable and contradicted the declared return type.
    return devices if devices else [torch.device("cpu")]


# Resolve the compute devices once and display them as the cell output.
devices = try_all_GPUS()
devices


[device(type='cuda', index=0)]

In [6]:
def path_join(*args):
    """Join path components; a thin alias for ``os.path.join``."""
    return os.path.join(*args)


class CIFAR10_dataset(Dataset):
    """CIFAR-10 images read from individual files on disk.

    The mode is selected by ``type_dataset``:

    * ``"train"`` - regenerates the train/validation split files from the
      contents of ``<runtime_path>/train`` and serves the training part with
      random augmentation.
    * ``"vaild"`` - serves the validation part listed in ``train_vaild.txt``
      (written by an earlier ``"train"`` construction), without augmentation.
    * anything else - treated as test: every file in ``<runtime_path>/test``
      is served and ``__getitem__`` returns ``(image, file_stem)`` instead of
      ``(image, class_index)``.

    Args:
        type_dataset: ``"train"``, ``"vaild"`` or any other value for test.
        vaild_rate: fraction of the training files held out for validation.
        seed: optional seed for the split. ``None`` (default) keeps the
            original behaviour of a fresh, unseeded shuffle on every
            ``"train"`` construction; an int makes the split reproducible.

    Relies on the notebook-level ``runtime_path`` and on ``./trainLabels.csv``
    being readable from the current working directory.
    """

    def __init__(self, type_dataset: str = "train", vaild_rate=0.1,
                 seed: Optional[int] = None) -> None:
        super().__init__()
        self.type_dataset = type_dataset
        self.vaild_rate = vaild_rate
        self.seed = seed

        # Per-channel mean / std shared by both pipelines. These are the
        # commonly quoted CIFAR-10 statistics, given in R, G, B order.
        normalize = transforms.Normalize([0.4914, 0.4822, 0.4465],
                                         [0.2023, 0.1994, 0.2010])
        self.transform_train = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize(42),
            transforms.RandomResizedCrop(32, (0.6, 1.0), ratio=(0.8, 1.0)),
            transforms.RandomHorizontalFlip(p=0.8),
            normalize,
        ])
        self.transform_vaild = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
        self.labels_dict = self.parse_csv2label()
        self.classes = ['airplane', 'automobile', 'bird', 'cat',
                        'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

        self.root_path = os.path.join(runtime_path, "train")
        if "train" == self.type_dataset:
            # The split files are (re)generated only in "train" mode.
            self.shuffle_train_vaild()
            self.file_path_list = self._read_path_list("train.txt")
        elif "vaild" == self.type_dataset:
            self.file_path_list = self._read_path_list("train_vaild.txt")
        else:  # test
            self.root_path = os.path.join(runtime_path, "test")
            self.file_path_list = [
                os.path.join(self.root_path, file_name)
                for file_name in os.listdir(self.root_path)
            ]

    def _split_file_path(self, name):
        """Location of a split list file: next to the ``root_path`` directory."""
        return os.path.join(self.root_path, "../", name)

    def _read_path_list(self, name):
        """Read one split list file and return its non-empty, stripped lines."""
        # Read from the same location shuffle_train_vaild writes to, so the
        # result does not depend on the current working directory.
        with open(self._split_file_path(name), "r") as f:
            return [line.strip() for line in f if line.strip()]

    def __getitem__(self, index):
        """Load, transform and return sample ``index``.

        Returns:
            ``(tensor, class_index)`` in train / validation mode,
            ``(tensor, file_stem)`` in test mode.

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        file_path = self.file_path_list[index].strip()
        # Text before the first dot of the base name, e.g. ".../123.png" -> "123".
        file_name = os.path.basename(file_path).split(".")[0]

        img = cv.imread(file_path)
        if img is None:
            # cv.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"cannot read image: {file_path}")
        # OpenCV decodes to BGR; the Normalize statistics are in RGB order.
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)

        if "train" == self.type_dataset:
            X = self.transform_train(img)
            return X, self.classes.index(self.labels_dict[file_name])
        elif "vaild" == self.type_dataset:
            X = self.transform_vaild(img)
            return X, self.classes.index(self.labels_dict[file_name])
        else:  # test
            X = self.transform_vaild(img)
            return X, file_name

    def parse_csv2label(self):
        """Parse ``./trainLabels.csv`` into ``{first column: second column}``.

        The first line is skipped as a header; blank lines are ignored.
        """
        with open("./trainLabels.csv", "r") as f:
            rows = [line.strip().split(',') for line in f if line.strip()]
        return {row[0]: row[1] for row in rows[1:]}

    def shuffle_train_vaild(self):
        """Shuffle the files in ``root_path`` and write the two split lists.

        The first ``int(n * (1 - vaild_rate))`` shuffled paths go to
        ``train.txt`` and the rest to ``train_vaild.txt``, both placed next to
        the ``root_path`` directory. I/O errors propagate to the caller:
        continuing after a failed write would silently reuse a stale split.
        """
        file_name_list = os.listdir(self.root_path)
        if self.seed is None:
            random.shuffle(file_name_list)
        else:
            # os.listdir order is arbitrary; sort first so the seed fully
            # determines the split.
            file_name_list.sort()
            random.Random(self.seed).shuffle(file_name_list)

        split_at = int(len(file_name_list) * (1 - self.vaild_rate))
        parts = (
            ("train.txt", file_name_list[:split_at]),
            ("train_vaild.txt", file_name_list[split_at:]),
        )
        for list_name, names in parts:
            with open(self._split_file_path(list_name), "w") as writer:
                writer.write(
                    "\n".join(os.path.join(self.root_path, file_name) for file_name in names)
                )

    def __len__(self):
        """Number of samples in the active split."""
        return len(self.file_path_list)




def load_CIFAR10_iter(batch_size=64, num_workers=28):
    """Build the training and validation loaders.

    The "train" dataset is constructed before the "vaild" one; both loaders
    shuffle and share ``batch_size`` and ``num_workers``.

    Returns:
        ``(train_loader, vaild_loader)``
    """
    loaders = []
    for split in ("train", "vaild"):
        dataset = CIFAR10_dataset(split)
        loaders.append(
            DataLoader(dataset, batch_size, shuffle=True, num_workers=num_workers)
        )
    return tuple(loaders)


# Build the model from the project-local `net` module and display its structure.
net = resnet18() # instantiate.
net

init parameters used by xavier_uniform method. 


resnet18(
  (stage1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
  )
  (stage2): Sequential(
    (0): Residual(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Residual(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn2): BatchNorm2d(64, eps=1e-05, mome

In [7]:
import torchvision

# net = torchvision.models.resnet34(pretrained=True)

# net = torchvision.models.resnet101(pretrained=True)

# Hyperparameters consumed by the loader / training cells below.
epoch = 80  # passed to train_cos as num_epoch
batch_size = 512  # passed to load_CIFAR10_iter
lr = 5e-2  # initial learning rate handed to the optimizer
loss_fn = torch.nn.CrossEntropyLoss()
# train_cifar10(net, lr, batch_size, epoch)


In [8]:
# net.fc = nn.Linear(512, 10)
# print(net);

# Building the loaders constructs CIFAR10_dataset("train"), which rewrites the
# train / validation split files on every run of this cell.
train_iter, vaild_iter = load_CIFAR10_iter(batch_size)
# iter(train_iter).__next__()

FileNotFoundError: [Errno 2] No such file or directory: '/home/wakinghours/programming/LiMu-DeepLearning/kaggle/cifar10/train'

In [None]:

def train_cos(
    net: nn.Module, loss_fn: nn.Module,
    train_iter:DataLoader, test_iter:DataLoader,
    lr, num_epoch,
    momentum=0.937, weight_decay=5e-4,
    load_path:str=None, devices=try_all_GPUS(),
):
    """Train ``net`` with Adam under a cosine-shaped learning-rate schedule.

    The learning-rate multiplier starts at 1 and follows half a cosine down
    to ``eps`` (0.35) at ``num_epoch``. Every 10th epoch the model is
    evaluated on ``test_iter`` and a checkpoint is written to ``./logs``.

    Args:
        net: model to train; wrapped in ``nn.DataParallel`` over ``devices``.
        loss_fn: loss module, moved to the first device.
        train_iter: loader yielding ``(inputs, labels)`` batches.
        test_iter: loader handed to ``evaluate_test_with_GPUS``.
        lr: initial learning rate for Adam.
        num_epoch: number of epochs, also the cosine period.
        momentum: currently unused (only the commented-out SGD optimizer
            would consume it).
        weight_decay: currently unused, same reason as ``momentum``.
        load_path: optional checkpoint loaded into the DataParallel-wrapped
            model before training.
        devices: list of devices; a single ``torch.device`` is also accepted.
    """
    # Tolerate a bare device so that devices[0] below is always valid.
    if isinstance(devices, torch.device):
        devices = [devices]
    main_device = devices[0]

    eps = 0.35
    net = nn.DataParallel(net, devices).to(main_device)
    loss_fn.to(main_device)
    if load_path:
        print("load net parameters: ", load_path)
        # map_location lets a checkpoint saved on another device be restored here.
        net.load_state_dict(torch.load(load_path, map_location=main_device))
    # trainer = torch.optim.SGD(net.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
    trainer = torch.optim.Adam(net.parameters(), lr=lr)

    # Multiplier applied to lr: 1 at epoch 0, eps at epoch num_epoch.
    lf = lambda x: ((1 + math.cos(x * math.pi / num_epoch)) / 2) * (1 - eps) + eps
    # Spelled via torch.optim: a bare `optim` name is never imported explicitly here.
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer=trainer,
        lr_lambda=lf,
    )

    metric = Accumulator(3)
    print("train on:", devices)
    for epoch in range(num_epoch):
        net.train()
        metric.reset()
        for i, (x, labels) in enumerate(train_iter):
            x, labels = x.to(main_device), labels.to(main_device)
            y_hat = net(x)
            loss = loss_fn(y_hat, labels)

            trainer.zero_grad()     # clear gradients of the previous step
            loss.sum().backward()   # back-propagate
            trainer.step()          # update the weights

            # Detach so the accumulator never keeps the autograd graph alive.
            # NOTE(review): the last slot counts batches (1 per step), so
            # metric[0]/metric[-1] is a per-batch mean loss; whether
            # metric[1]/metric[-1] is a valid accuracy depends on what
            # `accuracy` returns (count vs. fraction) - confirm against utils.
            metric.add(loss.detach().sum(), accuracy(y_hat, labels), 1)

        scheduler.step()  # the schedule advances once per epoch

        # Evaluate and checkpoint every 10 epochs.
        if (epoch+1) % 10 == 0:
            test_accuracy = evaluate_test_with_GPUS(net, test_iter)
            print(epoch+1, "test acc:", test_accuracy, "train loss:", metric[0]/metric[-1], "train acc:", metric[1]/metric[-1])

            # Create the directory up front; the previous bare except hid the
            # real error whenever saving failed for any other reason.
            os.makedirs("./logs", exist_ok=True)
            checkpoint_path = f"./logs/epoch{epoch+1}_testacc{test_accuracy:4.3}_loss{metric[0]/metric[-1]:3.2}_acc{metric[1]/metric[-1]:.2}.pth"
            torch.save(net.state_dict(), checkpoint_path)




# Run training; the validation loader is passed as train_cos's test_iter.
train_cos(net, loss_fn, train_iter, vaild_iter, lr, epoch)


# train(
#     net,
#     loss_fn,
#     train_iter, vaild_iter, 
#     lr, epoch, 
#     80, 10, 0.95, 8e-2,
#     # "                                                                                    logs/epoch200_testacc0.823_loss0.29_acc0.9.pth"
# )



train on: [device(type='cuda', index=0), device(type='cuda', index=1)]
10 test acc: 0.6994 train loss: 0.0013543997406959534 train acc: 0.7572
20 test acc: 0.7828 train loss: 0.0008369606859154171 train acc: 0.8506666666666667
30 test acc: 0.7912 train loss: 0.0005660829418235355 train acc: 0.8988
40 test acc: 0.807 train loss: 0.00040100363426738317 train acc: 0.9293777777777777
50 test acc: 0.8198 train loss: 0.00030241927206516267 train acc: 0.9480444444444445
60 test acc: 0.8196 train loss: 0.0002213421877887514 train acc: 0.9624444444444444


KeyboardInterrupt: 

In [None]:
# Dataset / loader over the test directory; order is kept (shuffle=False).
test_datasets = CIFAR10_dataset("test")
test_iter = DataLoader(
    test_datasets,
    50000,  # batch size. NOTE(review): very large for one forward pass - confirm it fits in memory
    shuffle=False,
    num_workers=30,
)

# Class-name list used to turn predicted indices into label strings.
classes = test_datasets.classes
classes

In [None]:
def evel_test_iter(net, test_iter: DataLoader, load_path = None, devices=try_all_GPUS()):
    """Predict every batch of ``test_iter`` and write ``./submission.csv``.

    Args:
        net: model to evaluate; wrapped in ``DataParallel`` over ``devices``.
        test_iter: loader yielding ``(features, file_names)`` where each file
            name is convertible with ``int`` (it becomes the ``id`` column).
        load_path: optional checkpoint loaded into the DataParallel-wrapped
            model before predicting. It was previously accepted but ignored.
        devices: list of devices; a single ``torch.device`` is also accepted.

    The CSV has columns ``id`` and ``label``, sorted by ``id``. Label names
    come from ``test_iter.dataset.classes`` when the dataset exposes it,
    otherwise from the notebook-level ``classes``.
    """
    # Tolerate a bare device so that devices[0] below is always valid.
    if isinstance(devices, torch.device):
        devices = [devices]
    main_device = devices[0]

    net = torch.nn.DataParallel(net, devices).to(device=main_device)
    if load_path:
        print("load net parameters: ", load_path)
        net.load_state_dict(torch.load(load_path, map_location=main_device))

    # Prefer the names carried by the dataset over a global from another cell.
    label_names = getattr(test_iter.dataset, "classes", None)
    if label_names is None:
        label_names = classes

    predictions, image_ids = [], []
    net.eval()
    with torch.no_grad():
        for features, file_names in test_iter:
            logits = net(features.to(main_device))
            predictions.extend(logits.argmax(dim=1).cpu().tolist())
            image_ids.extend(int(name) for name in file_names)

    rows = sorted(zip(image_ids, predictions), key=lambda pair: pair[0])
    df = pd.DataFrame({
        'id': [image_id for image_id, _ in rows],
        'label': [label_names[class_index] for _, class_index in rows],
    })
    df.to_csv('./submission.csv', index=False)
    

# No load_path is passed, so the weights currently held by `net` are used.
evel_test_iter(net, test_iter)


In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory.
torch.cuda.empty_cache()