In [2]:
# imports
import matplotlib.pyplot as plt
import matplotlib
import joblib
import cv2
import os
import torch 
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
import random
import pretrainedmodels

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from torchvision.transforms import transforms
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

matplotlib.style.use('ggplot')

'''SEED Everything'''
def seed_everything(SEED=42):
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True # keep True if all the input have same size.
SEED=42
seed_everything(SEED=SEED)
'''SEED Everything'''

'SEED Everything'

In [3]:
if torch.cuda.is_available():
    device = 'cuda'
    print('using cuda')
else:
    device = 'cpu'
 
epochs = 25
BATCH_SIZE = 16

using cuda


In [6]:
fileNames = np.loadtxt('./train.txt', dtype=str, delimiter='\t')
np.random.shuffle(fileNames)

evalNames = np.loadtxt('./test.txt', dtype=str, delimiter='\t')
np.random.shuffle(evalNames)

# for i in range(len(fileNames)):
    # print(fileNames[i])
    # fileNames[i][0] = dataDir + fileNames[i][0]
print(fileNames)

[['./data/cifar10/train/horse/1072.png' '7']
 ['./data/cifar10/train/cat/2417.png' '3']
 ['./data/cifar10/train/dog/2157.png' '5']
 ...
 ['./data/cifar10/train/bird/4104.png' '2']
 ['./data/cifar10/train/truck/2775.png' '9']
 ['./data/cifar10/train/bird/3445.png' '2']]


In [7]:
t_data = []
t_labels = []
for i in range(len(fileNames)):
    label = fileNames[i][1]
    image = cv2.imread(fileNames[i][0])
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
    t_data.append(image)
    t_labels.append(int(label))

t_data = np.array(t_data)
t_labels = np.array(t_labels)
print(t_labels)

[7 3 5 ... 2 9 2]


In [9]:
e_data = []
e_labels = []

for i in range(len(evalNames)):
    label = evalNames[i][1]
    image = cv2.imread(evalNames[i][0])
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
    e_data.append(image)
    e_labels.append(int(label))

e_data = np.array(e_data)
e_labels = np.array(e_labels)

In [10]:
print(e_labels)

[5 0 7 ... 8 2 5]


In [11]:
# one hot encode
lb = LabelBinarizer()
labels = lb.fit_transform(t_labels)
e_labels = lb.fit_transform(e_labels)
t_labels = lb.fit_transform(t_labels)
# print(labels)
print(t_labels[0])
print(f"Total number of classes: {len(lb.classes_)}")

[0 0 0 0 0 0 0 1 0 0]
Total number of classes: 10


In [12]:
print(lb.inverse_transform(t_labels))

[7 3 5 ... 2 9 2]


In [13]:
# define transforms
train_transform = transforms.Compose(
    [transforms.ToPILImage(),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])])
val_transform = transforms.Compose(
    [transforms.ToPILImage(),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])])

In [15]:
# 可改变，尝试使用给出的Eval.txt和Test.txt
# divide the data into train, validation, and test set

x_test = e_data
y_test = e_labels


(x_train, x_val , y_train, y_val) = train_test_split(t_data, t_labels,
                                                    test_size=0.2,  
                                                    stratify=labels,
                                                    random_state=42)

print(f"x_train examples: {x_train.shape}\nx_test examples: {x_test.shape}\nx_val examples: {x_val.shape}")
print(type(x_train))
print(y_val)

x_train examples: (40000, 32, 32, 3)
x_test examples: (10000, 32, 32, 3)
x_val examples: (10000, 32, 32, 3)
<class 'numpy.ndarray'>
[[0 0 0 ... 1 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]


In [17]:
# custom dataset
class ImageDataset(Dataset):
    def __init__(self, images, _labels=None, _transforms=None):
        self.X = images
        self.y = _labels
        self.transforms = _transforms
         
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, i):
        data = self.X[i][:]
        
        if self.transforms:
            data = self.transforms(data)
            
        if self.y is not None:
            return data, self.y[i]
        else:
            return data
        
train_data = ImageDataset(x_train, y_train, train_transform)
val_data = ImageDataset(x_val, y_val, val_transform)
test_data = ImageDataset(x_test, y_test, val_transform)

In [18]:
# dataloaders
trainloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False)
valloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
testloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [25]:
# the resnet34 model
class ResNet34(nn.Module):
    def __init__(self, pretrained):
        super(ResNet34, self).__init__()
        if pretrained is True:
            self.model = pretrainedmodels.__dict__['resnet34'](pretrained='imagenet')
        else:
            self.model = pretrainedmodels.__dict__['resnet34'](pretrained=None)
        
        # change the classification layer
        self.l0 = nn.Linear(512, len(lb.classes_))
        self.dropout = nn.Dropout2d(0.4)

    def forward(self, x):
        # get the batch size only, ignore (c, h, w)
        batch, _, _, _ = x.shape
        x = self.model.features(x)
        x = F.adaptive_avg_pool2d(x, 1).reshape(batch, -1)
        x = self.dropout(x)
        l0 = self.l0(x)
        return l0

model = ResNet34(pretrained=True).to(device)

In [20]:
# optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# loss function
criterion = nn.CrossEntropyLoss()

In [21]:
# training function
def fit(model, dataloader):
    print('Training')
    model.train()
    running_loss = 0.0
    running_correct = 0
    for i, data in tqdm(enumerate(dataloader), total=int(len(train_data)/dataloader.batch_size)):
        data, target = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, torch.max(target, 1)[1])
        running_loss += loss.item()
        _, preds = torch.max(outputs.data, 1)
        running_correct += (preds == torch.max(target, 1)[1]).sum().item()
        loss.backward()
        optimizer.step()
        
    loss = running_loss/len(dataloader.dataset)
    accuracy = 100. * running_correct/len(dataloader.dataset)
    
    print(f"Train Loss: {loss:.4f}, Train Acc: {accuracy:.2f}")
    
    return loss, accuracy

In [22]:
#validation function
def validate(model, dataloader):
    print('Validating')
    model.eval()
    running_loss = 0.0
    running_correct = 0
    with torch.no_grad():
        for i, data in tqdm(enumerate(dataloader), total=int(len(val_data)/dataloader.batch_size)):
            data, target = data[0].to(device), data[1].to(device)
            outputs = model(data)
            loss = criterion(outputs, torch.max(target, 1)[1])
            
            running_loss += loss.item()
            _, preds = torch.max(outputs.data, 1)
            running_correct += (preds == torch.max(target, 1)[1]).sum().item()
        
        loss = running_loss/len(dataloader.dataset)
        accuracy = 100. * running_correct/len(dataloader.dataset)
        print(f'Val Loss: {loss:.4f}, Val Acc: {accuracy:.2f}')
        
        return loss, accuracy

In [23]:
def test(model, dataloader):
    correct = 0
    total = 0
    with open('result_raw.txt', 'w') as out_file:
        with torch.no_grad():
            for data in testloader:
                inputs, target = data[0].to(device), data[1].to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                print(predicted)
                # print(torch.max(target, 1)[1])
                out_file.write(str(predicted))
                out_file.write('\n')
                total += target.size(0)
                correct += (predicted == torch.max(target, 1)[1]).sum().item()
 
    return correct, total

In [26]:
train_loss , train_accuracy = [], []
val_loss , val_accuracy = [], []
print(f"Training on {len(train_data)} examples, validating on {len(val_data)} examples...")
start = time.time()
for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
    train_epoch_loss, train_epoch_accuracy = fit(model, trainloader)
    val_epoch_loss, val_epoch_accuracy = validate(model, valloader)
    train_loss.append(train_epoch_loss)
    train_accuracy.append(train_epoch_accuracy)
    val_loss.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)
end = time.time()
print((end-start)/60, 'minutes')
 
torch.save(model.state_dict(), f"resnet34_epochs{epochs}.pth")
 
# accuracy plots
plt.figure(figsize=(10, 7))
plt.plot(train_accuracy, color='green', label='train accuracy')
plt.plot(val_accuracy, color='blue', label='validataion accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('accuracy.png')
 
# loss plots
plt.figure(figsize=(10, 7))
plt.plot(train_loss, color='orange', label='train loss')
plt.plot(val_loss, color='red', label='validataion loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('loss.png')


Training on 40000 examples, validating on 10000 examples...
Epoch 1 of 25
Training


100%|██████████| 2500/2500 [01:04<00:00, 38.56it/s]


Train Loss: 0.1789, Train Acc: 9.80
Validating


100%|██████████| 625/625 [00:06<00:00, 92.83it/s]


Val Loss: 0.1699, Val Acc: 9.86
Epoch 2 of 25
Training


 62%|██████▏   | 1548/2500 [00:39<00:24, 39.31it/s]


KeyboardInterrupt: 

In [21]:
# save the accuracy and loss lists as pickled files
print('Pickling accuracy and loss lists...')
joblib.dump(train_accuracy, 'train_accuracy.pkl')
joblib.dump(train_loss, 'train_loss.pkl')
joblib.dump(val_accuracy, 'val_accuracy.pkl')
joblib.dump(val_loss, 'val_loss.pkl')

Pickling accuracy and loss lists...


['val_loss.pkl']

In [24]:
correct, total = test(model, testloader)
print(correct, total)
print('Accuracy of the network on test images: %0.3f %%' % (100 * correct / total))
print('train.py finished running')

tensor([11,  7,  1, 13,  3,  1,  6, 12, 15,  1,  6, 10,  1,  4,  7,  5],
       device='cuda:0')
tensor([12,  6,  7,  5,  2,  4, 11,  9, 11,  9,  2, 13, 10,  7,  3,  1],
       device='cuda:0')
tensor([15,  3,  4,  5,  1,  6,  8,  8, 12,  2,  4,  5, 10,  7,  7, 14],
       device='cuda:0')
tensor([11, 12, 14,  1, 15,  2, 10,  3, 11, 14,  5,  8,  1,  3, 14,  9],
       device='cuda:0')
tensor([ 9, 15,  3, 14, 14, 13, 12,  3, 12,  4,  6, 10,  6, 11,  0,  2],
       device='cuda:0')
tensor([15,  4, 13, 12,  6, 10,  2, 11,  6, 10,  3,  1,  9, 12,  1,  6],
       device='cuda:0')
tensor([ 8,  2, 12, 10, 15, 10,  5,  2,  1,  1,  9,  9,  7,  5, 11,  6],
       device='cuda:0')
tensor([ 0, 13, 11,  4,  6,  6, 12,  7,  1, 15,  1,  9,  4,  3,  8, 11],
       device='cuda:0')
tensor([ 5,  0, 12, 12,  1,  9, 11,  0, 12,  5, 15, 12,  6,  9,  1,  8],
       device='cuda:0')
tensor([ 0,  5,  1,  1,  3, 11,  7,  4, 10, 13, 14,  7,  6, 13,  3,  9],
       device='cuda:0')
tensor([14, 10,  4,  0, 15,  1

In [4]:
import re

pat1 = r'%s(.+?)%s' % ('\[', '\]')
p = re.compile(pat1, re.IGNORECASE)

with open('result_raw.txt', 'r') as in_file:
    with open('result.txt', 'w', encoding='utf-8', newline='\n') as out_file:
        lines = in_file.readlines()
        for line in lines:
            row = re.findall(p, line)
            if row:
                num_list = [str(x.strip()) for x in row[0].split(',')]
                for i in num_list:
                    out_file.write(i)
                    out_file.write('\n')
                    
print('parsing done.')

parsing done.


In [3]:
train_acc = joblib.load('train_accuracy.pkl')

In [4]:
print(train_acc)

[69.75111678366305, 96.936821952776, 99.617102744097, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]
