In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import OrderedDict
from collections import namedtuple
from itertools import product

from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter

from IPython.display import display, clear_output
import pandas as pd
import time
import json

# Widen tensor reprs so printed rows are not wrapped too early.
torch.set_printoptions(linewidth=120)
# Make sure autograd is recording gradients (this is already PyTorch's default).
torch.set_grad_enabled(True)


<torch.autograd.grad_mode.set_grad_enabled at 0x15ff4e80e50>

In [35]:
class RunBuilder():
    """Expand a hyper-parameter grid into one named tuple per combination."""

    @staticmethod
    def get_runs(params):
        """Return every combination of the given parameter values.

        Args:
            params: Mapping from parameter name to an iterable of candidate values.

        Returns:
            A list of ``Run`` named tuples, one per element of the Cartesian
            product of the value lists. Fields follow the mapping's key order,
            and the last parameter varies fastest.
        """
        Run = namedtuple('Run', params.keys())
        return [Run(*combo) for combo in product(*params.values())]

In [36]:
class RunManager():
    """Collect, display and persist per-epoch metrics across several training runs.

    Intended call order, as used by the training loop in this notebook:
    ``begin_run`` -> (``begin_epoch`` -> ``track_loss`` / ``track_num_correct``
    per batch -> ``end_epoch``) per epoch -> ``end_run``, then ``save`` once
    all runs are finished.
    """

    def __init__(self):
        # Per-epoch accumulators; reset by begin_epoch().
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        # Per-run state. run_data gets one row per finished epoch and is kept across runs.
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        # Objects of the run in progress; tb is the TensorBoard writer.
        self.network = None
        self.loader = None
        self.tb = None

    def begin_run(self, run, network, loader):
        """Start timing a run and open a TensorBoard writer for it.

        Args:
            run: Named tuple of hyper-parameters. Its optional ``device`` field
                selects where the sample batch is placed for graph tracing
                (falls back to ``'cpu'``).
            network: The model being trained in this run.
            loader: The DataLoader feeding this run.
        """
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'--{run}')

        # Log one sample batch as an image grid and use it to trace the model graph.
        images, _ = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images', grid)
        self.tb.add_graph(
            self.network
            ,images.to(getattr(run, 'device', 'cpu')))

    def end_run(self):
        """Close the TensorBoard writer and reset the epoch counter for the next run."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Start timing a new epoch and reset the per-epoch accumulators."""
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Log the finished epoch to TensorBoard, record a result row and redisplay the table.

        Loss and accuracy are normalized by ``len(self.loader.dataset)``, i.e.
        this assumes the loader visited every sample of the dataset exactly
        once during the epoch.
        """
        # Single timestamp so both durations refer to the same instant:
        # epoch_duration covers this epoch, run_duration the whole run so far.
        now = time.time()
        epoch_duration = now - self.epoch_start_time
        run_duration = now - self.run_start_time

        # Mean loss and accuracy over the dataset.
        num_samples = len(self.loader.dataset)
        loss = self.epoch_loss / num_samples
        accuracy = self.epoch_num_correct / num_samples

        # epoch_count is the TensorBoard step, so each epoch becomes one point.
        self.tb.add_scalar('loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            # .grad stays None for parameters that received no gradient
            # (frozen or unused in the forward pass); there is nothing to plot then.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch_duration"] = epoch_duration
        results["run_duration"] = run_duration

        # Append the run's hyper-parameters so every row is self-describing.
        for k, v in self.run_params._asdict().items():
            results[k] = v

        self.run_data.append(results)

        df = pd.DataFrame.from_dict(self.run_data, orient='columns')

        # Replace the previously displayed table instead of stacking a new one.
        clear_output(wait=True)
        display(df)

    def track_loss(self, loss, batch):
        """Accumulate the batch loss weighted by the number of samples in the batch.

        Args:
            loss: Scalar loss tensor for the batch (assumed to be a per-sample
                mean, which is why it is scaled back up by the batch size).
            batch: Sequence whose first element is the input tensor; its first
                dimension is taken as the batch size.
        """
        self.epoch_loss += loss.item() * batch[0].shape[0]

    def track_num_correct(self, preds, labels):
        """Accumulate the number of correct predictions in the batch."""
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        """Return how many rows of ``preds`` have their arg-max along dim 1 equal to ``labels``."""
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        """Write all recorded rows to ``<fileName>.csv`` and ``<fileName>.json``."""
        pd.DataFrame.from_dict(
            self.run_data, orient='columns'
        ).to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)

In [2]:
class Network(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    With the 1x28x28 inputs used in this notebook the second pooling stage
    yields 12 feature maps of 4x4, which is what ``fc1`` expects. The output
    is a row of 10 raw scores (no softmax) per input image.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 5x5 convolutions, 1 -> 6 -> 12 channels.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 12, kernel_size=5)

        # Classifier head: 192 -> 120 -> 60 -> 10.
        self.fc1 = nn.Linear(12 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 60)
        self.out = nn.Linear(60, 10)

    def forward(self, t):
        """Run the network on a batch of images and return the raw class scores."""
        # Each conv stage: convolution -> ReLU -> 2x2 max-pool with stride 2.
        for conv in (self.conv1, self.conv2):
            t = F.max_pool2d(F.relu(conv(t)), kernel_size=2, stride=2)

        # Flatten the feature maps into one row of 12*4*4 values per sample.
        t = t.reshape(-1, self.fc1.in_features)

        for dense in (self.fc1, self.fc2):
            t = F.relu(dense(t))

        # Output layer is left without an activation.
        return self.out(t)


### Moving to GPU

In [3]:
# Dummy batch holding one single-channel 28x28 image, plus a freshly built network.
t = torch.ones(1,1,28,28)
network = Network()

In [4]:
# Move both the input and the model to the GPU (requires a CUDA device to be available).
t = t.cuda()
network = network.cuda()

In [6]:
# Forward pass with input and weights on the GPU; show which device the prediction is on.
gpu_pred = network(t)
gpu_pred.device

device(type='cuda', index=0)

### Moving back to the CPU

In [7]:
# Bring the same input and model back to the CPU.
t = t.cpu()
network = network.cpu()

In [8]:
# Same forward pass, now on the CPU; show which device the prediction is on.
cpu_pred = network(t)
cpu_pred.device

device(type='cpu')

### Working with tensors

In [9]:
# Two small 2x2 integer tensors used below to demonstrate device placement.
t1 = torch.tensor([[1, 2], [3, 4]])
t2 = torch.tensor([[5, 6], [7, 8]])

In [10]:
# Show where each tensor currently lives.
t1.device, t2.device

(device(type='cpu'), device(type='cpu'))

In [13]:
# Rebind t1 to the result of .to('cuda'); t2 is left where it was.
t1 = t1.to('cuda')
t1.device

device(type='cuda', index=0)

In [14]:
# t1 is on the GPU and t2 on the CPU, so this addition fails; the error is
# printed instead of halting the notebook because the failure is the point.
try: 
    t1+t2
except Exception as e: 
    print(e)

Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!


In [15]:
# Same addition with the operands swapped.
try: 
    t2+t1
except Exception as e: 
    print(e)
# Swapping the operands changes nothing: every tensor taking part in an
# operation is expected to be on the same device.

Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!


In [16]:
# Put t2 on the GPU as well so both operands share a device.
t2 = t2.to('cuda')

In [17]:
# With both tensors on the GPU the addition succeeds.
t1+t2

tensor([[ 6,  8],
        [10, 12]], device='cuda:0')

### working with neural network modules

In [18]:
# Fresh network for the module-placement demo below.
network = Network()

In [19]:
# List every learnable parameter of the network with its shape.
for name,param in network.named_parameters():
    print(name,'\t\t\t',param.shape)

conv1.weight 			 torch.Size([6, 1, 5, 5])
conv1.bias 			 torch.Size([6])
conv2.weight 			 torch.Size([12, 6, 5, 5])
conv2.bias 			 torch.Size([12])
fc1.weight 			 torch.Size([120, 192])
fc1.bias 			 torch.Size([120])
fc2.weight 			 torch.Size([60, 120])
fc2.bias 			 torch.Size([60])
out.weight 			 torch.Size([10, 60])
out.bias 			 torch.Size([10])


In [23]:
# Show the device of each parameter before the network is moved.
for n,p in network.named_parameters():
    print(p.device,'',n)

cpu  conv1.weight
cpu  conv1.bias
cpu  conv2.weight
cpu  conv2.bias
cpu  fc1.weight
cpu  fc1.bias
cpu  fc2.weight
cpu  fc2.bias
cpu  out.weight
cpu  out.bias


In [24]:
# No reassignment here: the device listing in the following cell shows the
# parameters of this same `network` object on cuda:0 afterwards.
network.to('cuda')

Network(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (out): Linear(in_features=60, out_features=10, bias=True)
)

In [26]:
# Show the device of each parameter again, now that the network has been moved.
for n,p in network.named_parameters():
    print(p.device,'',n)

cuda:0  conv1.weight
cuda:0  conv1.bias
cuda:0  conv2.weight
cuda:0  conv2.bias
cuda:0  fc1.weight
cuda:0  fc1.bias
cuda:0  fc2.weight
cuda:0  fc2.bias
cuda:0  out.weight
cuda:0  out.bias


In [28]:
# Dummy batch with one single-channel 28x28 image; it is not moved to the GPU here.
sample = torch.ones(1,1,28,28)
sample.shape

torch.Size([1, 1, 28, 28])

In [29]:
# The network's weights are on the GPU but `sample` is not, so the forward pass
# fails; print the error instead of halting the notebook.
try:
    pred = network(sample)
except Exception as e:
    print(e)

Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor


In [31]:
# Moving the sample to the GPU first puts input and weights on the same device.
try:
    pred = network(sample.to('cuda'))
    print(pred)
except Exception as e:
    print(e)

tensor([[ 0.0536,  0.0409, -0.0962, -0.0618,  0.0067,  0.0004, -0.0071,  0.1523, -0.0071,  0.0606]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


### checking for gpu

In [27]:
# Check whether PyTorch can use CUDA on this machine.
torch.cuda.is_available()

True

### using the GPU:TEST

In [37]:
# FashionMNIST training split stored under ./data/ (downloaded when needed);
# ToTensor converts each image to a tensor as it is loaded.
train_set = torchvision.datasets.FashionMNIST(
    root='./data/'
    ,train=True
    ,download=True
    ,transform = transforms.Compose([
        transforms.ToTensor()
    ])
)

In [41]:
# Hyper-parameter grid: one run is executed per combination of these values.
# 'cuda' is only benchmarked when a GPU is present, so the cell also runs on
# CPU-only machines instead of failing when the network is moved.
params = OrderedDict(
    lr = [.01]
    ,batch_size = [1000,10000,20000]
    ,num_workers = [0,1]
    ,device = ['cuda','cpu'] if torch.cuda.is_available() else ['cpu']
)

m = RunManager()

for run in RunBuilder.get_runs(params):
    # Build the model directly on the device under test.
    device = torch.device(run.device)
    network = Network().to(device)
    loader = DataLoader(train_set,batch_size = run.batch_size,num_workers=run.num_workers)
    optimizer = optim.Adam(network.parameters(),lr=run.lr)

    m.begin_run(run,network,loader)
    for epoch in range(1):
        m.begin_epoch()
        for batch in loader:
            # The loader yields CPU tensors; both images and labels must be
            # moved to the same device as the network before the forward pass.
            images = batch[0].to(device)
            labels = batch[1].to(device)
            preds = network(images)
            loss = F.cross_entropy(preds,labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            m.track_loss(loss,batch)
            m.track_num_correct(preds,labels)
        m.end_epoch()
    m.end_run()
# Persist the collected rows as results.csv and results.json.
m.save('results')
    


Unnamed: 0,run,epoch,loss,accuracy,epoch_duration,run_duration,lr,batch_size,num_workers,device
0,1,1,1.015932,0.6079,16.006018,16.971747,0.01,1000,0,cuda
1,2,1,1.039381,0.5979,14.61278,15.636095,0.01,1000,0,cpu
2,3,1,0.976098,0.631783,10.878562,13.843743,0.01,1000,1,cuda
3,4,1,0.955986,0.6343,12.89868,16.103339,0.01,1000,1,cpu
4,5,1,2.140763,0.182933,11.000007,18.505928,0.01,10000,0,cuda
5,6,1,2.221277,0.150433,13.049731,21.425449,0.01,10000,0,cpu
6,7,1,2.132476,0.197883,10.122982,19.83083,0.01,10000,1,cuda
7,8,1,2.129607,0.215767,10.961226,21.86236,0.01,10000,1,cpu
8,9,1,2.258637,0.194817,11.924463,22.432452,0.01,20000,0,cuda
9,10,1,2.289275,0.12325,13.810182,27.138647,0.01,20000,0,cpu


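#### 为什么 GPU 和 CPU 没有太大差距？为什么第一次训练花费的时间显著高于其他参数组合？

- 差距不大：可能我的 cpu 挺强；也可能是因为这个网络很小，每个 epoch 的时间主要花在 CPU 端的数据加载（`DataLoader` + `ToTensor`）上，而不是前向/反向计算上，所以换到 GPU 的收益有限。
- 第一次更慢：可能需要预热——第一次使用 CUDA 时需要初始化 CUDA 上下文，这部分一次性开销会被计入第一个 run 的耗时。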

In [42]:
# Re-display the collected results ordered by epoch duration, fastest first.
pd.DataFrame.from_dict(m.run_data,orient = 'columns').sort_values('epoch_duration')

Unnamed: 0,run,epoch,loss,accuracy,epoch_duration,run_duration,lr,batch_size,num_workers,device
6,7,1,2.132476,0.197883,10.122982,19.83083,0.01,10000,1,cuda
10,11,1,2.265417,0.15055,10.463,26.216842,0.01,20000,1,cuda
2,3,1,0.976098,0.631783,10.878562,13.843743,0.01,1000,1,cuda
7,8,1,2.129607,0.215767,10.961226,21.86236,0.01,10000,1,cpu
4,5,1,2.140763,0.182933,11.000007,18.505928,0.01,10000,0,cuda
8,9,1,2.258637,0.194817,11.924463,22.432452,0.01,20000,0,cuda
11,12,1,2.2919,0.119133,12.433502,30.910857,0.01,20000,1,cpu
3,4,1,0.955986,0.6343,12.89868,16.103339,0.01,1000,1,cpu
5,6,1,2.221277,0.150433,13.049731,21.425449,0.01,10000,0,cpu
9,10,1,2.289275,0.12325,13.810182,27.138647,0.01,20000,0,cpu
