# Pytorch Intro

The purpose of this notebook is to walk through the introductory PyTorch tutorials located here:


http://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html



In [6]:
from __future__ import print_function
import torch

In [8]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix the global RNG seed so the random tensors below are reproducible.
# The call stays the last expression of the cell, so the Generator it
# returns is still displayed as the cell output.
SEED = 1
torch.manual_seed(SEED)

<torch._C.Generator at 0x111803cd0>

In [2]:
# Allocate a 5x3 float tensor WITHOUT initializing it: the numbers printed
# are leftover memory contents, which is why the output looks like garbage.
n_rows, n_cols = 5, 3
x = torch.Tensor(n_rows, n_cols)
print(x)


 0.0000e+00  2.0000e+00  0.0000e+00
 2.0000e+00  5.2241e+22  6.4462e-10
 1.0682e-05  5.3930e-05  6.4535e-10
 3.2798e-09  5.2944e+22  2.7450e-06
 4.3247e-05  1.6821e-04  0.0000e+00
[torch.FloatTensor of size 5x3]



In [3]:
# Build a 5x3 matrix of random values (torch.rand samples uniformly from [0, 1)).
shape = (5, 3)
x = torch.rand(*shape)
print(x)


 0.0411  0.8415  0.1255
 0.7737  0.8568  0.4041
 0.4328  0.9454  0.2446
 0.3803  0.3818  0.6631
 0.2154  0.1732  0.5112
[torch.FloatTensor of size 5x3]



In [4]:
# Tensor size: size() returns a torch.Size describing every dimension.
dims = x.size()
print(dims)


torch.Size([5, 3])


In [6]:
# Tensor addition, syntax 1: the `+` operator adds element by element.
y = torch.rand(5, 3)
elementwise_sum = x + y
print(elementwise_sum)


 0.4349  1.0159  0.8791
 0.9203  1.1791  0.5712
 1.3673  1.0019  0.4322
 1.3511  0.3992  1.0272
 0.4649  0.6296  0.5653
[torch.FloatTensor of size 5x3]



In [7]:
# Tensor addition, syntax 2: the functional form torch.add(x, y).
# The output matches the `x + y` cell.
functional_sum = torch.add(x, y)
print(functional_sum)


 0.4349  1.0159  0.8791
 0.9203  1.1791  0.5712
 1.3673  1.0019  0.4322
 1.3511  0.3992  1.0272
 0.4649  0.6296  0.5653
[torch.FloatTensor of size 5x3]



In [9]:
# Store the sum in a pre-allocated tensor: `out=` tells torch.add where to
# write the result instead of returning a freshly allocated tensor.
buffer_shape = (5, 3)
result = torch.Tensor(*buffer_shape)
torch.add(x, y, out=result)
print(result)


 0.4349  1.0159  0.8791
 0.9203  1.1791  0.5712
 1.3673  1.0019  0.4322
 1.3511  0.3992  1.0272
 0.4649  0.6296  0.5653
[torch.FloatTensor of size 5x3]



In [10]:
# "In place" addition.
#
# Methods whose name ends in an underscore (add_, zero_, ...) modify the
# tensor they are called on instead of returning a new one. Here y itself
# becomes x + y; add_ also returns that same modified tensor, which is what
# gets printed.
#
# Because y is mutated, re-running this cell adds x to y again.
print(y.add_(x))


 0.4349  1.0159  0.8791
 0.9203  1.1791  0.5712
 1.3673  1.0019  0.4322
 1.3511  0.3992  1.0272
 0.4649  0.6296  0.5653
[torch.FloatTensor of size 5x3]



In [11]:
# NumPy-style indexing works on tensors: every row, column index 1.
second_column = x[:, 1]
print(second_column)


 0.8415
 0.8568
 0.9454
 0.3818
 0.1732
[torch.FloatTensor of size 5]



In [12]:
# Reshape a tensor with .view(): same elements, different shape.
x = torch.randn(4, 4)
# Flatten to 16 elements; with -1 the missing dimension is inferred (16 / 8 = 2).
y, z = x.view(16), x.view(-1, 8)
print(x.size(), y.size(), z.size())

torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


In [13]:
# Converting a tensor to a NumPy array -- step 1: a 1-D tensor of five ones.
n_items = 5
a = torch.ones(n_items)
print(a)


 1
 1
 1
 1
 1
[torch.FloatTensor of size 5]



In [14]:
# Step 2: .numpy() exposes the tensor `a` as a NumPy array `b`.
# The following cell's output shows that `b` follows in-place changes made
# to `a`, i.e. the two objects refer to the same underlying values.
b = a.numpy()
print(b)

[ 1.  1.  1.  1.  1.]


In [15]:
# Add a scalar to the tensor in place, then print both the tensor and the
# NumPy array obtained from it earlier: the array shows the new values too,
# even though only `a` was modified.
a.add_(1)
for shared in (a, b):
    print(shared)


 2
 2
 2
 2
 2
[torch.FloatTensor of size 5]

[ 2.  2.  2.  2.  2.]


In [None]:
# all tensors on CPU support converting to Numpy & back

In [17]:
# Tensors can be moved onto the GPU using the .cuda() method.
#
# Run the GPU branch only if CUDA is available.
if torch.cuda.is_available():
    # Use separate names so the CPU tensors `x` and `y` are not rebound.
    x_gpu = x.cuda()
    # When the cells run in order, `y` is the flattened 16-element view of the
    # 4x4 `x`, and those shapes do not broadcast. Give `y` the shape of `x`
    # before adding.
    y_gpu = y.cuda().view_as(x_gpu)
    # Print the sum: a bare expression inside an `if` block is not displayed.
    print(x_gpu + y_gpu)
else:
    print("not available")

not available


In [None]:
## computing gradients with autograd.Variable


In [18]:
import torch
from torch.autograd import Variable

In [19]:
# Create a Variable: wrap a 2x2 tensor of ones and ask autograd to track
# gradients for it (requires_grad=True).
ones_2x2 = torch.ones(2, 2)
x = Variable(ones_2x2, requires_grad=True)
print(x)

Variable containing:
 1  1
 1  1
[torch.FloatTensor of size 2x2]



In [20]:
# Apply an operation to the Variable: add a constant to every element.
offset = 2
y = x + offset
print(y)

Variable containing:
 3  3
 3  3
[torch.FloatTensor of size 2x2]



In [21]:
# y was produced by an operation, so it carries a grad_fn: a reference to the
# backward function of the operation that created it (an addition here).
creator = y.grad_fn
print(creator)

<AddBackward0 object at 0x1110f0950>


In [22]:
# More operations on y: z = 3 * y^2 element-wise, then reduce z to a single
# value with mean().
y_squared = y * y
z = y_squared * 3
out = z.mean()

print(z, out)

Variable containing:
 27  27
 27  27
[torch.FloatTensor of size 2x2]
 Variable containing:
 27
[torch.FloatTensor of size 1]



In [23]:
# This single call is enough to run backprop: autograd recorded every
# operation that led from x to `out`, and backward() walks that record in
# reverse, storing d(out)/dx in x.grad (printed in the next cell).

# out.backward() is equivalent to out.backward(torch.Tensor([1.0])): the
# argument is the gradient backprop starts from. `out` is a single value
# (the mean of z), so the starting gradient d(out)/d(out) = 1 is implied.

out.backward()

In [26]:
# Print the gradients d(out)/dx.
#
# backward() propagated all the way back to x. The value can be checked by
# hand: out = (1/4) * sum(3 * (x_i + 2)^2), so
# d(out)/dx_i = (3/2) * (x_i + 2) = 4.5 when x_i = 1.
dout_dx = x.grad
print(dout_dx)

Variable containing:
 4.5000  4.5000
 4.5000  4.5000
[torch.FloatTensor of size 2x2]



In [27]:
# autograd also handles data-dependent control flow: the number of doublings
# below depends on the random values in x, yet the graph is still recorded.
x = Variable(torch.randn(3), requires_grad=True)

# Keep doubling until the vector's norm reaches at least 1000.
y = x * 2
while y.data.norm() < 1000:
    y = y * 2

print(y)



Variable containing:
 586.2022
-835.8894
 623.2039
[torch.FloatTensor of size 3]



In [28]:
# y is a vector, not a single value, so backward() needs one weight per
# element of y. x.grad is then each weight times dy_i/dx_i. In the saved
# output that factor is 1024 = 2**10, the total doubling applied to x.
upstream = [0.1, 1.0, 0.0001]
gradients = torch.FloatTensor(upstream)
y.backward(gradients)

print(x.grad)

Variable containing:
  102.4000
 1024.0000
    0.1024
[torch.FloatTensor of size 3]



## stepping through Siraj's example here:

based off the video: https://www.youtube.com/watch?v=nbJ-2G2GXL0

code located here: https://github.com/llSourcell/pytorch_in_5_minutes/blob/master/demo.py

In [3]:
# Code in file autograd/two_layer_net_autograd.py
#
# Trains a two-layer network (linear -> ReLU -> linear) on random data with
# plain gradient descent, letting autograd compute the gradients.
# NOTE: this cell rebinds the notebook-level names `x` and `y`.
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables.
    # clamp(min=0) is the ReLU non-linearity. We do not need to keep references
    # to intermediate values since we are not implementing the backward pass
    # by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Loss: sum of squared errors. pow(2) squares every element of the error
    # and sum() adds the squares up into a single value.
    loss = (y_pred - y).pow(2).sum()

    # loss.item() extracts that value as a Python float. The older spelling
    # loss.data[0] indexes a 0-dim tensor and raises IndexError on current
    # PyTorch releases (item() requires PyTorch >= 0.4).
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Variables with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss with
    # respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent. Going through .data keeps the
    # update itself out of the autograd graph.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients: backward() accumulates into .grad, so
    # without this the next iteration would add to the old gradients.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 26776602.0
1 21948224.0
2 21937342.0
3 23310238.0
4 23591656.0
5 21099372.0
6 16137400.0
7 10619088.0
8 6342823.0
9 3672775.5
10 2203835.5
11 1426060.625
12 1007904.375
13 768706.875
14 619983.1875
15 518579.4375
16 443748.4375
17 385224.15625
18 337521.34375
19 297731.5
20 263979.46875
21 234988.203125
22 209924.265625
23 188118.453125
24 169089.78125
25 152380.65625
26 137657.84375
27 124625.132812
28 113053.5625
29 102750.523438
30 93557.015625
31 85327.9375
32 77950.5625
33 71321.8671875
34 65356.4648438
35 59977.5507812
36 55116.34375
37 50717.3554688
38 46721.9765625
39 43088.3789062
40 39779.8046875
41 36763.5703125
42 34008.4179688
43 31488.9882812
44 29183.4472656
45 27068.1621094
46 25127.0820312
47 23344.2011719
48 21703.6523438
49 20194.3847656
50 18803.2460938
51 17520.0898438
52 16334.3710938
53 15238.828125
54 14225.7900391
55 13287.9345703
56 12418.875
57 11613.4863281
58 10866.3476562
59 10172.5673828
60 9527.66699219
61 8928.09472656
62 8370.25292969
63 7850.9956054

451 0.000550157856196
452 0.000535813742317
453 0.00052302289987
454 0.000509206729475
455 0.000496660650242
456 0.000484028540086
457 0.000472314655781
458 0.000461605493911
459 0.000451286061434
460 0.000440312811406
461 0.000428983563324
462 0.000419259158662
463 0.000410815322539
464 0.000400721270125
465 0.000390926899854
466 0.000382439146051
467 0.000373218354071
468 0.000365353538655
469 0.000357971322956
470 0.000350604212144
471 0.000341815670254
472 0.000335044867825
473 0.000328157533659
474 0.000320628460031
475 0.0003136768064
476 0.000307299080305
477 0.000299989798805
478 0.00029420608189
479 0.000288831419311
480 0.000283049943391
481 0.000277026556432
482 0.000270528544206
483 0.000265860027866
484 0.0002605637128
485 0.000255017483141
486 0.00025089745759
487 0.000245669594733
488 0.000239930857788
489 0.000235977640841
490 0.000231697908021
491 0.000226728356211
492 0.000222518588998
493 0.000218507106183
494 0.000215204505366
495 0.000210230529774
496 0.00020632005

In [4]:
# x was rebound by the two-layer-net cell: it is now the N x D_in (64x1000)
# random input batch, not the small tensors from the earlier sections.
print(x)

Variable containing:
-4.8052e-02 -4.0543e-02  4.4600e-01  ...  -1.7186e+00  1.9580e-01  2.6152e-01
-7.3047e-01 -3.8490e-01  1.7806e+00  ...  -2.1728e+00  3.4729e-01 -1.2514e-01
-4.9727e-01 -7.2018e-01 -1.4319e+00  ...   8.4825e-01 -8.2667e-02 -8.0257e-04
                ...                   ⋱                   ...                
-1.1457e+00  8.7169e-01 -1.9630e+00  ...   7.2256e-02 -2.4460e-01 -5.8085e-01
-1.2080e+00 -8.3331e-01  8.0788e-01  ...  -2.3028e-01 -8.0669e-02  1.8830e+00
 1.3251e-01 -8.6802e-02  3.2665e-01  ...   2.4901e+00  1.1503e+00  2.8159e-01
[torch.FloatTensor of size 64x1000]



## exploring Computation Graphs and Automatic Differentiation

In [12]:
# Variables wrap tensor objects; the wrapped tensor is available as .data.
x_values = torch.Tensor([1., 2., 3])
x = autograd.Variable(x_values, requires_grad=True)
print(x.data)

# Every operation available on tensors also works on Variables.
y_values = torch.Tensor([4., 5., 6])
y = autograd.Variable(y_values, requires_grad=True)
z = x + y
print(z.data)

# BUT z knows something extra: grad_fn records the operation that created it.
# (x was created directly rather than by an operation, so x.grad_fn is None.)
print(z.grad_fn)


 1
 2
 3
[torch.FloatTensor of size 3]


 5
 7
 9
[torch.FloatTensor of size 3]

<AddBackward1 object at 0x1153ce450>


In [13]:
# so, Variable objects know what created them... but how does that help us compute a gradient?

In [14]:
# Sum all the entries of z into a single value. s also records how it was
# made, so print the value and its grad_fn on separate lines.
s = z.sum()
print(s, s.grad_fn, sep="\n")

Variable containing:
 21
[torch.FloatTensor of size 1]

<SumBackward0 object at 0x1153ba9d0>


So now, what is the derivative of this sum with respect to the first component of x? In math, we want

$$\frac{\partial s}{\partial x_0}$$

Well, s knows that it was created as a sum of the tensor z. z knows that it was the sum x + y. So

$$s = \overbrace{x_0 + y_0}^{z_0} + \overbrace{x_1 + y_1}^{z_1} + \overbrace{x_2 + y_2}^{z_2}$$

And so s contains enough information to determine that the derivative we want is 1!

In [17]:
# Calling .backward() on any Variable runs backprop starting from it.
s.backward()

# x and y receive gradients; s was produced by an operation, and the saved
# output shows its .grad as None.
# The expected ds/dx_i is 1. The saved output shows 3, presumably because the
# cell had been run three times: gradients accumulate in .grad across
# backward() calls.
for node in (x, y, s):
    print(node.grad)

Variable containing:
 3
 3
 3
[torch.FloatTensor of size 3]

Variable containing:
 3
 3
 3
[torch.FloatTensor of size 3]

None


In [18]:
# Why the gradient accumulates: each backward() call ADDS its result to .grad
# instead of overwriting it.

# Let's have PyTorch compute the gradient, and see that we were right.
# (Note: if you run the backward() cell above multiple times, the gradient will increment.
# That is because PyTorch accumulates the gradient into the .grad property,
# since for many models this is very convenient.)

Understanding what is going on in the next cell is extremely important.

In [19]:
# Plain tensors: z holds the sum, but nothing records how it was computed,
# so backprop through it would not be possible.
x, y = torch.randn((2, 2)), torch.randn((2, 2))
z = x + y

# The same addition on Variables is recorded: var_z carries a grad_fn and
# therefore enough information to compute gradients, as we saw above.
var_x = autograd.Variable(x, requires_grad=True)
var_y = autograd.Variable(y, requires_grad=True)
var_z = var_x + var_y
print(var_z.grad_fn)

# Pull the wrapped tensor out of var_z and re-wrap it in a brand new Variable.
var_z_data = var_z.data
new_var_z = autograd.Variable(var_z_data)

# Can new_var_z backprop to x and y? NO: its grad_fn is None.
# var_z.data is only the raw tensor. It knows nothing about how it was
# computed, and it is all that new_var_z receives, so new_var_z cannot know
# either. In essence, the Variable has been cut off from its history.
print(new_var_z.grad_fn)

<AddBackward1 object at 0x1153bae50>
None


If you want the error from your loss function to backpropagate to a component of your network, you MUST NOT break the Variable chain from that component to your loss Variable. If you do, the loss will have no idea your component exists, and its parameters can’t be updated.