In [1]:
import torch

## 2.1 Data Manipulation

In [2]:
# Build a 1-D float32 tensor from `torch.arange(12)` and display it.
x = torch.arange(12, dtype = torch.float32)
x

In [3]:
x.shape

In [4]:
x.numel()

In [5]:
# Rearrange the elements of `x` into 3 rows and 4 columns and display the result.
X = x.reshape(3, 4)
X

In [6]:
torch.randn(3,4)

In [7]:
torch.tensor([[2, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]])

In [8]:
# Two 4-element tensors; the cell displays a tuple with the results of
# +, -, *, / and ** applied to them.
x = torch.tensor([1.0, 2, 4, 8])
y = torch.tensor([2, 2, 2, 2])
x + y, x - y, x * y, x / y, x ** y

In [9]:
torch.exp(x)

In [10]:
# Two 3x4 tensors that are concatenated along each of their two axes.
X = torch.arange(12, dtype = torch.float32).reshape((3, 4))
Y = torch.tensor([[2.0, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]])
torch.cat((X, Y), dim = 0), torch.cat((X, Y), dim = 1) # dim = 0 stacks vertically (along rows), dim = 1 stacks horizontally (along columns)

In [11]:
X == Y

In [12]:
X.sum()

In [13]:
# A 3x1 tensor and a 1x2 tensor: their shapes differ, which is what the
# following `a + b` cell relies on to demonstrate broadcasting.
a = torch.arange(3).reshape((3, 1))
b = torch.arange(2).reshape((1, 2))
a, b

In [14]:
a + b

In [15]:
X

In [16]:
X[-1], X[1:3]

In [17]:
X[1, 2] = 9
X

In [18]:
X[0:2, :] = 12
X

In [19]:
# Remember which object `Y` names, rebind `Y` to the result of `Y + X`,
# then display whether `Y` still names that same object.
before = id(Y)
Y = Y + X
id(Y) == before

In [20]:
# Allocate `Z` with the layout of `Y`, then write `X + Y` into it through
# slice assignment. The id is printed before and after so the two values
# can be compared.
Z = torch.zeros_like(Y)
print('id(Z):', id(Z))
Z[:] = X + Y
print('id(Z):', id(Z))

In [21]:
# Identity check for the augmented assignment `X += Y`: displays whether `X`
# names the same object before and after the update.
before = id(X)
X += Y
id(X) == before

In [22]:
# Convert the tensor to a NumPy ndarray and back again; the cell displays
# the types of both results.
A = X.numpy()
B = torch.from_numpy(A)
type(A), type(B)

In [23]:
# Convert a size-1 tensor to Python scalars via `item()`, `float()` and `int()`.
a = torch.tensor([3.5])
a, a.item(), float(a), int(a)

## 2.2 Data Preprocessing

In [24]:
import os

# Make sure the ../data directory exists (no error if it already does) and
# build the path of the CSV file that the following cell writes.
os.makedirs(os.path.join('..', 'data'), exist_ok = True)
data_file = os.path.join('..', 'data', 'house_tiny.csv')

In [25]:
# Write the tiny housing table in one call: a header line followed by one
# line per data example.
with open(data_file, 'w') as f:
    f.writelines((
        'NumRooms,Alley,Price\n', # Column names
        'NA,Pave,127500\n', # Each row represents a data example
        '2,NA,106000\n',
        '4,NA,178100\n',
        'NA,NA,140000\n',
    ))

In [26]:
import pandas as pd

# Load the CSV at `data_file` into a DataFrame and print it.
data = pd.read_csv(data_file)
print(data)

In [27]:
# Split the table by position: the first two columns are the inputs, the
# third column is the target.
inputs, outputs = data.iloc[:, 0:2], data.iloc[:, 2]
# Fill missing numeric entries with their column mean. `numeric_only=True` is
# needed because `Alley` holds strings: pandas >= 2.0 raises a TypeError when
# asked to average a non-numeric column instead of silently skipping it.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
print(inputs)

In [28]:
# Replace categorical columns with indicator columns; `dummy_na = True`
# requests an additional indicator column for missing values.
inputs = pd.get_dummies(inputs, dummy_na = True)
print(inputs)

## 2.3 Linear Algebra

In [29]:
import torch

# Convert the preprocessed tables to tensors. `to_numpy(dtype=float)` forces a
# single numeric dtype for the inputs: with pandas >= 2.0 `get_dummies` yields
# boolean indicator columns, so `inputs.values` would be an object array,
# which `torch.tensor` cannot convert.
X, y = torch.tensor(inputs.to_numpy(dtype=float)), torch.tensor(outputs.values)
X, y

In [30]:
import torch

# Two single-value tensors and the results of +, *, / and ** on them.
x = torch.tensor(3.0)
y = torch.tensor(2.0)

x + y, x * y, x / y, x ** y

In [31]:
x = torch.arange(4)
x

In [32]:
x[3]

In [33]:
len(x)

In [34]:
x.shape

In [35]:
# Dimension: the dimensionality of a vector or of an axis refers to its length, i.e., the number of elements of that vector or along that axis.
# In this sense, the dimensionality of some axis of a tensor is the length of that axis.

In [36]:
# Reshape the 20 values from `torch.arange(20)` into a matrix.
A = torch.arange(20).reshape(5, 4) # 5 rows, 4 columns
A

In [37]:
A.T

In [38]:
B = torch.tensor([[1, 2, 3], [2, 0, 4], [3, 4, 5]])
B

In [39]:
B == B.T

In [40]:
X = torch.arange(24).reshape(2, 3, 4)
X

In [41]:
# Float version of the 5x4 matrix, a clone of it, and their elementwise sum.
A = torch.arange(20, dtype = torch.float32).reshape(5, 4)
B = A.clone() # Assign a copy of `A` to `B` by allocating new memory
A, A+B

In [42]:
A * B # Hadamard product

In [43]:
# Combine a Python scalar with a 2x3x4 tensor: the cell displays `a + X`
# and the shape of `a * X`.
a = 2
X = torch.arange(24).reshape(2, 3, 4)
a + X, (a * X).shape

In [44]:
x = torch.arange(4, dtype = torch.float32)
x, x.sum()

In [45]:
A.shape, A.sum()

In [46]:
# To reduce the row dimension (axis 0) by summing the elements of all rows,
# pass `axis = 0` to `sum`. The cell displays the result and its shape.
A_sum_axis0 = A.sum(axis = 0)
A_sum_axis0, A_sum_axis0.shape

In [47]:
A_sum_axis1 = A.sum(axis = 1)
A_sum_axis1, A_sum_axis1.shape

In [48]:
A.sum(axis=[0, 1]) # Same as `A.sum()`

In [49]:
A.mean(), A.sum() / A.numel() # numel : total number of elements

In [50]:
A.mean(axis = 0), A.sum(axis = 0) / A.shape[0]

In [51]:
# Sum along axis 1 with `keepdims = True`; the result is then used as the
# divisor in the following `A / sum_A` cell.
sum_A = A.sum(axis = 1, keepdims = True)
sum_A

In [52]:
A / sum_A

In [53]:
A.cumsum(axis = 0)

In [54]:
# Dot product of `x` with a vector of four ones; both operands are shown
# alongside the result.
y = torch.ones(4, dtype = torch.float32)
x, y, torch.dot(x, y)

In [55]:
torch.sum(x * y)

In [56]:
A.shape, x.shape, torch.mv(A, x) # To express matrix-vector products in code with tensors, we use the `mv` function.
# Note that the column dimension of A (its length along axis 1) must be the same as the dimension of x (its length).

In [57]:
# Multiply `A` by a matrix of ones created with shape (4, 3).
B = torch.ones(4, 3)
torch.mm(A, B)
# `mm` expresses matrix-matrix products in code with tensors.

In [58]:
# Informally, the norm of a vector tells us how big a vector is.
# The notion of size under consideration here concerns not dimensionality but rather the magnitude of the components.
# In fact, the Euclidean distance is a norm: specifically, it is the L2 norm.
# The L2 norm of x is the square root of the sum of the squares of the vector elements.

In [59]:
u = torch.tensor([3.0, -4.0])
torch.norm(u)

In [60]:
# You will also frequently encounter the L1 norm, which is expressed as the sum of the absolute values of the vector elements:
# As compared with the L2 norm, it is less influenced by outliers. To calculate the L1 norm, we compose the absolute value function with a sum over the elements

In [61]:
torch.abs(u).sum()

In [62]:
torch.norm(torch.ones((4, 9)))

## 2.4 Calculus

In [63]:
%matplotlib inline
import numpy as np
from IPython import display

In [64]:
!pip install d2l
from d2l import torch as d2l

In [65]:
def f(x):
    """Evaluate the quadratic 3*x**2 - 4*x at `x`."""
    squared_term = 3 * x ** 2
    linear_term = 4 * x
    return squared_term - linear_term

In [66]:
def numerical_lim(f, x, h):
    """Return the forward difference quotient (f(x + h) - f(x)) / h.

    `f` is called twice, once at `x + h` and once at `x`.
    """
    rise = f(x+h) - f(x)
    return rise / h

# Print the difference quotient of `f` at x = 1 for five step sizes,
# shrinking `h` by a factor of 0.1 after each line.
h = 0.1
for i in range(5):
    print(f'h={h:.5f}, numerical limit = {numerical_lim(f, 1, h):.5f}')
    h *= 0.1

The following cells define helper functions for configuring figures.

In [67]:
def use_svg_display(): #@save
    """Ask IPython to render matplotlib figures in SVG format."""
    # NOTE(review): `IPython.display.set_matplotlib_formats` is documented as
    # deprecated since IPython 7.23 in favour of
    # `matplotlib_inline.backend_inline.set_matplotlib_formats` -- confirm
    # against the installed IPython before relying on it.
    figure_format = 'svg'
    display.set_matplotlib_formats(figure_format)

In [68]:
def set_figsize(figsize=(3.5, 2.5)): #@save
    """Switch to SVG output and store `figsize` as matplotlib's default figure size.

    `figsize` is written unchanged to `rcParams['figure.figsize']`.
    """
    use_svg_display()
    rc_params = d2l.plt.rcParams
    rc_params['figure.figsize'] = figsize

In [69]:
#@save
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Apply labels, scales, limits, an optional legend and a grid to `axes`.

    The setters are invoked in a fixed order (labels, then scales, then
    limits). `axes.legend` is only called when `legend` is truthy, and
    `axes.grid()` is always called last.
    """
    ordered_settings = (
        ('set_xlabel', xlabel),
        ('set_ylabel', ylabel),
        ('set_xscale', xscale),
        ('set_yscale', yscale),
        ('set_xlim', xlim),
        ('set_ylim', ylim),
    )
    for setter_name, value in ordered_settings:
        getattr(axes, setter_name)(value)
    if legend:
        axes.legend(legend)
    axes.grid()

In [70]:
#@save
def plot(X, Y = None, xlabel=None, ylabel = None, legend = None, xlim = None, ylim = None, xscale = 'linear', yscale = 'linear', fmts = ('-', 'm--', 'g-.', 'r:'),
        figsize = (3.5, 2.5), axes = None):
    """Plot one or more curves on a single set of axes.

    `X` and `Y` may each be a single 1-axis sequence or a collection of them.
    When `Y` is omitted, `X` is treated as the y-values and each curve is
    plotted without explicit x-values. When the number of x-sequences differs
    from the number of y-sequences, `X` is repeated `len(Y)` times. Curves are
    paired with `fmts` through `zip`, so no more curves are drawn than there
    are format strings. The axes are cleared before drawing and decorated by
    `set_axes` afterwards.
    """
    legend = [] if legend is None else legend

    set_figsize(figsize)
    if not axes:
        # Fall back to the current matplotlib axes.
        axes = d2l.plt.gca()

    def has_one_axis(data):
        """Tell whether `data` is a 1-axis array or a list of non-sequences."""
        if hasattr(data, "ndim") and data.ndim == 1:
            return True
        return isinstance(data, list) and not hasattr(data[0], "__len__")

    # Normalise both arguments to "collection of curves" form.
    xs = [X] if has_one_axis(X) else X
    if Y is None:
        # Only y-values were supplied: pair each with an empty x-sequence.
        xs, ys = [[]] * len(xs), xs
    elif has_one_axis(Y):
        ys = [Y]
    else:
        ys = Y
    if len(xs) != len(ys):
        xs = xs * len(ys)

    axes.cla()
    for x_vals, y_vals, fmt in zip(xs, ys, fmts):
        if len(x_vals):
            axes.plot(x_vals, y_vals, fmt)
        else:
            axes.plot(y_vals, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)

In [71]:
# Sample the interval [0, 3) in steps of 0.1 and plot f together with the line
# 2x - 3, labelled as its tangent at x = 1. The samples must be bound to
# lowercase `x`, because that is the name the `plot` call reads; binding them
# to `X` left `x` pointing at an unrelated tensor from an earlier cell.
x = np.arange(0, 3, 0.1)
plot(x, [f(x), 2 * x - 3], 'x', 'f(x)', legend = ['f(x)', 'Tangent line (x=1)'])

## 2.5 Automatic Differentiation
Differentiation is a crucial step in nearly all deep learning optimization algorithms.
Automatic differentiation enables the system to subsequently backpropagate gradients.
Here, backpropagate simply means to trace through the computational graph, filling in the partial derivatives with respect to each parameter.

In [72]:
import torch

x = torch.arange(4.0)
x

In [73]:
x.requires_grad_(True) # Same as `x = torch.arange(4.0, requires_grad = True)`
x.grad # the default value is None

In [74]:
y = 2 * torch.dot(x, x)
y

In [75]:
y.backward()
x.grad

In [76]:
x.grad == 4 * x

In [77]:
# PyTorch accumulates gradients by default, so the previous values are
# cleared before running backward on a new function of `x`.
x.grad.zero_()
y = x.sum()
y.backward()
x.grad

In [78]:
# backward for non-scalar variables
# Technically, when y is not a scalar, the most natural interpretation of the differentiation of a vector y with respect to a vector x is a matrix.
# For higher-order and higher-dimensional y and x, the differentiation result could be a high-order tensor.
# More often, when we call backward on a vector, we are trying to calculate the derivatives of the loss functions for each constituent of a batch of training examples.
# Here, our intent is not to calculate the differentiation matrix but rather the sum of the partial derivatives computed individually for each example in the batch.

In [79]:
# Invoking `backward` on a non-scalar requires passing in a `gradient` argument, which specifies the gradient of the differentiated function w.r.t. `self`.
# In our case, we simply want to sum the partial derivatives, so passing in a gradient of ones is appropriate.
x.grad.zero_()
y = x * x
# `y.backward(torch.ones(len(x)))` is equivalent to the line below.
y.sum().backward()
x.grad

In [80]:
# For example, say that y was calculated as a function of x, and that subsequently z was calculated as a function of both y and x.
# Now, imagine that we wanted to calculate the gradient of z with respect to x, but wanted for some reason to treat y as a constant, and only take into account the role that x played after y was calculated.
# Here, we can detach y to return a new variable u that has the same value as y but discards any information about how y was computed in the computational graph.
# In other words, the gradient will not flow backwards through u to x.
# Thus, the following backpropagation function computes the partial derivatives of z = u * x with respect to x while treating u as a constant,
# instead of the partial derivative of z = x * x * x with respect to x.

In [81]:
# Clear the accumulated gradient before the next backward pass.
x.grad.zero_()
y = x * x
# `u` is a detached version of `y`; `z` is then built from `u` and `x`.
u = y.detach()
z = u * x

# Backpropagate through `z` and compare the gradient on `x` with `u`.
z.sum().backward()
x.grad == u

In [82]:
# Backpropagate through the non-detached `y = x * x` from the previous cell
# and compare the gradient on `x` with 2 * x.
x.grad.zero_()
y.sum().backward()
x.grad == 2 * x

In [83]:
def f(a):
    """Scale `a` through data-dependent Python control flow.

    `a` is doubled repeatedly until the norm of the result is at least 1000.
    That result is returned unchanged when its sum is positive and multiplied
    by 100 otherwise. This definition replaces the earlier quadratic `f`.
    """
    b = a * 2
    while b.norm() < 1000:
        b = b * 2
    return b if b.sum() > 0 else 100 * b

In [84]:
# Draw a random scalar that tracks gradients, push it through `f`, and
# backpropagate from the result.
a = torch.randn(size = (), requires_grad = True)
d = f(a)
d.backward()

In [85]:
a.grad == d / a

## 2.6 Probability

In [86]:
# machine learning is all about making predictions

In [87]:
!pip install d2l
%matplotlib inline
import torch
from torch.distributions import multinomial
from d2l import torch as d2l

In [88]:
# Six equal probabilities, then a single multinomial draw with a total count of 1.
fair_probs = torch.ones([6]) / 6  # 1/6
multinomial.Multinomial(1, fair_probs).sample()

In [89]:
multinomial.Multinomial(10, fair_probs).sample()

In [90]:
# Store the results as 32-bit floats for division
# One multinomial draw with a total count of 1000, divided by that total.
counts = multinomial.Multinomial(1000, fair_probs).sample()
counts / 1000 # Relative frequency as the estimate

In [91]:
# Draw 500 samples with a total count of 10 each, accumulate the counts over
# the samples, and divide each row by its total to get running estimates.
counts = multinomial.Multinomial(10, fair_probs).sample((500,))
cum_counts = counts.cumsum(dim=0)
estimates = cum_counts / cum_counts.sum(dim=1, keepdims=True)
d2l.set_figsize((6, 4.5))
# One curve per outcome, labelled P(die=1) through P(die=6).
for i in range(6):
    d2l.plt.plot(estimates[:, i].numpy(), label=("P(die=" + str(i + 1) + ")"))
# Dashed horizontal reference line at 0.167 (approximately 1/6).
d2l.plt.axhline(y=0.167, color='black', linestyle='dashed')
d2l.plt.gca().set_xlabel('Groups of experiments')
d2l.plt.gca().set_ylabel('Estimated probability')
d2l.plt.legend();

## 2.7 Documentation

In [92]:
import torch

print(dir(torch.distributions))

In [93]:
help(torch.ones)