In [1]:
import numpy as np
class Variable:
    def __init__(self, data):
        self.data = data
        self.grad = None
        self.creator = None
        
    def set_creator(self, func):
        self.creator = func
        
    def backward(self):    
        f = self.creator
        if f is not None:
            x = f.input
            x.grad = f.backward(self.grad)
            x.backward()

# Variable 및 편의성 

In [79]:
class Variable:
    __array_priority = 200
    def __init__(self, data, name=None):
        if data is not None:
            if not isinstance(data, np.ndarray):
                raise TypeError('{}은 지원하지 않습니다.'.format(type(data)))
                
                
        self.data = data
        self.name = name
        self.grad = None
        self.creator = None
        self.generation = 0
        
    def set_creator(self, func):
        self.creator = func
        self.generation = func.generation + 1
        
    def backward(self, retain_grad=False, create_graph=False):
        
        if self.grad is None:
            self.grad = Variable(np.ones_like(self.data))
        
        funcs = []
        seen_set = set()
        
        def add_func(f):
            if f not in seen_set:
                funcs.append(f)
                seen_set.add(f)
                funcs.sort(key=lambda x: x.generation)
        
        add_func(self.creator)
        
        while funcs:
            f = funcs.pop()
            gys = [output().grad for output in f.outputs]
            gxs = f.backward(*gys)
            
            with using_config('enable_backprop', create_graph):
                gxs = f.backward(*gys)
                if not isinstance(gxs, tuple):
                    gxs = (gxs,)

                for x,gx in zip(f.inputs, gxs):
                    if x.grad is None:
                        x.grad = gx
                    else:
                        x.grad = x.grad + gx

                    if x.creator is not None:
                        add_func(x.creator)
        
        if not retain_grad:
            for y in f.outputs:
                y().grad = None
                    
    def reshape(self,*shape):
        if len(shape) == 1 and isinstance(shape[0], (tuple,list)):
            shape = shape[0]
        return reshape(self,shape)
    
    def transpose(self):
        return transpose(self)
    
    
    def cleargrad(self):
        self.grad = None
        
    def unchain(self):
        self.creator = None
        
    def unchain_backward(self):
        if self.creator is not None:
            funcs = [self.creator]
            while funcs :
                f = funcs.pop()
                for x in f.inputs:
                    if x.creator is not None:
                        funcs.append(x.creator)
                        x.unchain()
    
    @property
    def shape(self):
        return self.data.shape
    
    @property
    def ndim(self):
        return self.data.ndim
    
    @property
    def size(self):
        return self.data.size
        
    @property
    def dtype(self):
        return self.data.dtype
    
    @property
    def T(self):
        return transpose(self)
    
    def __len__(self):
        return len(self.data)
    
    def __repr__(self):
        if self.data is None:
            return 'variable(None)'
        
        p = str(self.data).replace('\n', '\n' + ' '*9)
        return 'variable('+p +')'

In [3]:
class Parameter(Variable):
    pass

In [7]:
class BroadcastTo(Function):
    def __init__(self, shape):
        self.shape = shape
        
    def forward(self,x):
        self.x_shape = x.shape
        y = np.broadcast_to(x, self.shape)
        return y
    
    def backward(self,gy):
        gx = sum_to(gy, self.x_shape)
        return gx

In [8]:
class SumTo(Function):
    def __init__(self,shape):
        self.shape = shape
        
    def forward(self,x):
        self.x_shape = x.shape
        y = utils_sum_to(x, self.shape)
        return y
    
    def backward(self, gy):
        gx = broadcast_to(gy, self.x_shape)
        return gx

In [9]:
class Reshape(Function):
    def __init__(self,shape):
        self.shape = shape
        
    def forward(self,x):
        self.x_shape = x.shape
        y = x.reshape(self.shape)
        return y
    
    def backward(self, gy):
        return reshape(gy, self.x_shape)
    
    

In [10]:
class Transpose(Function):
    def forward(self,x):
        y = np.transpose(x)
        return y
    
    def backward(self,gy):
        gx = transpose(gy)
        return gx

# 각종 연산 함수

In [11]:
import weakref

class Function:
    def __call__(self, *inputs):
        inputs = [as_variable(x) for x in inputs]
        xs = [x.data for x in inputs]
        ys = self.forward(*xs)
        if not isinstance(ys, tuple):
            ys = (ys,)
        outputs = [Variable(as_array(y)) for y in ys]
        
        if Config.enable_backprop:
            self.generation = max([x.generation for x in inputs])
            for output in outputs:
                output.set_creator(self)
            self.inputs = inputs
            self.outputs = [weakref.ref(output) for output in outputs]
        
        return outputs if len(outputs) >1 else outputs[0]
    
    def forward(self,x):
        raise NotImplementedError()
    
    def backward(self,gy):
        raise NotImplementedError()

In [71]:
class Config:
    enable_backprop = True
    train = True
    

In [13]:
class Neg(Function):
    def forward(self,x):
        return -x
    
    def backward(self,gx):
        return -gx

In [14]:
class Add(Function):
    def forward(self,x0,x1):
        self.x0_shape, self.x1_shape = x0.shape, x1.shape
        y = x0 + x1
        return y
    
    def backward(self, gy):
        gx0, gx1 = gy, gy
        if self.x0_shape != self.x1_shape:
            gx0 = sum_to(gx0, self.x0_shape)
            gx1 = sum_to(gx1, self.x1_shape)
        return gx0, gx1

In [15]:
class Sub(Function):
    def forward(self, x0, x1):
        y = x0 - x1
        return y
    
    def backward(self, gy):
        return gy, -gy

In [16]:
class Mul(Function):
    def forward(self, x0, x1):
        y = x0 * x1
        return y
    
    def backward(self,gy):
        x0, x1 = self.inputs
        return gy*x1, gy*x0

In [17]:
class Div(Function):
    def forward(Self, x0, x1):
        y = x0/ x1
        return y
    
    def backward(self, gy):
        x0, x1 = self.inputs
        gx0 = gy/x1
        gx1 = gy*(-x0/ x1 ** 2)
        return gx0, gx1

In [18]:
class Pow(Function):
    def __init__(self, c):
        self.c = c
        
    def forward(self, x):
        y = x ** self.c
        return y
    
    def backward(self, gy):
        x, = self.inputs
        c = self.c
        gx = c * x **(c-1) * gy
        return gx

In [19]:
class Square(Function):
    def forward(self,x):
        y = x ** 2
        return y
    
    def backward(self, gy):
        x, = self.inputs
        gx = 2*x*gy
        return gx

In [20]:
class Exp(Function):
    def forward(self,x):
        y = np.exp(x)
        return y
    
    def backward(self,gy):
        x, = self.inputs
        gx = np.exp(x) * gy
        return gx

In [21]:
class Sin(Function):
    def forward(self,x):
        y = np.sin(x)
        return y
        
    def backward(self,gy):
        x, = self.inputs
        gx = cos(x) * gy
        return gx

In [22]:
class Cos(Function):
    def forward(self,x):
        y = np.cos(x)
        return y
        
    def backward(self,gy):
        x, = self.inputs
        gx = -sin(x) * x
        return gx

In [23]:
class Tanh(Function):
    def forward(self,x):
        y = np.tanh(x)
        return y
    
    def backward(self,gy):
        y = self.outputs[0]()
        gx = gy *(1- y*y)
        return gx

In [24]:
class Sum(Function):
    
    def __init__(self,axis, keepdims):
        self.axis = axis
        self.keepdims = keepdims
        
        
    def forward(self,x):
        self.x_shape = x.shape
        y = x.sum(axis = self.axis, keepdims=self.keepdims)
        return y
    
    def backward(self,gy):
        gx = reshape_sum_backward(gy, self.x_shape, self.axis, self.keepdims)
        gx = broadcast_to(gy, self.x_shape)
        return gx

In [25]:
class MatMul(Function):
    def forward(self,x, W):
        y = x.dot(W)
        return y
    
    def backward(self,gy):
        x, W = self.inputs
        gx = matmul(gy, W.T)
        gW = matmul(x.T, gy)
        return gx, gW

In [26]:
class MeanSquaredError(Function):
    def forward(self, x0, x1):
        diff = x0 - x1
        y = (diff**2).sum()/len(diff)
        return y
    
    def backward(self,gy):
        x0, x1 = self.inputs
        diff = x0 - x1
        gx0 = gy * diff * (2./len(diff))
        gx1 = -gx0
        return gx0, gx1

In [27]:
class Exp(Function):
    def forward(self, x):
        y = np.exp(x)
        return y
    
    def backward(self,gy):
        gx = gy * y
        return gx

In [28]:
class GetItem(Function):
    def __init__(self, slices):
        self.slices = slices

    def forward(self, x):
        y = x[self.slices]
        return y

    def backward(self, gy):
        x, = self.inputs
        f = GetItemGrad(self.slices, x.shape)
        return f(gy)


class GetItemGrad(Function):
    def __init__(self, slices, in_shape):
        self.slices = slices
        self.in_shape = in_shape

    def forward(self, gy):
        xp = dezero.cuda.get_array_module(gy)
        gx = xp.zeros(self.in_shape, dtype=gy.dtype)

        if xp is np:
            np.add.at(gx, self.slices, gy)
        else:
            xp.scatter_add(gx, self.slices, gy)
        return gx

    def backward(self, ggx):
        return get_item(ggx, self.slices)


def get_item(x, slices):
    f = GetItem(slices)
    return f(x)

In [29]:
def neg(x):
    return Neg()(x)

def square(x):
    return Square()(x)

def exp(x):
    return Exp()(x)

def add(x0, x1):
    x1 = as_array(x1)
    return Add()(x0, x1)

def mul(x0, x1):
    x1 = as_array(x1)
    return Mul()(x0, x1)

def sub(x0,x1):
    x1 = as_array(x1)
    return Sub()(x0, x1)

def rsub(x0,x1):
    x1 = as_array(x1)
    return Sub()(x1,x0)

def div(x0, x1):
    x1 = as_array(x1)
    return Div()(x0,x1)

def rdiv(x0, x1):
    x1 = as_array(x1)
    return Div()(x1,x0)

def pow(x, c):
    return Pow(c)(x)

def sin(x):
    return Sin()(x)

def cos(x):
    return Cos()(x)

def tanh(x):
    return Tanh()(x)

def reshape(x,shape):
    if x.shape ==shape:
        return as_variable(x)
    return Reshape(shape)(x)

def transpose(x):
    return Transpose()(x)

def sum(x, axis=None, keepdims=False):
    return Sum(axis, keepdims)(x)

def broadcast_to(x, shape):
    if x.shape ==shape:
        return as_variable(x)
    return BroadcastTo(shape)(x)

def sum_to(x,shape):
    if x.shape==shape:
        return as_variable(x)
    return SumTo(shape)(x)

def matmul(x,W):
    return MatMul()(x,W)

def mean_squared_error(x0, x1):
    return MeanSquaredError()(x0,x1)

def exp(x):
    return Exp()(x)

def sigmoid(x):
    return Sigmoid()(x)

def linear(x,W,b=None):
    return Linear1()(x,W,b)

# 신경망 함수

In [30]:
class Layer:
    def __init__(self):
        self._params = set()
        
    def __setattr__(self,name, value):
        if isinstance(value, (Parameter, Layer)):
            self._params.add(name)
        super().__setattr__(name,value)
        
    def __call__(self, *inputs):
        outputs = self.forward(*inputs)
        if not isinstance(outputs, tuple):
            outputs = (outputs,)
        
        self.inputs = [weakref.ref(x) for x in inputs]
        self.outputs = [weakref.ref(y) for y in outputs]
        return output if len(outputs) > 1 else outputs[0]
    
    def forward(self, inputs):
        raise NotImplementedError()
    
    def params(self):
        for name in self._params:
            yield self.__dict__[name]
            
            if isinstance(obj, Layer) or isinstance(obj, Linear):
                yield from obj.params()
                
            else:
                yield obj
            
    def cleargrads(self):
        for param in self.params():
            param.cleargrad()

In [31]:
class Linear1(Function):
    def forward(self, x, W, b):
        y = x.dot(W)
        if b is not None:
            y += b
        return y

    def backward(self, gy):
        x, W, b = self.inputs
        gb = None if b.data is None else sum_to(gy, b.shape)
        gx = matmul(gy, W.T)
        gW = matmul(x.T, gy)
        return gx, gW, gb

In [32]:
class Linear(Layer):
    def __init__(self, out_size, nobias=False, dtype = np.float32, in_size=None):
        super().__init__()
        self.in_size = in_size
        self.out_size = out_size
        self.dtype = dtype
        
        self.W = Parameter(None, name='W')
        if self.in_size is not None:
            self._init_W()
        
        if nobias:
            self.b = None
        else:
            self.b = Parameter(np.zeros(out_size, dtype=dtype), name='b')
        
    def _init_W(self):
        I, K = self.in_size, self.out_size
        W_data = np.random.randn(I,K).astype(self.dtype) * np.sqrt(1/I)
        self.W.data = W_data
        
    def forward(self,x):
        if self.W.data is None:
            self.in_size = x.shape[1]
            self._init_W()
        
        y = linear(x, self.W, self.b)
        return y

In [33]:
class MLP(Layer):
    def __init__(self, fc_output_sizes, activation=sigmoid):
        super().__init__()
        self.activation = activation
        self.layers = []
        
        for i, out_size in enumerate(fc_output_sizes):
            layer = Linear(out_size)
            setattr(self, 'l' + str(i), layer)
            self.layers.append(layer)
            
    
    def forward(self,x):
        for l in self.layers[:-1]:
            x = self.activation(l(x))
        return self.layers[-1](x)

In [34]:
class Optimizer:
    def __init__(self):
        self.target = None
        self.hooks = []
        
    def setup(self, target):
        self.target = target
        return self
    
    def update(self):
        params = [p for p in self.target.params() if p.grad is not None]
        for f in self.hooks:
            f(params)
        
        for param in params:
            self.update_one(param)
            
    def update_one(self, param):
        raise NotImplementedError()
        
    def add_hook(self, f):
        self.hooks.append(f)

In [35]:
class SGD(Optimizer):
    def __init__(self, lr=0.01):
        super().__init__()
        self.lr = lr
        
    def update_one(self, param):
        param.data -= self.lr * param.grad.data

In [36]:
class MomentumSGD(Optimizer):
    def __init__(self, lr=0.01, momentum=0.9):
        super().__init__()
        self.lr = lr
        self.momentum = momentum
        self.vs = {}
        
    def update_one(self, param):
        v_key = id(param)
        if v_key not in self.vs:
            self.vs[v_key] = np.zeros_like(param.data)
            
        v = self.vs[v_key]
        v *= self.momentum
        v -= self.lr * param.grad.data
        param.data += v

In [37]:
def softmax_simple(x, axis=1):
    x = as_variable(x)
    y = exp(x)
    sum_y = sum(y, axis=axis, keepdims=True)
    return y/sum_y

In [38]:
def softmax_cross_entropy_simple(x,t):
    x, t = as_variable(x), as_variable(t)
    N = x.shape[0]
    
    p = softmax_simple(x)
    p = clip(p, 1e-15, 1.0)
    log_p = log(p)
    tlog_p = log_p[np.arange(N), t.data]
    y = -1 * sum(tlog_p) / N
    return y

In [39]:
class Sigmoid(Function):
    def forward(self, x):
        y = 1/(1 + np.exp(-x))
        return y

    def backward(self, gy):
        y = self.outputs[0]()
        gx = gy * y * (1 - y)
        return gx

In [40]:
class DataLoader:
    def __init__(self, dataset, batch_size, shuffle=True):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.data_size = len(dataset)
        self.max_iter = math.ceil(self.data_size/batch_size)
        
        self.reset()
        
    def reset(self):
        self.iteration =0 
        if self.shuffle:
            self.index = np.random.permutation(len(self.dataset))
        else:
            self.index = np.arange(len(self.dataset))
    
    def __iter__(self):
        return self
    
    def __next__(self):
        if self.iteration >= self.max_iter:
            self.reset()
            raise StopIteration
            
        i, batch_size = self.iteration, self.batch_size
        batch_index = self.index[i * batch_size: (i+1) * batch_size]
        batch = [self.dataset[i] for i in batch_index]
        x = np.array([example[0] for example in batch])
        t = np.array([example[1] for example in batch])
        
        self.iteration +=1
        return x, t
    
    def next(self):
        return self.__next__()

In [81]:
class SeqDataLoader(DataLoader):
    def __init__(self, dataset, batch_size, gpu=False):
        super().__init__(dataset=dataset, batch_size=batch_size, shuffle=False,
                         gpu=gpu)

    def __next__(self):
        if self.iteration >= self.max_iter:
            self.reset()
            raise StopIteration

        jump = self.data_size // self.batch_size
        batch_index = [(i * jump + self.iteration) % self.data_size for i in
                       range(self.batch_size)]
        batch = [self.dataset[i] for i in batch_index]

        xp = cuda.cupy if self.gpu else np
        x = xp.array([example[0] for example in batch])
        t = xp.array([example[1] for example in batch])

        self.iteration += 1
        return x, t

In [41]:
def accuracy(y, t):
    y, t = as_variable(y), as_variable(t)
    
    pred = y.data.argmax(axis=1).reshape(t.shape)
    result = (pred == t.data)
    acc = result.mean()
    return Variable(as_array(acc))

In [42]:
class Dataset:
    def __init__(self, train=True, transform=None, target_transform=None):
        self.train = train
        self.transform = transform
        self.target_transform = target_transform
        if self.transform is None:
            self.transform = lambda x:x
        if self.target_transform is None:
            self.target_transform = lambda x:x
            
        self.data = None
        self.label = None
        self.prepare()
        
    def __getitem__(self, index):
        assert np.isscalar(index)
        if self.label is None:
            return self.transform(self.data[index]), None
        else:
            return self.transform(self.data[index]), self.target_transform(self.label[index])
        
    def __len__(self):
        return len(self.data)
    
    def prepare(self):
        pass

In [43]:
class BigData(Dataset):
    def __getitem__(index):
        x = np.load('data/{}.npy'.format(index))
        t = np.load('label/{}.npy'.format(index))
        return x, t
    
    def __len__():
        return 1000000

In [44]:
import math

def my_sin(x, threshold=0.0001):
    y = 0
    for i in range(100000):
        c = (-1)**i/math.factorial(2*i + 1)
        t = c*x **(2*i + 1)
        y = y + t
        if abs(t.data) < threshold:
            break
    return y

In [45]:
Variable.__mul__ = mul
Variable.__add__ = add
Variable.__radd__ = add
Variable.__rmul__ = mul
Variable.__neg__ = neg
Variable.__sub__ = sub
Variable.__rsub__= rsub
Variable.__truediv__ = div
Variable.__rtruediv__ = rdiv
Variable.__pow__ = pow

In [46]:
def as_array(x):
    if np.isscalar(x):
        return np.array(x)
    return x

def as_variable(obj):
    if isinstance(obj, Variable):
        return obj
    return Variable(obj)

In [72]:
import contextlib

@contextlib.contextmanager
def using_config(name, value):
    old_value = getattr(Config, name)
    setattr(Config, name, value)
    try:
        yield
    
    finally:
        setattr(Config, name, old_value)

def test_mode():
    return using_config('train', False)
        

def no_grad():
    return using_config('enable_backprop', False)

In [48]:
def reshape_sum_backward(gy, x_shape, axis, keepdims):
    """Reshape gradient appropriately for dezero.functions.sum's backward.
    Args:
        gy (dezero.Variable): Gradient variable from the output by backprop.
        x_shape (tuple): Shape used at sum function's forward.
        axis (None or int or tuple of ints): Axis used at sum function's
            forward.
        keepdims (bool): Keepdims used at sum function's forward.
    Returns:
        dezero.Variable: Gradient variable which is reshaped appropriately
    """
    ndim = len(x_shape)
    tupled_axis = axis
    if axis is None:
        tupled_axis = None
    elif not isinstance(axis, tuple):
        tupled_axis = (axis,)

    if not (ndim == 0 or tupled_axis is None or keepdims):
        actual_axis = [a if a >= 0 else a + ndim for a in tupled_axis]
        shape = list(gy.shape)
        for a in sorted(actual_axis):
            shape.insert(a, 1)
    else:
        shape = gy.shape

    gy = gy.reshape(shape)  # reshape
    return gy

In [49]:
def utils_sum_to(x, shape):
    """Sum elements along axes to output an array of a given shape.
    Args:
        x (ndarray): Input array.
        shape:
    Returns:
        ndarray: Output array of the shape.
    """
    ndim = len(shape)
    lead = x.ndim - ndim
    lead_axis = tuple(range(lead))

    axis = tuple([i + lead for i, sx in enumerate(shape) if sx == 1])
    y = x.sum(lead_axis + axis, keepdims=True)
    if lead > 0:
        y = y.squeeze(lead_axis)
    return y

In [73]:
def dropout(x, dropout_ratio = 0.5):
    x = as_variable(x)
    
    if Config.train:
        mask = np.random.rand(*x.shape)> dropout_ratio
        scale = np.array(1.0 - dropout_ratio).astype(x.dtype)
        y = x * mask/ scale
        return y
    
    else:
        return x

In [74]:
class Im2col(Function):
    def __init__(self, kernel_size, stride, pad, to_matrix):
        super().__init__()
        self.input_shape = None
        self.kernel_size = kernel_size
        self.stride = stride
        self.pad = pad
        self.to_matrix = to_matrix

    def forward(self, x):
        self.input_shape = x.shape
        y = im2col_array(x, self.kernel_size, self.stride, self.pad,
                         self.to_matrix)
        return y

    def backward(self, gy):
        gx = col2im(gy, self.input_shape, self.kernel_size, self.stride,
                    self.pad, self.to_matrix)
        return gx


def im2col(x, kernel_size, stride=1, pad=0, to_matrix=True):
    """Extract patches from an image based on the filter.
    Args:
        x (`dezero.Variable` or `ndarray`): Input variable of shape
            `(N, C, H, W)`
        kernel_size (int or (int, int)): Size of kernel.
        stride (int or (int, int)): Stride of kernel.
        pad (int or (int, int)): Spatial padding width for input arrays.
        to_matrix (bool): If True the `col` will be reshaped to 2d array whose
            shape is `(N*OH*OW, C*KH*KW)`
    Returns:
        `dezero.Variable`: Output variable. If the `to_matrix` is False, the
            output shape is `(N, C, KH, KW, OH, OW)`, otherwise
            `(N*OH*OW, C*KH*KW)`.
    Notation:
    - `N` is the batch size.
    - `C` is the number of the input channels.
    - `H` and `W` are the height and width of the input image, respectively.
    - `KH` and `KW` are the height and width of the filters, respectively.
    - `SH` and `SW` are the strides of the filter.
    - `PH` and `PW` are the spatial padding sizes.
    - `OH` and `OW` are the the height and width of the output, respectively.
    """
    y = Im2col(kernel_size, stride, pad, to_matrix)(x)
    return y

In [75]:
def get_conv_outsize(input_size, kernel_size, stride, pad):
    return (input_size + pad * 2 - kernel_size) // stride + 1


def pair(x):
    if isinstance(x, int):
        return (x, x)
    elif isinstance(x, tuple):
        assert len(x) == 2
        return x
    else:
        raise ValueError

In [76]:
class Conv2d(Function):
    def __init__(self, stride=1, pad=0):
        super().__init__()
        self.stride = pair(stride)
        self.pad = pair(pad)

    def forward(self, x, W, b):
        xp = cuda.get_array_module(x)

        KH, KW = W.shape[2:]
        col = im2col_array(x, (KH, KW), self.stride, self.pad, to_matrix=False)

        y = xp.tensordot(col, W, ((1, 2, 3), (1, 2, 3)))
        if b is not None:
            y += b
        y = xp.rollaxis(y, 3, 1)
        # y = np.transpose(y, (0, 3, 1, 2))
        return y

    def backward(self, gy):
        x, W, b = self.inputs
        # ==== gx ====
        gx = deconv2d(gy, W, b=None, stride=self.stride, pad=self.pad,
                      outsize=(x.shape[2], x.shape[3]))
        # ==== gW ====
        gW = Conv2DGradW(self)(x, gy)
        # ==== gb ====
        gb = None
        if b.data is not None:
            gb = gy.sum(axis=(0, 2, 3))
        return gx, gW, gb


def conv2d(x, W, b=None, stride=1, pad=0):
    return Conv2d(stride, pad)(x, W, b)

In [77]:
def pooling_simple(x, kernel_size, stride=1, pad=0):
    x = as_variable(x)

    N, C, H, W = x.shape
    KH, KW = pair(kernel_size)
    PH, PW = pair(pad)
    SH, SW = pair(stride)
    OH = get_conv_outsize(H, KH, SH, PH)
    OW = get_conv_outsize(W, KW, SW, PW)

    col = im2col(x, kernel_size, stride, pad, to_matrix=True)
    col = col.reshape(-1, KH * KW)
    y = col.max(axis=1)
    y = y.reshape(N, OH, OW, C).transpose(0, 3, 1, 2)
    return y

In [78]:
class RNN(Layer):
    def __init__(self, hidden_size, in_size=None):
        """An Elman RNN with tanh.
        Args:
            hidden_size (int): The number of features in the hidden state.
            in_size (int): The number of features in the input. If unspecified
            or `None`, parameter initialization will be deferred until the
            first `__call__(x)` at which time the size will be determined.
        """
        super().__init__()
        self.x2h = Linear(hidden_size, in_size=in_size)
        self.h2h = Linear(hidden_size, in_size=in_size, nobias=True)
        self.h = None

    def reset_state(self):
        self.h = None

    def forward(self, x):
        if self.h is None:
            h_new = F.tanh(self.x2h(x))
        else:
            h_new = F.tanh(self.x2h(x) + self.h2h(self.h))
        self.h = h_new
        return h_new

In [80]:
class LSTM(Layer):
    def __init__(self, hidden_size, in_size=None):
        super().__init__()

        H, I = hidden_size, in_size
        self.x2f = Linear(H, in_size=I)
        self.x2i = Linear(H, in_size=I)
        self.x2o = Linear(H, in_size=I)
        self.x2u = Linear(H, in_size=I)
        self.h2f = Linear(H, in_size=H, nobias=True)
        self.h2i = Linear(H, in_size=H, nobias=True)
        self.h2o = Linear(H, in_size=H, nobias=True)
        self.h2u = Linear(H, in_size=H, nobias=True)
        self.reset_state()

    def reset_state(self):
        self.h = None
        self.c = None

    def forward(self, x):
        if self.h is None:
            f = F.sigmoid(self.x2f(x))
            i = F.sigmoid(self.x2i(x))
            o = F.sigmoid(self.x2o(x))
            u = F.tanh(self.x2u(x))
        else:
            f = F.sigmoid(self.x2f(x) + self.h2f(self.h))
            i = F.sigmoid(self.x2i(x) + self.h2i(self.h))
            o = F.sigmoid(self.x2o(x) + self.h2o(self.h))
            u = F.tanh(self.x2u(x) + self.h2u(self.h))

        if self.c is None:
            c_new = (i * u)
        else:
            c_new = (f * self.c) + (i * u)

        h_new = o * F.tanh(c_new)

        self.h, self.c = h_new, c_new
        return h_new

In [50]:
x = Variable(np.array(2.0))
y = add(x, x)
print(y.data)
y.backward(retain_grad=True)
print(x.grad)
print(y.grad)

4.0
variable(2.0)
variable(1.0)


In [51]:
x = Variable(np.array(2.0))
a = square(x)
y = add(square(a),square(a))
y.backward()

print(y.data)
print(x.grad)

32.0
variable(64.0)


In [52]:
def sphere(x,y):
    z = x**2 + y**2
    return z

x = Variable(np.array(1.0))
y = Variable(np.array(1.0))
z = sphere(x, y)
z.backward()
print(x.grad, y.grad)

variable(2.0) variable(2.0)


In [53]:
def matyas(x, y):
    z = 0.26 * (x **2 + y **2) - 0.48 * x * y
    return z

x = Variable(np.array(1.0))
y = Variable(np.array(1.0))
z = matyas(x, y)
z.backward()
print(x.grad, y.grad)

variable(0.040000000000000036) variable(0.040000000000000036)


In [54]:
def goldstein(x, y):
    z = (1 + (x + y + 1)**2 * (19 - 14*x + 3*x**2 - 14*y + 6*x*y + 3*y**2)) * \
        (30 + (2*x - 3*y)**2 * (18 - 32*x + 12*x**2 + 48*y - 36*x*y + 27*y**2))
    return z


x = Variable(np.array(1.0))
y = Variable(np.array(1.0))
z = goldstein(x, y) 
z.backward()
print(x.grad, y.grad)

variable(-5376.0) variable(8064.0)


In [55]:
x = Variable(np.array(np.pi/3))
y = sin(x)
y.backward()

print(y.data)
print(x.grad)

0.8660254037844386
variable(0.5000000000000001)


In [56]:
x = Variable(np.array(np.pi/4))
y = my_sin(x)
y.backward()

print(y.data)
print(x.grad)

0.7071064695751781
variable(0.7071032148228457)


In [57]:
def f(x):
    y = x **4 - 2*x**2
    return y

x = Variable(np.array(2.0))
y = f(x)
y.backward(create_graph=True)
print(x.grad)

gx = x.grad
x.cleargrad()
gx.backward()
print(x.grad)

variable(24.0)
variable(44.0)


In [58]:
iters = 10

for i in range(iters):
    print(i,x)
    
    y = f(x)
    x.cleargrad()
    y.backward(create_graph=True)
    
    gx = x.grad
    x.cleargrad()
    gx.backward()
    gx2 = x.grad
    
    x.data -=gx.data/gx2.data

0 variable(2.0)
1 variable(1.4545454545454546)
2 variable(1.1510467893775467)
3 variable(1.0253259289766978)
4 variable(1.0009084519430513)
5 variable(1.0000012353089454)
6 variable(1.000000000002289)
7 variable(1.0)
8 variable(1.0)
9 variable(1.0)


In [59]:
x = Variable(np.array(1.0))
y = sin(x)
y.backward(create_graph=True)

for i in range(3):
    gx = x.grad
    x.cleargrad()
    gx.backward(create_graph=True)
    print(x.grad)

variable(-0.8414709848078965)
variable(-1.3817732906760363)
variable(-2.4623779024123156)


In [60]:
x = Variable(np.array([[1,2,3],[4,5,6]]))
y = reshape(x,(6,))
y.backward(retain_grad=True)
print(x.grad)

variable([[1 1 1]
          [1 1 1]])


In [61]:
x = Variable(np.random.randn(1,2,3))
y = x.reshape((2,3))
y

variable([[ 1.66323476  1.0134988   1.13236135]
          [-1.34233188  0.54592552  0.3996601 ]])

In [62]:
y = x.reshape(2,3)

In [63]:
y

variable([[ 1.66323476  1.0134988   1.13236135]
          [-1.34233188  0.54592552  0.3996601 ]])

In [64]:
x = Variable(np.array([[1,2,3],[4,5,6]]))
y = sum(x,axis=0)
print(y.backward())
x.grad

None


variable([[1 1 1]
          [1 1 1]])

In [65]:
x0 = Variable(np.array([1,2,3]))
x1 = Variable(np.array([10]))
y = x0 + x1
print(y)

y.backward()
print(x1.grad)

variable([11 12 13])
variable([3])


In [66]:
x = Variable(np.random.randn(2,3))
W = Variable(np.random.randn(3,4))
y = matmul(x,W)
y.backward()

print(x.grad.shape)
print(W.grad.shape)

(2, 3)
(3, 4)


In [67]:
np.random.seed(0)
x = np.random.rand(100,1)
y = 5 + 2*x + np.random.rand(100,1)

x, y = Variable(x), Variable(y)

W = Variable(np.zeros((1,1)))
b = Variable(np.zeros(1))

def predict(x):
    y = matmul(x, W) + b
    return y

lr = 0.1
iters = 100

for i in range(iters):
    y_pred = predict(x)
    loss = mean_squared_error(y, y_pred)
    
    W.cleargrad()
    b.cleargrad()
    loss.backward()
    
    W.data -= lr * W.grad.data
    b.data -= lr * b.grad.data
    
    print(W,b,loss)

variable([[0.64433458]]) variable([1.29473389]) variable(42.296340129442335)
variable([[1.12672345]]) variable([2.26959351]) variable(23.97380754378544)
variable([[1.48734571]]) variable([3.00386712]) variable(13.609686745040522)
variable([[1.75641886]]) variable([3.557186]) variable(7.747049961219976)
variable([[1.95666851]]) variable([3.97439789]) variable(4.43057410592155)
variable([[2.10518573]]) variable([4.28923203]) variable(2.554280381353593)
variable([[2.21482401]]) variable([4.52705574]) variable(1.492599869047195)
variable([[2.29524981]]) variable([4.70694745]) variable(0.8916952181756939)
variable([[2.35373273]]) variable([4.84325585]) variable(0.5514270962227455)
variable([[2.39573972]]) variable([4.9467725]) variable(0.3585915308319281)
variable([[2.425382]]) variable([5.02561369]) variable(0.24915731977561134)
variable([[2.44575118]]) variable([5.08588371]) variable(0.1869065876539789)
variable([[2.45917205]]) variable([5.13217364]) variable(0.1513533629631488)
variable(

In [68]:
x = np.random.rand(100,1)
y = np.sin(2*np.pi*x) + np.random.rand(100,1)

x, y = Variable(x), Variable(y)

I, H, K = 1, 10, 1
W1 = Variable(0.01 * np.random.randn(I,H))
b1 = Variable(np.zeros(H))
W2 = Variable(0.01 * np.random.randn(H,K))
b2 = Variable(np.zeros(K))

def predict(x):
    y = linear(x, W1, b1)
    y = sigmoid(y)
    y = linear(y, W2, b2)
    return y

lr = 0.2
iters = 10000

for i in range(iters):
    y_pred = predict(x)
    loss = mean_squared_error(y,y_pred)
    
    W1.cleargrad()
    b1.cleargrad()
    W2.cleargrad()
    b2.cleargrad()
    loss.backward()
    
    W1.data -= lr*W1.grad.data
    b1.data -= lr*b1.grad.data
    W2.data -= lr*W2.grad.data
    b2.data -= lr*b2.grad.data
    if i%1000==0:
        print(W1,loss)

variable([[ 0.00693193 -0.00160583 -0.00140474  0.01070093 -0.01124242 -0.00739062
           -0.00395771  0.00091413 -0.00043655 -0.0026565 ]]) variable(0.8709474113416129)
variable([[-0.64077155 -1.03587071 -0.72578173 -0.0678613  -1.63545598 -0.93475781
           -0.63270238 -0.81444226 -0.95607165 -2.17288875]]) variable(0.26774291816856405)
variable([[-0.57115267 -0.92980266 -0.64104352 -0.18421192 -1.66433439 -0.82999593
           -0.56479673 -0.71823363 -0.85056993 -2.98371048]]) variable(0.2602933661712346)
variable([[-0.49705276 -0.7632952  -0.54303209 -0.3257606  -1.48460807 -0.68277288
           -0.49329052 -0.59763061 -0.69895922 -4.54845234]]) variable(0.23650229278109183)
variable([[-0.53810505 -0.89373832 -0.59811249 -0.49499416 -1.68524225 -0.78882893
           -0.53421691 -0.67250412 -0.81010736 -6.73330914]]) variable(0.18630482333010762)
variable([[-0.78959745 -1.48006198 -0.92891674 -0.78066503 -2.44739261 -1.30579457
           -0.780697   -1.08688537 -1.342582

In [69]:
np.random.seed(0)
x = np.random.rand(100,1)
y = np.sin(2*np.pi*x) + np.random.rand(100,1)

x, y = Variable(x), Variable(y)

lr = 0.2
max_iter = 10000
hidden_size = 10

model = MLP((hidden_size,1))
optimizer = SGD(lr)
optimizer.setup(model)

for i in range(max_iter):
    y_pred = model(x)
    loss = mean_squared_error(y, y_pred)
    
    
    
    model.cleargrads()
    loss.backward()
    
    optimizer.update()
    if i%1000==0:
        print(y_pred)
        print(loss)

AttributeError: 'Linear' object has no attribute 'cleargrad'