In [4]:
import torch
import torch.nn as nn
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torchsummary import summary
from torchvision import datasets
from torchvision import transforms
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

In [2]:
import numpy as np
from typing import List, NamedTuple, Callable, Optional, Union

In [97]:
##########Tensor################

class Dependency(NamedTuple):
    tensor: 'Tensor'
    grad_fn: Callable[[np.ndarray], np.ndarray]


Arrayable = Union[float, list, np.ndarray]

def ensure_array(arrayable: Arrayable) -> np.ndarray:
    if isinstance(arrayable, np.ndarray):
        return arrayable
    else:
        return np.array(arrayable)

Tensorable = Union[float, 'Tensor', np.ndarray]

def ensure_tensor(tensorable: Tensorable) -> 'Tensor':
    if isinstance(tensorable, Tensor):
        return tensorable
    else:
        return Tensor(tensorable)

class Tensor:

    def __init__(
            self,
            data : np.ndarray,
            requires_grad: bool = False,
            depends_on : List[Dependency] = None) -> None:

        """
        Args:
            data: value of tensor (numpy.ndarray)
            requires_grad: if tensor needs grad (bool)
            depends_on: list of dependencies
        """
        self._data = ensure_array(data)
        self.requires_grad = requires_grad
        self.depends_on = depends_on

        if not depends_on:
            self.depends_on = []

        self.shape = self._data.shape
        self.grad : Optional['Tensor'] = None

        if self.requires_grad:
            self.zero_grad()

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, new_data: np.ndarray) -> None:
        self._data = new_data
        self.grad = None

    def zero_grad(self) -> None:
        self.grad = Tensor(np.zeros_like(self.data, dtype=np.float64))

    def __repr__(self) -> str:
        return f"Tensor({self.data}, requires_grad={self.requires_grad})"

    def sum(self) -> 'Tensor':
        # TODO: implement sum over tensor elems
        # Hint use _tensor_sum function
        return _tensor_sum(self)

    def log(self) -> 'Tensor':
        # TODO: implement log
        # Hint use _tensor_log function
        return _tensor_log(self)

    def exp(self) -> 'Tensor':
        # TODO: implement exp
        # Hint use _tensor_exp function
        return _tensor_exp(self)

    def __add__(self, other) -> 'Tensor':
        # Done ( Don't change )
        # Hint use _add function
        # self + other
        return _add(self, ensure_tensor(other))

    def __radd__(self, other) -> 'Tensor':
        # TODO: implement radd
        # Hint use _add function
        # other + self
        return _add(ensure_tensor(other), self)

    def __iadd__(self, other) -> 'Tensor':
        # TODO: implement inc add
        # Hint use _add function
        # self += other
        new_tensor = self + other
        self = Tensor(new_tensor.data, requires_grad=new_tensor.requires_grad, depends_on=new_tensor.depends_on)
        return self

    def __sub__(self, other) -> 'Tensor':
        # TODO: implement sub
        # Hint use _sub function
        # self - other
        return _sub(self, ensure_tensor(other))

    def __rsub__(self, other) -> 'Tensor':
        # TODO: implement rsub
        # Hint use _sub function
        # other - self
        return _sub(ensure_tensor(other), self)

    def __isub__(self, other) -> 'Tensor':
        # TODO: implement inc sub
        # Hint use _sub function
        # self -= other
        new_tensor = self - other
        self = Tensor(new_tensor.data, requires_grad=new_tensor.requires_grad, depends_on=new_tensor.depends_on)
        return self

    def __mul__(self, other) -> 'Tensor':
        # TODO: implement elemnet-wise mul
        # Hint use _mul function
        # self * other
        return _mul(self, ensure_tensor(other))

    def __rmul__(self, other) -> 'Tensor':
        # TODO: implement elemnet-wise rmul
        # Hint use _mul function
        # other * self
        return _mul(ensure_tensor(other), self)

    def __imul__(self, other) -> 'Tensor':
        # TODO: implement elemnet-wise inc mul
        # Hint use _mul function
        # self *= other
        new_tensor = self * other
        self = Tensor(new_tensor.data, requires_grad=new_tensor.requires_grad, depends_on=new_tensor.depends_on)
        return self

    def __matmul__(self, other) -> 'Tensor':
        # TODO: implement matrix mul
        # Hint use _matmul function
        # self @ other
        return _matmul(self, ensure_tensor(other))

    def __pow__(self, power: float):
        # TODO: implement power
        # Hint use _tensor_pow function
        # self ** power
        return _tensor_pow(self, power)

    def __getitem__(self, idcs):
        # TODO: implement getitem [:]
        # Hint use _tensor_slice function
        return _tensor_slice(self, idcs)

    def __neg__(self, idcs):
        # TODO: implement neg (-)
        # Hint use -_tensor_neg function
        neg = _tensor_neg(self)
        return _tensor_slice(neg, idcs)



    def backward(self, grad: 'Tensor' = None) -> None:
        if grad is None:
            if self.shape == ():
                grad = Tensor(1.0)
            else:
                raise RuntimeError("grad must be specified for non-0-tensor")
        self.grad.data = self.grad.data + grad.data
        for dependency in self.depends_on:
            backward_grad = dependency.grad_fn(grad.data)
            dependency.tensor.backward(Tensor(backward_grad))


def _tensor_sum(t: Tensor) -> Tensor:
    data = t.data.sum()
    req_grad = t.requires_grad

    if req_grad:
        def grad_fn(grad: np.ndarray):
            return grad * np.ones_like(t.data)

        depends_on = [Dependency(t, grad_fn)]

    else:
        depends_on = []

    return Tensor(data=data, requires_grad=req_grad, depends_on=depends_on)

def _tensor_log(t: Tensor) -> Tensor:
    # TODO
    data = np.log(t.data)
    req_grad = t.requires_grad

    if req_grad:
        def grad_fn(grad: np.ndarray):
            return grad * (1 / t.data) * np.ones_like(t.data)

        depends_on = [Dependency(t, grad_fn)]

    else:
        depends_on = []

    return Tensor(data=data, requires_grad=req_grad, depends_on=depends_on)

def _tensor_exp(t: Tensor) -> Tensor:
    # TODO
    # print("oooooooooo",t.data,"ppppppppppppppp")
    data = np.exp(t.data)
    req_grad = t.requires_grad

    if req_grad:
        def grad_fn(grad: np.ndarray):
            return grad * data * np.ones_like(t.data)

        depends_on = [Dependency(t, grad_fn)]

    else:
        depends_on = []

    return Tensor(data=data, requires_grad=req_grad, depends_on=depends_on)

def _tensor_pow(t: Tensor, power:float) -> Tensor:
    # TODO
    data = np.power(t.data, power)
    req_grad = t.requires_grad

    if req_grad:
        def grad_fn(grad: np.ndarray):
            return grad * power * np.power(t.data, power - 1)

        depends_on = [Dependency(t, grad_fn)]

    else:
        depends_on = []

    return Tensor(data=data, requires_grad=req_grad, depends_on=depends_on)

def _tensor_slice(t: Tensor, idcs) -> Tensor:
    # TODO
    data = t.data[idcs]
    requires_grad = t.requires_grad

    if requires_grad:
        def grad_fn(grad: np.ndarray) -> np.ndarray:
            bigger_grad = np.zeros_like(data)
            bigger_grad[idcs] = grad
            return bigger_grad

        depends_on = Dependency(t, grad_fn)
    else:
        depends_on = []

    return Tensor(data, requires_grad, depends_on)

def _tensor_neg(t: Tensor) -> Tensor:
    # TODO
    data = np.negative(t.data)
    requires_grad = t.requires_grad
    if requires_grad:
        depends_on = [Dependency(t, lambda x: -x)]
    else:
        depends_on = []

    return Tensor(data, requires_grad, depends_on)

def _add(t1: Tensor, t2: Tensor) -> Tensor:

    data = t1.data + t2.data
    req_grad = t1.requires_grad or t2.requires_grad
    depends_on : List[Dependency] = []

    if t1.requires_grad:
        def grad_fn1(grad: np.ndarray) -> np.ndarray:
            ndims_added = grad.ndim - t1.data.ndim
            for _ in range(ndims_added):
                grad = grad.sum(axis=0)
            for i, dim in enumerate(t1.shape):
                if dim == 1:
                    grad = grad.sum(axis=i, keepdims=True)
            return grad
        depends_on.append(Dependency(t1, grad_fn1))

    if t2.requires_grad:
        def grad_fn2(grad: np.ndarray) -> np.ndarray:
            ndims_added = grad.ndim - t2.data.ndim
            for _ in range(ndims_added):
                grad = grad.sum(axis=0)
            for i, dim in enumerate(t2.shape):
                if dim == 1:
                    grad = grad.sum(axis=i, keepdims=True)
            return grad
        depends_on.append(Dependency(t2, grad_fn2))

    return Tensor(
        data=data,
        requires_grad=req_grad,
        depends_on=depends_on
    )

def _sub(t1: Tensor, t2: Tensor) -> Tensor:
    # TODO: implement sub
    # Hint: a-b = a+(-b)
    return _add(t1, _tensor_neg(t2))

def _mul(t1: Tensor, t2: Tensor) -> Tensor:
    # Done ( Don't change )
    data = t1.data * t2.data
    req_grad = t1.requires_grad or t2.requires_grad
    depends_on : List[Dependency] = []

    if t1.requires_grad:
        def grad_fn1(grad: np.ndarray) -> np.ndarray:
            grad = grad * t2.data
            ndims_added = grad.ndim - t1.data.ndim
            for _ in range(ndims_added):
                grad = grad.sum(axis=0)
            for i, dim in enumerate(t1.shape):
                if dim == 1:
                    grad = grad.sum(axis=i, keepdims=True)
            return grad
        depends_on.append(Dependency(t1, grad_fn1))
    if t2.requires_grad:
        def grad_fn2(grad: np.ndarray) -> np.ndarray:
            grad = grad * t1.data
            ndims_added = grad.ndim - t2.data.ndim
            for _ in range(ndims_added):
                grad = grad.sum(axis=0)
            for i, dim in enumerate(t2.shape):
                if dim == 1:
                    grad = grad.sum(axis=i, keepdims=True)
            return grad
        depends_on.append(Dependency(t2, grad_fn2))

    return Tensor(
        data=data,
        requires_grad=req_grad,
        depends_on=depends_on
    )

def _matmul(t1: Tensor, t2: Tensor) -> Tensor:
    # TODO: implement matrix multiplication
    data = t1.data @ t2.data
    requires_grad = t1.requires_grad or t2.requires_grad

    depends_on: List[Dependency] = []

    if t1.requires_grad:
        def grad_fn1(grad: np.ndarray) -> np.ndarray:
            return grad @ t2.data.T
        depends_on.append(Dependency(t1, grad_fn1))

    if t2.requires_grad:
        def grad_fn2(grad: np.ndarray) -> np.ndarray:
            return t1.data.T @ grad
        depends_on.append(Dependency(t2, grad_fn2))

    return Tensor(data,
                  requires_grad,
                  depends_on)


In [98]:
#####activation######

def Sigmoid(t: Tensor) -> Tensor:
    # TODO: implement sigmoid function
    # hint: you can do it using function you've implemented (not directly define grad func)
    return Tensor(data=1 / (1 + (-t).exp()), requires_grad=t.requires_grad, depends_on=t.depends_on)

def Tanh(t: Tensor) -> Tensor:
    # TODO: implement tanh function
    # hint: you can do it using function you've implemented (not directly define grad func)
    return Tensor(data=(t.exp() - (-t).exp())/(t.exp() + (-t).exp()), requires_grad=t.requires_grad, depends_on=t.depends_on)

def Softmax(t: Tensor) -> Tensor:
    # TODO: implement softmax function
    # hint: you can do it using function you've implemented (not directly define grad func)
    # hint: you can't use sum because it has not axis argument so there are 2 ways:
    #        1. implement sum by axis
    #        2. using matrix mul to do it :) (recommended)
    # hint: a/b = a*(b^-1)
    sum = t.exp() @ np.ones((t.data.shape[1], 1))
    # print("vvvvvvvvvvvvvvv",sum.data, "qqqqqqqqqqqqqqqqq")
    return t.exp() * Tensor(1/sum.data)

def Relu(t: Tensor) -> Tensor:
    # TODO: implement relu function

    # use np.maximum
    data = np.maximum(np.zeros_like(t.data), t.data)

    req_grad = t.requires_grad
    if req_grad:
        def grad_fn(grad: np.ndarray):
            # use np.where
            return np.where(t.data > 0, grad,np.zeros_like(grad))

        depends_on = [Dependency(t, grad_fn)]
    else:
        depends_on = []
    return Tensor(data=data, requires_grad=req_grad, depends_on=depends_on)


def LeakyRelu(t: Tensor,leak=0.05) -> Tensor:
    """
    fill 'data' and 'req_grad' and implement LeakyRelu grad_fn
    hint: use np.where like Relu method but for LeakyRelu
    """
    # TODO: implement leaky_relu function

    data = np.where(t.data > 0, t.data, leak * t.data)

    req_grad = t.requires_grad
    if req_grad:
        def grad_fn(grad: np.ndarray):
            return np.where(t.data > 0, grad, leak * grad)

        depends_on = [Dependency(t, grad_fn)]
    else:
        depends_on = []

    return Tensor(data=data, requires_grad=req_grad, depends_on=depends_on)


In [99]:
##########initializers############
# TODO: implement xavier_initializer, zero_initializer

def xavier_initializer(shape):
    return np.random.randn(*shape) * np.sqrt(1/shape[0], dtype=np.float64)

def he_initializer(shape):
    return np.random.randn(*shape) * np.sqrt(2/shape[0], dtype=np.float64)


def zero_initializer(shape):
    return np.zeros(shape, dtype=np.float64)

def one_initializer(shape):
    return np.ones(shape, dtype=np.float64)

def initializer(shape, mode="xavier"):
    if mode == "xavier":
        return xavier_initializer(shape)
    elif mode == "he":
        return he_initializer(shape)
    elif mode == "zero":
        return zero_initializer(shape)
    elif mode == "one":
        return one_initializer(shape)
    else:
        raise NotImplementedError("Not implemented initializer method")


In [100]:
# @title Default title text
##########fc_layer############
class Linear:

    def __init__(self, in_channels, out_channels, need_bias=True, mode='zero') -> None:
        # set input and output shape of layer
        self.shape = (in_channels, out_channels)
        self.need_bias = need_bias
        # TODO initialize weight by initializer function (mode)
        self.weight = Tensor(
            data=initializer((in_channels, out_channels), mode=mode),
            requires_grad=True
        )
        # TODO initialize weight by initializer function (zero mode)
        if self.need_bias:
            self.bias = Tensor(
                data=0,
                requires_grad=True
            )

    def forward(self, inp):
      # TODO:implement forward propagation
      if self.need_bias:
          return inp.__matmul__(self.weight) + self.bias
      return inp.__matmul__(self.weight)

    def parameters(self):
        if self.need_bias:
            return [self. weight, self.bias]
        return [self. weight]

    def zero_grad(self):
        self.weight.zero_grad()
        if self.need_bias:
            self.bias.zero_grad()

    def __call__(self, inp):
        return self.forward(inp)


In [101]:
##########loss############

def MeanSquaredError(preds: Tensor, actual: Tensor):
    # TODO : implement mean squared error
    err = ((actual - preds) * (actual - preds)).sum()
    res = err * (Tensor(1 / len(actual.data), requires_grad=True, depends_on=actual.depends_on))
    return res

def CategoricalCrossEntropy(preds: Tensor, actual: Tensor):
    # TODO : imlement categorical cross entropy
    sum = 0
    for i in range(len(preds.data)):
        sum -= actual.data[i] * np.log(preds.data[i])
    return None



In [102]:
##########optim############
class Optimizer:
    def __init__(self, layers):
        self.layers = layers

    def zero_grad(self):
        for l in self.layers:
            l.zero_grad()

In [103]:
##########sgd############
# TODO: implement step function
class SGD(Optimizer):
    def __init__(self, layers, learning_rate=0.1):
        super().__init__(layers)
        self.learning_rate = learning_rate

    def step(self):
        # TODO: update weight and biases ( Don't use '-=' and use l.weight = l.weight - ... )
        for l in self.layers:
            temp = l.parameters()
            params = temp[0]
            bias = temp[1]
            l.weight = l.weight - self.learning_rate * params.grad
            if l.need_bias:
                l.bias = l.bias - self.learning_rate * bias.grad


In [104]:
# TODO: in this task you have to
# 1. load mnist dataset for our framework
transform=transforms.Compose([
        ToTensor(),
        transforms.Normalize(0,1)
        ])
train_set = datasets.MNIST('./Datasets', download=True, train=True, transform=transform)

test_set = datasets.MNIST('./Datasets', download=True, train=False, transform=transform)

# train_set = datasets.MNIST('./Datasets', download=True, train=True, transform=transform)
train = Tensor([i[0][0].data.numpy() for i in train_set])
train_label = Tensor([i[1] for i in train_set])
test = Tensor([i[0][0].data.numpy() for i in test_set])
test_label = Tensor([i[1] for i in test_set])
# x = Tensor(train_set[1][0][0].data, requires_grad=True)



In [105]:
# 2. define your model
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # self.fc1 = Linear(28 * 28, 256)
        # self.fc2 = Linear(256, 10)
        self.fc3 = Linear(28 * 28, 10)
        self.relu = Relu
        self.softmax = Softmax
        # TODO: define layers of your model
    def forward(self, x):
        x = Tensor(x.data.reshape(-1,28*28))
        # x = self.fc1(x)
        # x = self.relu(x)
        # x = self.fc2(x)
        x = self.fc3(x)

        # TODO: define forward for your model
        return self.softmax(x)


model = Model()

learning_rate = 0.007
optimizer = SGD([model.fc3,], learning_rate)

In [106]:
# 3. start training and have fun!
print(model.fc3.weight)
print(model.fc3.bias)


batch_size = 5

for epoch in range(1):

    epoch_loss = 0.0

    for start in range(0, 4500, batch_size):
        optimizer.zero_grad()
        end = start + batch_size
        print(start, end)

        inputs = train[start:end]


        # TODO: predicted
        predicted = model.forward(inputs)

        actual = train_label[start:end]
        actual.data = actual.data.reshape(batch_size, 1)
        # TODO: calcualte MSE loss
        loss =MeanSquaredError(predicted, actual)


        # TODO: backward
        # hint you need to just do loss.backward()
        loss.backward()
        # print("new params", params)
        # TODO: add loss to epoch_loss
        epoch_loss += loss


        # TODO: update w and b using optimizer.step()
        optimizer.step()



# TODO: print weight and bias of linear layer

print(model.fc3.weight)
print(model.fc3.bias)


Tensor([[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], requires_grad=True)
Tensor(0, requires_grad=True)
0 5
5 10
10 15
15 20
20 25
25 30
30 35
35 40
40 45
45 50
50 55
55 60
60 65
65 70
70 75
75 80
80 85
85 90
90 95
95 100
100 105
105 110
110 115
115 120
120 125
125 130
130 135
135 140
140 145
145 150
150 155
155 160
160 165
165 170
170 175
175 180
180 185
185 190
190 195
195 200
200 205
205 210
210 215
215 220
220 225
225 230
230 235
235 240
240 245
245 250
250 255
255 260
260 265
265 270
270 275
275 280
280 285
285 290
290 295
295 300
300 305
305 310
310 315
315 320
320 325
325 330
330 335
335 340
340 345
345 350
350 355
355 360
360 365
365 370
370 375
375 380
380 385
385 390
390 395
395 400
400 405
405 410
410 415
415 420
420 425
425 430
430 435
435 440
440 445
445 450
450 455
455 460
460 465
465 470
470 475
475 480
480 485
485 490
490 495
495 500
500 505
505 510
510 515
515 

Exception ignored in: <function _xla_gc_callback at 0x7fcf52c0d480>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 97, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


1520 1525
1525 1530
1530 1535
1535 1540
1540 1545
1545 1550
1550 1555
1555 1560
1560 1565
1565 1570
1570 1575
1575 1580
1580 1585
1585 1590
1590 1595
1595 1600
1600 1605
1605 1610
1610 1615
1615 1620
1620 1625
1625 1630
1630 1635
1635 1640
1640 1645
1645 1650
1650 1655
1655 1660
1660 1665
1665 1670
1670 1675
1675 1680
1680 1685
1685 1690
1690 1695
1695 1700
1700 1705
1705 1710
1710 1715
1715 1720
1720 1725
1725 1730
1730 1735
1735 1740
1740 1745
1745 1750
1750 1755
1755 1760
1760 1765
1765 1770
1770 1775
1775 1780
1780 1785
1785 1790
1790 1795
1795 1800
1800 1805
1805 1810
1810 1815
1815 1820
1820 1825
1825 1830
1830 1835
1835 1840
1840 1845
1845 1850
1850 1855
1855 1860
1860 1865
1865 1870
1870 1875
1875 1880
1880 1885
1885 1890
1890 1895
1895 1900
1900 1905
1905 1910
1910 1915
1915 1920
1920 1925
1925 1930
1930 1935
1935 1940
1940 1945
1945 1950
1950 1955
1955 1960
1960 1965
1965 1970
1970 1975
1975 1980
1980 1985
1985 1990
1990 1995
1995 2000
2000 2005
2005 2010
2010 2015
2015 2020


  data = np.exp(t.data)


2620 2625
2625 2630
2630 2635
2635 2640
2640 2645
2645 2650
2650 2655
2655 2660
2660 2665
2665 2670
2670 2675
2675 2680
2680 2685
2685 2690
2690 2695
2695 2700
2700 2705
2705 2710
2710 2715
2715 2720
2720 2725
2725 2730
2730 2735
2735 2740
2740 2745
2745 2750
2750 2755
2755 2760
2760 2765
2765 2770
2770 2775
2775 2780
2780 2785
2785 2790
2790 2795
2795 2800
2800 2805
2805 2810
2810 2815
2815 2820
2820 2825
2825 2830
2830 2835
2835 2840
2840 2845
2845 2850
2850 2855
2855 2860
2860 2865
2865 2870
2870 2875
2875 2880
2880 2885
2885 2890
2890 2895
2895 2900
2900 2905
2905 2910
2910 2915
2915 2920
2920 2925
2925 2930
2930 2935
2935 2940
2940 2945
2945 2950
2950 2955
2955 2960
2960 2965
2965 2970
2970 2975
2975 2980
2980 2985
2985 2990
2990 2995
2995 3000
3000 3005
3005 3010
3010 3015
3015 3020
3020 3025
3025 3030
3030 3035
3035 3040
3040 3045
3045 3050
3050 3055
3055 3060
3060 3065
3065 3070
3070 3075
3075 3080
3080 3085
3085 3090
3090 3095
3095 3100
3100 3105
3105 3110
3110 3115
3115 3120


In [None]:
print(model.fc1.weight)
print(model.fc1.bias)

print(model.fc2.weight)
print(model.fc2.bias)

AttributeError: ignored

In [107]:
pred = model.forward(test[0])
print(pred)
print(test_label[0])

Tensor([[nan nan nan nan nan nan nan nan nan nan]], requires_grad=True)
Tensor(7, requires_grad=False)
