Tutorial from [here](https://pytorch.org/tutorials/beginner/nn_tutorial.html#refactor-using-dataloader)

In [19]:
import numpy as np
import pandas as pd
import torch
import math
import torch.nn as nn
from torch.utils.data import DataLoader
# import matplotlib.pyplot as plt
# from IPython.core.debugger import set_trace

In [4]:
train = pd.read_csv('C:/Users/Spurius/Desktop/digit-recognizer/train.csv')
test = pd.read_csv('C:/Users/Spurius/Desktop/digit-recognizer/test.csv')
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
x_train = torch.tensor(train.values[:, 1:]).float() / 255
y_train = torch.tensor(train.values[:, :1]).long()
x_test = torch.tensor(train.values[:, 1:]).float() / 255
y_test = torch.tensor(train.values[:, :1]).long()

1. 'requires_grad' makes pytorch record all the operations done on the object, so it can compute gradients automai=tically, when they are needed.
2. Underscore after the name in Pytorch means that operation is performed in-place.
3. np.sqrt(784) means Xavier initialization of weights

In [6]:
weights = torch.randn(784, 10) / math.sqrt(784)
weights.requires_grad_()
bias = torch.zeros(10, requires_grad=True)

In [7]:
def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
    return log_softmax(xb @ weights + bias)

In [8]:
batch_size = 64

xb = x_train[0:batch_size]

preds = model(xb)
# preds[0], preds.shape
print(preds[0], preds.shape)

tensor([-2.3651, -2.5389, -2.1003, -2.6187, -2.6212, -2.4594, -2.0193, -2.3627,
        -2.2161, -1.9831], grad_fn=<SelectBackward>) torch.Size([64, 10])


In [9]:
# implementation of negative-log-likelihood
def nll(inp, target):
    return -inp[range(target.shape[0]), target].mean()

loss_func = nll

In [10]:
yb = y_train[:batch_size]
print(loss_func(preds, yb.squeeze(1)))

tensor(2.3207, grad_fn=<NegBackward>)


In [11]:
def accuracy(output, target):
    preds = torch.argmax(output, dim=1)
    return (preds == target).float().mean()

In [12]:
print(accuracy(preds, yb))

tensor(0.1084)


Now we can train our simple "model" for 2 epochs.
1. get the data in portions of batch_size
2. use the model to make predictions
3. calculate the loss
4. update the gradients with *loss.backward()*

Don't forget to use zero_grad(), because otherwise all gradients will be added to each other, rather then replaced and the whole idea of learning will be lost.

In [13]:
lr = 0.05
epochs = 2
n, d = x_train.shape
#this loop doesn't work :(
# for epoch in range(epochs):
#     for i in range((n - 1) // batch_size + 1):
# #         set_trace()
#         start = i * batch_size
#         end = i + batch_size
#         x_batch = x_train[start:end]
#         y_batch = y_train[start:end]
#         output = model(x_batch)
#         loss = loss_func(output, y_batch)
        
#         loss.backward()
#         with torch.no_grad():
#             weights -= weights.grad * lr
#             weights.grad.zero_()
#             bias -= bias.grad * lr
#             bias.grad.zero_()
        
print(loss_func(model(xb), yb), accuracy(model(xb), yb))

tensor(2.3361, grad_fn=<NegBackward>) tensor(0.1084)


In [14]:
import torch.nn.functional as F
loss_func = nn.CrossEntropyLoss()
def model(xb):
    return xb @ weights + bias

In [15]:
print(loss_func(model(xb), yb.squeeze(1)), accuracy(model(xb), yb))

tensor(2.3207, grad_fn=<NllLossBackward>) tensor(0.1084)


In [16]:
class Mnist_logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(784, 10) / math.sqrt(784))
        self.bias = nn.Parameter(torch.zeros(10))
    def forward(self, xb):
        return xb @ self.weights + self.bias

In [17]:
model = Mnist_logistic()
preds = model(xb)

In [21]:
print(loss_func(model(xb), yb.squeeze(1)))

tensor(2.4369, grad_fn=<NllLossBackward>)


In [27]:
with torch.no_grad():
    for p in model.parameters(): p -= p.grad * lr
    model.zero_grad()

TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'

In [29]:
def fit():
    for epoch in range(epochs):
        for i in range((n - 1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            pred = model(xb)
            loss = loss_func(pred, yb)

            loss.backward()
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()

In [45]:
za = np.array([1,2,3,4,5, 6,7,8,9,10])
zz = np.arange(10)
np.concatenate((za[:, None], zz[:, None]), axis=0)

array([[ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9]])

In [47]:
Z = np.random.random((500,10))
y = np.arange(500)
print(Z.shape)
print(y.shape)
Z[range(500), y]

(500, 10)
(500,)


IndexError: index 10 is out of bounds for axis 1 with size 10

In [50]:
Z[range(500)]

array([[0.63742076, 0.25665064, 0.90175448, ..., 0.75322074, 0.63923059,
        0.57162788],
       [0.20526512, 0.83974937, 0.72516953, ..., 0.64675901, 0.44115046,
        0.48636321],
       [0.25996247, 0.24875269, 0.33576911, ..., 0.26524782, 0.3995636 ,
        0.12325945],
       ...,
       [0.68858886, 0.27389153, 0.47139864, ..., 0.01794308, 0.09374944,
        0.32987518],
       [0.34035151, 0.9706666 , 0.94222176, ..., 0.08671782, 0.17814423,
        0.88292637],
       [0.45364157, 0.21209501, 0.75740932, ..., 0.82283435, 0.86092953,
        0.29787148]])

In [41]:
np.concatenate?

[1;31mDocstring:[0m
concatenate((a1, a2, ...), axis=0, out=None)

Join a sequence of arrays along an existing axis.

Parameters
----------
a1, a2, ... : sequence of array_like
    The arrays must have the same shape, except in the dimension
    corresponding to `axis` (the first, by default).
axis : int, optional
    The axis along which the arrays will be joined.  If axis is None,
    arrays are flattened before use.  Default is 0.
out : ndarray, optional
    If provided, the destination to place the result. The shape must be
    correct, matching that of what concatenate would have returned if no
    out argument were specified.

Returns
-------
res : ndarray
    The concatenated array.

See Also
--------
ma.concatenate : Concatenate function that preserves input masks.
array_split : Split an array into multiple sub-arrays of equal or
              near-equal size.
split : Split array into a list of multiple sub-arrays of equal size.
hsplit : Split array into multiple sub-arrays ho