In [80]:
from __future__ import print_function
from itertools import count

import torch
import torch.autograd
import torch.nn.functional as F
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import pandas as pd

# Question 2.a

## Modified code for Stochastic Gradient Descent

In [215]:
# Degree of the target polynomial; also the number of feature columns.
POLY_DEGREE = 4
# Ground-truth coefficients (one per power of x) and intercept, drawn from a
# standard normal and scaled by 5. No seed is set, so they differ on every run.
W_target = 5 * torch.randn(POLY_DEGREE, 1)
b_target = 5 * torch.randn(1)


def make_features(x):
    """Return the polynomial feature matrix for the samples in ``x``.

    Column ``k`` (0-based) holds ``x ** (k + 1)``, for powers 1 through
    ``POLY_DEGREE``, so the columns are [x, x^2, ..., x^POLY_DEGREE].
    """
    column = x.unsqueeze(1)
    powers = []
    for degree in range(1, POLY_DEGREE + 1):
        powers.append(column ** degree)
    return torch.cat(powers, 1)


def f(x):
    """Target function: feature matrix times ``W_target`` plus the intercept ``b_target[0]``."""
    intercept = b_target[0]
    weighted_sum = torch.mm(x, W_target)
    return weighted_sum + intercept


def poly_desc(W, b):
    """Creates a string description of a polynomial.

    Args:
        W: sequence of coefficients ordered like the columns produced by
            ``make_features``, i.e. ``W[i]`` multiplies ``x^(i + 1)``.
        b: sequence whose first element is the intercept.

    Returns:
        A string such as ``'y = +1.00 x^1 +2.00 x^2 +3.00'``.
    """
    # The exponent must follow the feature order [x, x^2, ...]; labelling the
    # terms with `len(W) - i` printed the coefficients against reversed powers.
    terms = ''.join(
        '{:+.2f} x^{} '.format(w, power) for power, w in enumerate(W, 1)
    )
    return 'y = ' + terms + '{:+.2f}'.format(b[0])


def get_batch(batch_size=32):
    """Draw ``batch_size`` standard-normal samples and return ``(features, targets)``.

    The features come from ``make_features`` and the targets from ``f``
    applied to those features; both are wrapped in ``Variable``.
    """
    samples = torch.randn(batch_size)
    features = make_features(samples)
    targets = f(features)
    return Variable(features), Variable(targets)


# Model: a single linear layer with one input per row of W_target and one output.
fc = torch.nn.Linear(W_target.size(0), 1)

batch_size = 32

# Optimizer: SGD with momentum over the layer's weight and bias.
optimizer = optim.SGD(fc.parameters(), lr=0.0005, momentum=0.9)

for batch_idx in count(1):
    # Draw a fresh random batch, then take one optimizer step per sample
    # (stochastic rather than mini-batch gradient descent).
    batch_x, batch_y = get_batch(batch_size)
    for i in range(batch_size):
        x = Variable(batch_x.data[i, :])
        label = Variable(batch_y.data[i])

        # Clear the gradients accumulated for the previous sample.
        optimizer.zero_grad()

        # Forward pass on this single sample.
        output = fc(x)
        loss = F.smooth_l1_loss(output, label)

        # Backward pass followed by the parameter update.
        loss.backward()
        optimizer.step()

    # NOTE(review): `loss` is the loss of the LAST sample of this batch only,
    # so both the value printed below and the stop test depend on one sample.
    # NOTE(review): `loss.data[0]` matches the pre-0.4 PyTorch API this
    # notebook was run with; newer releases spell it `loss.item()`.
    print('Epoch: {0}, loss: {1}, parameters: {2}, bias: {3}'.format(batch_idx, loss.data[0], fc.weight.data, fc.bias.data))
    if loss.data[0] < 1e-4:
        break

print('Loss: {:.6f} after {} batches'.format(loss.data[0], batch_idx))
print('==> Learned function:\t' + poly_desc(fc.weight.data.view(-1), fc.bias.data))
print('==> Actual function:\t' + poly_desc(W_target.view(-1), b_target))

Epoch: 1, loss: 2.037508249282837, parameters: 
-0.1283  0.3515  0.1398 -0.5500
[torch.FloatTensor of size 1x4]
, bias: 
 0.4764
[torch.FloatTensor of size 1]

Epoch: 2, loss: 1.3746095895767212, parameters: 
-0.0135  0.2965  0.3693 -0.8504
[torch.FloatTensor of size 1x4]
, bias: 
 0.5223
[torch.FloatTensor of size 1]

Epoch: 3, loss: 2.1931252479553223, parameters: 
 0.0586  0.2886  0.4437 -0.9662
[torch.FloatTensor of size 1x4]
, bias: 
 0.5832
[torch.FloatTensor of size 1]

Epoch: 4, loss: 7.806362152099609, parameters: 
 0.1237  0.2690  0.4788 -1.1452
[torch.FloatTensor of size 1x4]
, bias: 
 0.6556
[torch.FloatTensor of size 1]

Epoch: 5, loss: 0.14574770629405975, parameters: 
 0.2161  0.3051  0.5801 -1.1003
[torch.FloatTensor of size 1x4]
, bias: 
 0.7379
[torch.FloatTensor of size 1]

Epoch: 6, loss: 3.991950035095215, parameters: 
 0.2988  0.2574  0.6223 -1.2856
[torch.FloatTensor of size 1x4]
, bias: 
 0.7407
[torch.FloatTensor of size 1]

Epoch: 7, loss: 11.176223754882812, 


Epoch: 60, loss: 2.608917236328125, parameters: 
 3.2395  2.5370  2.7827 -3.1538
[torch.FloatTensor of size 1x4]
, bias: 
 2.7696
[torch.FloatTensor of size 1]

Epoch: 61, loss: 0.21624626219272614, parameters: 
 3.2825  2.5065  2.7758 -3.4314
[torch.FloatTensor of size 1x4]
, bias: 
 2.7396
[torch.FloatTensor of size 1]

Epoch: 62, loss: 0.5411902666091919, parameters: 
 3.3365  2.4688  2.9032 -4.0248
[torch.FloatTensor of size 1x4]
, bias: 
 2.7105
[torch.FloatTensor of size 1]

Epoch: 63, loss: 1.0713615417480469, parameters: 
 3.3594  2.5860  2.9046 -3.8118
[torch.FloatTensor of size 1x4]
, bias: 
 2.7387
[torch.FloatTensor of size 1]

Epoch: 64, loss: 1.1050376892089844, parameters: 
 3.3631  2.6752  2.7618 -3.7139
[torch.FloatTensor of size 1x4]
, bias: 
 2.7595
[torch.FloatTensor of size 1]

Epoch: 65, loss: 0.6095149517059326, parameters: 
 3.3852  2.7373  2.6677 -3.8473
[torch.FloatTensor of size 1x4]
, bias: 
 2.7633
[torch.FloatTensor of size 1]

Epoch: 66, loss: 1.89784049

## Observations on change in learning rate

For learning rate = 0.0005, loss = 0.000007<br>
For learning rate = 0.001, loss = 0.000025<br>
For learning rate = 0.005, loss = 0.000079<br>
For learning rate = 0.009, loss = 0.000004<br>
For learning rate = 0.01, loss = 0.000078<br>
For learning rate = 0.05, loss = 0.000006<br>

Some higher learning rates do reach a lower loss but take more batches to converge. This is probably because the loss fluctuates much more at higher learning rates. The best learning rate found is 0.0005.

# Question 2.b

In [216]:
# Load the data: columns 0-1 are the input features, column 2 is the target.
dataset = pd.read_csv('qn2_data.csv').values
X = torch.from_numpy(dataset[:, 0:2]).type(torch.FloatTensor)
y = torch.from_numpy(dataset[:, 2]).type(torch.FloatTensor)
# Target as an (n, 1) column so it lines up element-for-element with the
# (n, 1) output of `fc(X)`. Comparing (n, 1) against a flat (n,) target relies
# on how the installed PyTorch treats mismatched shapes (newer releases
# broadcast the pair to (n, n), which gives a wrong loss).
y_column = y.unsqueeze(1)

# Linear neuron: one input per feature column, one output.
fc = torch.nn.Linear(X.size(1), 1)
num_epochs = 80000
# Training stops as soon as the loss over the whole data set drops below this.
loss_threshold = 0.6

# Define optimizer
optimizer = optim.SGD(fc.parameters(), lr=0.0005)

converged = False
for epoch in range(num_epochs):
    for i in range(X.size(0)):
        # One stochastic update on sample i; the label keeps shape (1,) to
        # match the single-sample output of `fc`.
        x = Variable(X[i, :])
        label = Variable(y[i:i + 1])
        output = fc(x)
        loss = F.smooth_l1_loss(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Re-evaluate the loss on the full data set after every update so that
        # training can stop at the first step that reaches the threshold.
        # NOTE(review): `.data[0]` matches the pre-0.4 PyTorch API this
        # notebook was run with; newer releases spell it `.item()`.
        avg_loss = F.smooth_l1_loss(fc(Variable(X)), Variable(y_column))
        if avg_loss.data[0] < loss_threshold:
            converged = True
            break

    if converged:
        break

print(epoch)
print('Loss: {0}'.format(avg_loss.data[0]))
print('Weights: {0} \nBias{1}'.format(fc.weight.data, fc.bias.data))

# Predict the target for three unseen (feature_1, feature_2) inputs.
test_set = Variable(torch.Tensor([[6, 4], [10, 5], [14, 8]]))
predictions = fc(test_set)
print('Predictions: {0}'.format(predictions))

79999
Loss: 0.6411795616149902
Weights: 
 0.6302  1.1562
[torch.FloatTensor of size 1x2]
 
Bias
 32.0196
[torch.FloatTensor of size 1]

Predictions: Variable containing:
 40.4256
 44.1025
 50.0919
[torch.FloatTensor of size 3x1]



# Question 2.c

In [218]:
# Load the data: columns 0-1 are the input features, column 2 is the target.
dataset = pd.read_csv('qn2_data.csv').values
X = torch.from_numpy(dataset[:, 0:2]).type(torch.FloatTensor)
# Append a bias column of ones. It is created as (n, 1) so that both operands
# of the concatenation are 2-D; a flat (n,) vector only concatenates along
# dim 1 when the installed PyTorch tolerates mismatched ranks.
X = torch.cat((X, torch.ones(X.size(0), 1)), 1)
y = torch.from_numpy(dataset[:, 2]).type(torch.FloatTensor)
y_column = torch.unsqueeze(y, 1)
X_t = torch.t(X)

# Calculate weights using closed form: w = (X^T X)^-1 X^T y.
# The last entry of w is the bias, because the ones column was appended last.
w = torch.inverse(torch.mm(X_t, X))
w = torch.mm(w, X_t)
w = torch.mm(w, y_column)

# Calculate training loss on found weights. The target is passed as an (n, 1)
# column so it matches the (n, 1) prediction element-for-element instead of
# depending on how mismatched shapes are handled.
# NOTE(review): `.data[0]` matches the pre-0.4 PyTorch API this notebook was
# run with; newer releases spell it `.item()`.
loss = F.smooth_l1_loss(Variable(torch.mm(X, w)), Variable(y_column))
print('Loss:{0}'.format(loss.data[0]))

# Predict for three unseen inputs, each extended with the same bias column of ones.
test_set = Variable(torch.Tensor([[6, 4], [10, 5], [14, 8]]))
ones = Variable(torch.ones(test_set.size(0), 1))
test_set = torch.cat((test_set, ones), 1)
print('\nWeights (including bias): {0}'.format(w))
print('\nPredictions:')
predictions = torch.mm(test_set, Variable(w))
print(predictions)

Loss:0.6334885954856873

Weights (including bias): 
  0.5383
  1.2110
 32.8729
[torch.FloatTensor of size 3x1]


Predictions:
Variable containing:
 40.9469
 44.3112
 50.0975
[torch.FloatTensor of size 3x1]



### Observations

The loss, weights and predictions from the closed-form solution are similar to those of the linear neuron when a bias term is included in the closed form.<br>
Without the bias term, the results are very poor. <br>This shows that, at least on small data sets, the closed-form approach gives results similar to linear regression trained with gradient descent.