# Casting Classification as Regression, Regressing to Probabilities
1. We can turn classification labels into a one-hot vector.
2. We can regress to the vector.
3. To produce output classes, we can take the element with highest weight.
4. The regressed value can be interpreted as an (approximate) probability.

Regressing to probabilities is a useful trick, especially when we start thinking about confidences and unsupervised data analysis.

[Link to Fish Dataset Details](https://www.kaggle.com/aungpyaeap/fish-market)

In [1]:
import numpy as np
import csv

rows = []

with open('Fish.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        rows.append(row)

print(len(rows))
print(rows[0]) # first row is a header
print(rows[1])

rows = rows[1:]

labels = {} # Create a dictionary of label strings to numeric values
for row in rows:
    if row[0] not in labels:
        labels[row[0]]=len(labels)

print(labels)
        
inputs = np.array([row[1:] for row in rows])
#print(inputs)
real_outputs = np.array([labels[row[0]] for row in rows])
#print(outputs)

160
['\ufeffSpecies', 'Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']
['Bream', '242', '23.2', '25.4', '30', '11.52', '4.02']
{'Bream': 0, 'Roach': 1, 'Whitefish': 2, 'Parkki': 3, 'Perch': 4, 'Pike': 5, 'Smelt': 6}


In [2]:
def output_to_one_hot(categories, max_val):
    data = np.zeros((len(categories), max_val))
    data[np.arange(len(categories)), categories] = 1
    return data

In [3]:
def output_to_one_hot(categories, max_val):
    data = np.zeros((len(categories), max_val))
    data[np.arange(len(categories)), categories] = 1
    return data

encodings = output_to_one_hot(real_outputs, len(labels))
print(encodings[:10])
print(encodings[-10:])

[[1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1.]]


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, encodings)

# Assignment:
1. Define a network class that regresses to the 7 outputs.
2. Train a sufficiently large network to perform the categorization.
3. Measure the test accuracy of the model by counting the number of accurate labels

# Stretch Goals:
- Test out different network architectures (depth, breadth) and examine training performance.

In [None]:
from __future__ import print_function
import torch; print(torch.__version__)
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as utils
import matplotlib.pyplot as plt
%matplotlib inline



t_inputs=Variable(torch.from_numpy(inputs.astype(np.float32)))
t_encodings=Variable(torch.from_numpy(encodings.astype(np.float32)))

#t_inputs=Variable(torch.from_numpy(x_train.astype(np.float32)))
#t_encodings=Variable(torch.from_numpy(y_train.astype(np.float32)))

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # an affine operation: y = Wx + b
        # 2 hidden layers?
        self.fc1 = nn.Linear(6, 15)
        self.fc2 = nn.Linear(15, 15)
        self.fc3 = nn.Linear(15, 7)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
net = Net()
#print(net)

#t_inputs=net(Variable(torch.Tensor(inputs)))

net.zero_grad()
outputs = net(Variable(torch.Tensor([0,0,0,0,0,0])))
outputs.backward(torch.randn(7)) # Use random gradients to break symmetry?


learning_rate = 1 # Need to initialize carefully
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)
    
from tqdm import trange # Used to provide progress bar

losses = []

# create your optimizer
optimizer = optim.Adam(net.parameters())
criterion = nn.MSELoss()

num_epochs = 100000
t = trange(num_epochs)
for epoch in t:  # loop over the dataset multiple times

    running_loss = 0.0

    # wrap them in Variable
    #reshaped_inputs = t_inputs.view(-1, 1) # Structure with each input in its own row
    #reshaped_outputs = true_vals.view(-1, 1) # Neglecting to have outputs and true vals to match dimension is a common mistake.

    # forward + backward + optimize
    outputs = net(t_inputs)
    #print(outputs)
    #print(reshaped_outputs)
    
    #encodings = np.zeros((len(outputs), 7))
    #encodings[np.arange(len(outputs)), outputs] = 1
    #encodings = output_to_one_hot(outputs, len(labels))
    #loss = criterion(outputs, t_real_outputs)
    
    #print(outputs.shape)
    #print(t_real_outputs.shape)
    
    #loss = criterion(outputs, t_real_outputs)
    loss = criterion(outputs, t_encodings)
    losses.append(loss)
    
    loss.backward()
    optimizer.step()
    
    # zero the parameter gradients
    optimizer.zero_grad()
    
    t.set_description('ML (loss=%g)' % loss.item()) # Updates Loss information
    #t.set_description('ML Loss: ' + str(loss.item())) # Updates Loss information

print('Finished Training')

print(outputs)




1.3.0


ML (loss=0.0845054):   8%|▊         | 8473/100000 [00:09<01:27, 1040.59it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

ML (loss=0.0413645):  15%|█▌        | 15139/100000 [00:15<01:24, 1010.18it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

ML (loss=0.023817):  19%|█▉        | 18755/100000 [00:19<01:20, 1004.68it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid cr

In [33]:
real_categories = [max(range(len(vector)), key=lambda index:vector[index]) for vector in encodings]
categories = [max(range(len(vector)), key=lambda index:vector[index]) for vector in outputs]

score=0
for i in range(0, len(categories)):
    if categories[i]==real_categories[i]:
        score = score + 1
        

print('score =',score)
print('possible score=',len(categories))

score = 157
possible score= 159
