## Classification:
X => n_samples x n_features array  
Y => n_samples array (discrete labels)  
SVM: X => Y  

In [None]:
import matplotlib.pyplot as plt
from sklearn import svm, datasets
import numpy as np

In [None]:
# Draw N two-dimensional points scattered around three cluster centres.
# X holds the coordinates, Y the integer blob label of every point.
N = 300
X, Y = datasets.make_blobs(n_samples=N, n_features=2, centers=3)

# The first half of the samples is used for training, the second half for testing.
split = N // 2
train_X, test_X = X[:split, :], X[split:, :]
train_Y, test_Y = Y[:split], Y[split:]

In [None]:
# Show every sample as a black-edged dot coloured by its blob label.
plt.scatter(X[:, 0], X[:, 1], c=Y, s=25, marker='o', edgecolor='k')
plt.title("Three blobs")
plt.show()

In [None]:
# Fit an RBF-kernel SVM on the training half and label the held-out half.
model = svm.SVC(kernel='rbf', gamma=0.7)
model.fit(train_X, train_Y)
predicted_Y = model.predict(test_X)

# Select the test points whose predicted label differs from the true label.
# A boolean mask keeps this vectorised and always yields a (k, 2) array,
# even when k == 0, so the column indexing below is valid in every case.
mismatches = test_X[predicted_Y != test_Y]

plt.title("Predicted")
plt.scatter(test_X[:, 0], test_X[:, 1], marker='o', c=predicted_Y, s=25, edgecolor='k')
if len(mismatches) > 0:
    # Overlay a red cross on every misclassified point.
    plt.scatter(mismatches[:, 0], mismatches[:, 1], marker='x', c='red')
else:
    print('Perfect Test Performance!')
plt.show()

## A note about kernels
SVM "kernels" are functions that compute an inner product (think dot product) between two data points, as if the points had first been mapped into some other coordinate space. The result acts as a similarity measure in that space. These kernel functions can be used to simulate projecting the data to higher dimensions (think about bending a flat plane into a paraboloid embedded in 3 dimensions) without ever computing the projection explicitly. The SVM is still using linear separators (hyperplanes) in the higher dimension, but these separators can induce non-linear "decision boundaries" in the lower dimensional space.
If that all sounds confusing, don't worry about it. The important takeaway is that SVMs can represent very complicated classification functions. In practice, nearly everyone uses the radial basis function (RBF) kernel. The RBF kernel has a single parameter (usually called gamma) that you can adjust to possibly improve your accuracy.

# Train/Test/Validation Split

In [None]:
import json

# Read the complete book-sales data set from disk.
with open("./book_sales_data.json", "r") as infile:
    book_sales_data = json.load(infile)

# Sanity check: report the number of records, then show the first one.
print(len(book_sales_data), book_sales_data[0], sep="\n")

In [None]:
# Only run this once, ever. The validation data should never be changed.
# To enforce that, the split is skipped whenever a validation file already
# exists, so re-running the notebook cannot silently draw a new validation set.

import os

from sklearn import model_selection

if os.path.exists("./book_sales_validation_data.json"):
    print("Validation data already exists; keeping the existing split.")
    # Reload the saved split so work_data / validation_data are still defined.
    # A missing work file raises FileNotFoundError, which signals that the two
    # files on disk are no longer a consistent pair.
    with open("./book_sales_work_data.json", "r") as infile:
        work_data = json.load(infile)
    with open("./book_sales_validation_data.json", "r") as infile:
        validation_data = json.load(infile)
else:
    # Hold out a random 10% of the records as the validation set.
    work_data, validation_data = model_selection.train_test_split(book_sales_data, test_size = .10)

    with open("./book_sales_work_data.json", "w") as outfile:
        json.dump(work_data, outfile)

    with open("./book_sales_validation_data.json", "w") as outfile:
        json.dump(validation_data, outfile)

In [None]:
import math
import matplotlib.pyplot as plt
from sklearn import svm
import numpy as np

# Learning curve: use increasing amounts of training data and measure
# performance on the same test set every time.
with open("./book_sales_work_data.json", "r") as infile:
    book_sales_work_data = json.load(infile)

# Hold out 20% of the working data for testing; the rest is available for training.
train_data, test_data = model_selection.train_test_split(
    book_sales_work_data,
    test_size=.20,
)
print("Training Length: ", len(train_data), " Testing Length: ", len(test_data))

def convert_book_data_to_arrays(data_to_convert, regress=False):
    """Split book-sales records into model inputs and target values.

    Args:
        data_to_convert: sequence of dict records. Each record must provide the
            keys "book_details_time", "homepage_time" and
            "total_web_time_minutes", plus the target key selected by `regress`.
        regress: when False (classification) the target is read from
            "paying_customer"; when True (regression) it is read from
            "books_sold".

    Returns:
        A tuple (inputs, outputs): `inputs` is a list with one
        [book_details_time, homepage_time, total_web_time_minutes] row per
        record and `outputs` is the list of matching target values.
    """
    feature_keys = ("book_details_time", "homepage_time", "total_web_time_minutes")
    target_key = "books_sold" if regress else "paying_customer"

    inputs = [[record[key] for key in feature_keys] for record in data_to_convert]
    outputs = [record[target_key] for record in data_to_convert]
    return inputs, outputs

# Split the test set into inputs and expected outputs
test_X, test_Y = convert_book_data_to_arrays(test_data)

# Evaluate up to num_parts training-set sizes: 1/num_parts, 2/num_parts, ...
# of the training data (i.e. steps of 1% for num_parts = 100).
# Integer arithmetic is used because float products can land just below an
# integer (e.g. 0.29 * 100 evaluates to 28.999999999999996) and would then be
# floored one sample short. Sizes of zero are dropped because a model cannot
# be fit on an empty set, and duplicate sizes are collapsed.
num_parts = 100
data_sizes = sorted({(i + 1) * len(train_data) // num_parts for i in range(num_parts)} - {0})
accuracies = []
for data_size in data_sizes:
    train_subset = train_data[:data_size]
    train_X, train_Y = convert_book_data_to_arrays(train_subset)

    # train model
    model = svm.SVC(kernel='rbf', gamma=0.7)
    model.fit(train_X, train_Y)

    # measure accuracy on the test data: fraction of predictions equal to the label
    predicted_Y = model.predict(test_X)
    accuracy = np.mean(np.asarray(predicted_Y) == np.asarray(test_Y))
    accuracies.append(accuracy)

# plot accuracy vs. number of training points.
plt.plot(data_sizes, accuracies)
plt.xlabel('Number of Training Points')
plt.ylabel('Test Accuracy')
plt.ylim(0,1)
plt.show()

# Boosting and Ensembles

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# The AdaBoostClassifier uses shallow decision trees by default.

# Plot train and test accuracy vs. number of classifiers in the ensemble.
# train_X / train_Y / test_X / test_Y are whatever the previous cell left in
# scope (the book-sales inputs and labels).
num_classifiers = list(range(1, 200+1))
train_accuracies = []
accuracies = []
for n in num_classifiers:
    # Fit a fresh ensemble with exactly n weak learners.
    model = AdaBoostClassifier(n_estimators=n)
    model.fit(train_X, train_Y)

    # score() returns the mean accuracy of the model's predictions.
    # measure accuracy on the training data
    train_accuracies.append(model.score(train_X, train_Y))

    # measure accuracy on the test data
    accuracies.append(model.score(test_X, test_Y))

plt.plot(num_classifiers, accuracies, label="test accuracy")
plt.plot(num_classifiers, train_accuracies, label="train accuracy")
plt.xlabel('Number of Classifiers in the Ensemble')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Build a toy data set: 2-D points whose coordinates come from np.random.random().
total_toy_data = 200
toy_split = total_toy_data//2
all_data = [[np.random.random(), np.random.random()] for _ in range(total_toy_data)]
# below is just a made-up classification for the purpose of creating a harder but deterministic problem:
# a point is labelled True when its second coordinate lies above a wiggly curve of its first coordinate.
ys = [(x - .5)**3 - (x - .5)**2 + math.sin(x*20) < y for x, y in all_data]
toy_train_X, toy_train_Y = all_data[:toy_split], ys[:toy_split]
toy_test_X, toy_test_Y = all_data[toy_split:], ys[toy_split:]

# Plot train and test accuracy vs. number of classifiers in the ensemble
num_classifiers = list(range(1, 200+1))
train_accuracies = []
accuracies = []
for n in num_classifiers:
    # Fit a fresh ensemble with exactly n weak learners.
    model = AdaBoostClassifier(n_estimators=n)
    model.fit(toy_train_X, toy_train_Y)

    # score() returns the mean accuracy of the model's predictions.
    # measure accuracy on the training data
    train_accuracies.append(model.score(toy_train_X, toy_train_Y))

    # measure accuracy on the test data
    accuracies.append(model.score(toy_test_X, toy_test_Y))

plt.plot(num_classifiers, accuracies, label="test accuracy")
plt.plot(num_classifiers, train_accuracies, label="train accuracy")
plt.xlabel('Number of Classifiers in the Ensemble')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate on all of the working data (no manual train/test split needed).
with open("./book_sales_work_data.json", "r") as infile:
    all_work_data_X, all_work_data_Y = convert_book_data_to_arrays(json.load(infile))

# 10-fold cross-validation of the same RBF SVM configuration used earlier.
model = svm.SVC(gamma=0.7, kernel='rbf')
scores = cross_val_score(estimator=model, X=all_work_data_X, y=all_work_data_Y, cv=10)

# Show the per-fold scores followed by their average.
print(scores)
print(sum(scores)/len(scores))

# Neural Networks

In [None]:
# Load the working data and draw one 80/20 train/test split that is shared by
# the classification task and the regression task.
with open("./book_sales_work_data.json", "r") as infile:
    train_data, test_data = model_selection.train_test_split(json.load(infile), test_size=.20)

# Classification variant: the targets are the "paying_customer" values.
class_train_X, class_train_Y = convert_book_data_to_arrays(train_data, regress=False)
class_test_X, class_test_Y = convert_book_data_to_arrays(test_data, regress=False)

# Regression variant: same inputs, but the targets are the "books_sold" values.
regress_train_X, regress_train_Y = convert_book_data_to_arrays(train_data, regress=True)
regress_test_X, regress_test_Y = convert_book_data_to_arrays(test_data, regress=True)

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

# Define our neural network
class RegressionNet(nn.Module):
    """A one-hidden-layer network mapping 3 input features to 1 output value."""

    def __init__(self, width):
        """Create the two linear layers.

        Args:
            width: number of hidden units; should be a natural number. Values
                below 1 are clamped to 1.
        """
        width = max(width, 1)

        # Zero-argument super() resolves the parent from the defining class.
        # super(self.__class__, self) would recurse forever in any subclass,
        # because self.__class__ is then the subclass itself.
        super().__init__()

        # Define the pass-forward weights
        # Goes from 3 input features to `width` hidden units and back to 1
        # output, with a nonlinear function (ReLU) in between.
        self.forward_1 = nn.Linear(3, width)
        self.forward_2 = nn.Linear(width, 1)

    def forward(self, x):
        """Return the raw network output for input tensor `x` (last dimension 3)."""
        hidden = F.relu(self.forward_1(x))
        return self.forward_2(hidden)

    def infer_class(self, x):
        """Run a single sample forward and round the output to the nearest integer.

        Purely for convenience. Half-way values round up (floor(output + 0.5)).
        The forward pass runs under torch.no_grad() because the rounded result
        is never differentiated. `x` must produce exactly one output value.

        Returns:
            The rounded prediction as a Python int.
        """
        with torch.no_grad():
            output = self.forward(x)
        return int(torch.floor(output + .5).item())

# Sometimes called the "criterion"
# Mean squared error between the network output and the target value.
loss_function = nn.MSELoss()

In [None]:
def variable_wrap(some_data, convert_type=float):
    """Convert a number or (nested) list of numbers into a float32 torch tensor.

    Args:
        some_data: anything np.array() accepts, e.g. a scalar label or a list
            of feature values.
        convert_type: NumPy-compatible dtype the data is cast to before it is
            handed to torch. The default is the builtin `float`; the old
            `np.float` alias meant the same thing but was removed in
            NumPy 1.24, so referencing it raises AttributeError.

    Returns:
        A torch.Tensor of dtype float32 with the same shape as `some_data`.
        (torch.autograd.Variable is deprecated and merely returns a Tensor, so
        the tensor is returned directly.)
    """
    as_array = np.array(some_data).astype(convert_type)
    return torch.from_numpy(as_array).float()

def list_map(*args, **kwargs):
    """Apply map() eagerly and return the results as a list.

    All positional arguments (the function followed by one or more iterables)
    and any keyword arguments are forwarded to the builtin map() unchanged;
    the lazy iterator it returns is then unrolled into a list.
    """
    lazy_results = map(*args, **kwargs)
    return [*lazy_results]

In [None]:
from torch.optim import Adam

def learn_on_data(train_X, train_Y, test_X, test_Y, regress=False, model_width=10, file_base="./"):
    """Train a RegressionNet and plot its train/test performance after every pass.

    The network is trained with Adam for 300 passes over the training data,
    taking one optimizer step per pass on the gradient accumulated over all
    training points. After each pass the model is scored on the training set
    and on the test set; both curves are plotted once training has finished.

    Args:
        train_X: sequence of training inputs, one row of 3 feature values per sample.
        train_Y: sequence of training targets, one scalar per sample.
        test_X: sequence of test inputs, same layout as `train_X`.
        test_Y: sequence of test targets, same layout as `train_Y`.
        regress: if False, each prediction is rounded to the nearest integer
            and the score is the fraction of exact matches (accuracy); if
            True, the score is the mean absolute error.
        model_width: hidden-layer width handed to RegressionNet.
        file_base: currently unused; kept so existing callers keep working.

    Returns:
        None. The curves are displayed with matplotlib.
    """
    model = RegressionNet(model_width)
    optimizer = Adam(model.parameters())

    # Format the input to be read by the network.
    var_train_X, var_train_Y = list_map(variable_wrap, train_X), list_map(variable_wrap, train_Y)
    var_test_X, var_test_Y = list_map(variable_wrap, test_X), list_map(variable_wrap, test_Y)

    def _score(inputs, targets):
        """Return accuracy (classification) or mean absolute error (regression) as a float."""
        if not inputs:
            # Nothing to score; NaN leaves a gap in the plot instead of dividing by zero.
            return float("nan")
        # Scoring never needs gradients. Disabling autograd and reducing every
        # result to a plain Python float keeps graph-attached tensors out of
        # the lists that are later handed to matplotlib.
        with torch.no_grad():
            if not regress:
                per_sample = [float(model.infer_class(x) == y.item())
                              for x, y in zip(inputs, targets)]
            else:
                # In this setting measure error as absolute value of the difference
                per_sample = [abs(model(x) - y).item()
                              for x, y in zip(inputs, targets)]
        return sum(per_sample) / len(per_sample)

    train_accuracies = []
    test_accuracies = []

    for _ in range(300):
        model.train() # Set the network to training mode. Affects some layers sometimes.

        optimizer.zero_grad()
        # Loop through the data and use the gradient to optimize the model parameters.
        for X, Y in zip(var_train_X, var_train_Y):
            predicted_Y = model(X)
            # Give the target the prediction's shape so MSELoss does not have
            # to broadcast a 0-d target against a 1-element prediction.
            loss = loss_function(predicted_Y, Y.reshape(predicted_Y.shape))
            loss.backward() # Adds this sample's gradient to every parameter's .grad.
        optimizer.step() # Doing this outside the loop accumulates gradients across all data points.

        # Measure train and test performance
        model.eval() # Set model to evaluation (testing) mode. Affects some models some time.
        train_accuracies.append(_score(var_train_X, var_train_Y))
        test_accuracies.append(_score(var_test_X, var_test_Y))

    passes = range(len(train_accuracies))
    plt.plot(passes, train_accuracies, label="training_accuracy")
    plt.plot(passes, test_accuracies, label="testing_accuracy")
    plt.xlabel('Passes through data')
    if not regress:
        plt.ylabel('Accuracy (higher is better)')
        plt.title('Classification with a Neural Network')
    else:
        plt.ylabel('Average Error (lower is better)')
        plt.title('Regression with a Neural Network')
    plt.legend()
    plt.show()

# Train and plot once per task, using the same input features each time:
# first the classification task (targets read from "paying_customer") ...
learn_on_data(class_train_X, class_train_Y, class_test_X, class_test_Y, regress=False)
# ... then the regression task (targets read from "books_sold").
learn_on_data(regress_train_X, regress_train_Y, regress_test_X, regress_test_Y, regress=True)


# A note about saving:
For production, you'll generally want to save your most accurate model. Although that isn't demonstrated above, the commands are really simple:
`torch.save(model, file_handle)`
and
`model = torch.load(file_handle)`