In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import neuralNet
import numpy as np
from sklearn import datasets
import random
import homology as hm
import util
import datetime

import importlib

# Neural Network

This notebook will contain an example use of the PersLay implementation. The task here is to distinguish whether a dataset is generated by ```make_blobs()``` or ```make_circles()```.

As a first step we create our labeled data:

In [51]:
# Create datasets labeled
def create_datasets(train_entries, test_entries):
    train_labels = []
    train_dataset = []
    test_labels = []
    test_dataset = []

    for i in range(train_entries):
        if random.random() >= .5:
            circles, _ = datasets.make_circles(n_samples=20, random_state=42, factor=.5)
            train_labels.append(0)

            distances = util.create_distance_matrix(circles)
            diagram, l = hm.rips(distances, dimensions=1)
            diagram = hm.filtered_complexes_to_tuples(diagram, l)
            diagram[diagram == np.inf] = 10

            train_dataset.append(diagram)

        else:
            blobs, _ = datasets.make_blobs(n_samples=20, random_state=42, center_box=(1, 1), centers=3)
            train_labels.append(1)

            distances = util.create_distance_matrix(blobs)
            diagram, l = hm.rips(distances, dimensions=1)
            diagram = hm.filtered_complexes_to_tuples(diagram, l)
            diagram[diagram == np.inf] = 10

            train_dataset.append(diagram)

        if i % 10 == 0:
            print(f"{i + 1} out of {train_entries} datasets created.")

    for i in range(test_entries):
        if random.random() >= .5:
            circles, _ = datasets.make_circles(n_samples=20, random_state=42, factor=.5)
            test_labels.append(0)

            distances = util.create_distance_matrix(circles)
            diagram, l = hm.rips(distances, dimensions=1)
            diagram = hm.filtered_complexes_to_tuples(diagram, l)
            diagram[diagram == np.inf] = 10

            test_dataset.append(diagram)

        else:
            blobs, _ = datasets.make_blobs(n_samples=20, random_state=42, center_box=(1, 1), centers=3)
            test_labels.append(1)

            distances = util.create_distance_matrix(blobs)
            diagram, l = hm.rips(distances, dimensions=1)
            diagram = hm.filtered_complexes_to_tuples(diagram, l)
            diagram[diagram == np.inf] = 10

            test_dataset.append(diagram)

        if i % 10 == 0:
            print(f"{i + 1} out of {test_entries} datasets created.")

    return train_dataset, train_labels, test_dataset, test_labels


train_dataset, train_labels, test_dataset, test_labels = create_datasets(200,25)

1 out of 200 datasets created.
11 out of 200 datasets created.
21 out of 200 datasets created.
31 out of 200 datasets created.
41 out of 200 datasets created.
51 out of 200 datasets created.
61 out of 200 datasets created.
71 out of 200 datasets created.
81 out of 200 datasets created.
91 out of 200 datasets created.
101 out of 200 datasets created.
111 out of 200 datasets created.
121 out of 200 datasets created.
131 out of 200 datasets created.
141 out of 200 datasets created.
151 out of 200 datasets created.
161 out of 200 datasets created.
171 out of 200 datasets created.
181 out of 200 datasets created.
191 out of 200 datasets created.
1 out of 25 datasets created.
11 out of 25 datasets created.
21 out of 25 datasets created.


We create a Dataset instance to use the data with the Pytorch dataloader:

In [191]:
class TorchDataset(Dataset):
    def __init__(self, data, labels):
        self.data = torch.unsqueeze(torch.tensor(data),1)
        print(self.data.shape)
        self.labels = torch.tensor(labels)
        print(self.labels.shape)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

train_data = TorchDataset(train_dataset, train_labels)
test_data = TorchDataset(test_dataset, test_labels)

torch.Size([200, 1, 191, 2])
torch.Size([200])
torch.Size([25, 1, 191, 2])
torch.Size([25])


Then we need a function to train our PersLay:

In [273]:
# Altered this function from the deep learning course. It trains a Pytorch model using a given optimizer and data_loader
def train(n_epochs, optimizer, model, loss_fn, train_loader):
    n_batch = len(train_loader)
    losses_train = []
    model.train()
    optimizer.zero_grad(set_to_none=True)

    for epoch in range(1, n_epochs + 1):

        loss_train = 0.0
        for x, labels in train_loader:

            x = x.to(dtype=torch.double)
            labels = labels
            outputs = model(x)

            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            loss_train += loss.item()

        losses_train.append(loss_train / n_batch)

        print('{}  |  Epoch {}  |  Training loss {:.5f}'.format(
            datetime.datetime.now().time(), epoch, loss_train / n_batch))

Initialize all variables:

In [274]:
importlib.reload(neuralNet)  # For debugging, reload the import each run

model = neuralNet.PersLay(2)
train_loader = DataLoader(train_data, batch_size=20, shuffle=True)
optimizer = optim.SGD(model.parameters(), lr=.01)
n_epochs = 10
loss_fn = nn.CrossEntropyLoss()

In [275]:
train(n_epochs, optimizer, model, loss_fn, train_loader)  # Train the model

00:32:39.362918  |  Epoch 1  |  Training loss 0.69183
00:32:39.394907  |  Epoch 2  |  Training loss 0.68165
00:32:39.425898  |  Epoch 3  |  Training loss 0.67964
00:32:39.451888  |  Epoch 4  |  Training loss 0.67817
00:32:39.482878  |  Epoch 5  |  Training loss 0.67620
00:32:39.523866  |  Epoch 6  |  Training loss 0.66669
00:32:39.561853  |  Epoch 7  |  Training loss 0.66497
00:32:39.599840  |  Epoch 8  |  Training loss 0.65778
00:32:39.628831  |  Epoch 9  |  Training loss 0.65446
00:32:39.657823  |  Epoch 10  |  Training loss 0.64973


Now we see the model does learn a little bit, but not too much, maybe because the dataset isn't all that big, maybe because we only use 1 PersLay, with no additional neural network layers.

In [276]:
def compute_accuracy(model, loader): # Again an altered function from the deep learning course.
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data, labels in loader:
            data = data.to(dtype=torch.double)
            labels = labels
            outputs = model(data)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.shape[0]
            correct += int((predicted == labels).sum())

    acc =  correct / total
    print("Accuracy: {:.2f}".format(acc))
    return acc

In [277]:
test_loader = DataLoader(test_data, batch_size=25, shuffle=True)

Now we calculate the accuracy of the model:

In [278]:
compute_accuracy(model, test_loader)

Accuracy: 0.56


0.56

It's not great, but at least a bit better than flipping a coin.