In [1]:
import torch
import pandas
from torch.utils.data import Dataset

In [2]:
class MushroomDataset(Dataset):
    def __init__(self):
         self.data = pandas.read_csv('mushrooms.csv')
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data.iloc[idx][0:1]

In [3]:
shrooms = MushroomDataset()
len(shrooms), shrooms[0]

(8124,
 class    p
 Name: 0, dtype: object)

In [2]:
class MushroomDataset(Dataset):
    def __init__(self):
         self.data = pandas.read_csv('mushrooms.csv')
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        if type(idx) is torch.Tensor:
            idx = idx.item()
        return self.data.iloc[idx][1:], self.data.iloc[idx][0:1]

In [3]:
shrooms = MushroomDataset()
shrooms[0]

(cap-shape                   x
 cap-surface                 s
 cap-color                   n
 bruises                     t
 odor                        p
 gill-attachment             f
 gill-spacing                c
 gill-size                   n
 gill-color                  k
 stalk-shape                 e
 stalk-root                  e
 stalk-surface-above-ring    s
 stalk-surface-below-ring    s
 stalk-color-above-ring      w
 stalk-color-below-ring      w
 veil-type                   p
 veil-color                  w
 ring-number                 o
 ring-type                   p
 spore-print-color           k
 population                  s
 habitat                     u
 Name: 0, dtype: object,
 class    p
 Name: 0, dtype: object)

In [6]:
number_for_testing = int(len(shrooms) * 0.05)
number_for_training = len(shrooms) - number_for_testing

train, test = torch.utils.data.random_split(shrooms,
                                           [number_for_training, number_for_testing])
len(test), len(train)

(406, 7718)

In [7]:
test[0]

(cap-shape                   x
 cap-surface                 f
 cap-color                   n
 bruises                     f
 odor                        n
 gill-attachment             f
 gill-spacing                c
 gill-size                   n
 gill-color                  n
 stalk-shape                 e
 stalk-root                  e
 stalk-surface-above-ring    s
 stalk-surface-below-ring    s
 stalk-color-above-ring      w
 stalk-color-below-ring      w
 veil-type                   p
 veil-color                  w
 ring-number                 o
 ring-type                   p
 spore-print-color           n
 population                  v
 habitat                     u
 Name: 1026, dtype: object,
 class    e
 Name: 1026, dtype: object)

In [12]:
class OneHotEncoder():
    def __init__(self, series):
        unique_values = series.unique()
        self.ordinals = {
        val : i for i, val in enumerate(unique_values)
        }
        self.encoder = torch.eye(
        len(unique_values), len(unique_values)
        )
    def __getitem__(self, value):
        return self.encoder[self.ordinals[value]]

In [17]:
class CatagoricalCSV(Dataset):
    def __init__(self, datafile, output_series_name):
        self.dataset = pandas.read_csv(datafile)
        self.output_series_name = output_series_name
        self.encoders = {}
        for series_name, series in self.dataset.items():
            self.encoders[series_name] = OneHotEncoder(series)
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, index):
        if type(index) is torch.Tensor:
            index = index.item()
        sample = self.dataset.iloc[index]
        output = self.encoders[self.output_series_name][
            sample[self.output_series_name]
        ]
        input_components = []
        for name, value in sample.items():
            if name != self.output_series_name:
                input_components.append(
                self.encoders[name][value]
                )
        input = torch.cat(input_components)
        return input, output

In [18]:
shrooms = CatagoricalCSV('mushrooms.csv', 'class')
shrooms[0]

(tensor([1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
         0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
         1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 tensor([1., 0.]))

In [19]:
class Model(torch.nn.Module):
    def __init__(self, input_dimensions, output_dimensions, size = 128):
        super().__init__()
        self.layer_one = torch.nn.Linear(input_dimensions, size)
        self.activation_one = torch.nn.ReLU()
        self.layer_two = torch.nn.Linear(size, size)
        self.activation_two = torch.nn.ReLU()
        
        self.shape_outputs = torch.nn.Linear(size, output_dimensions)
        
        
    def forward(self, inputs):
        buffer = self.layer_one(inputs)
        buffer = self.activation_one(buffer)
        buffer = self.layer_two(buffer)
        buffer = self.activation_two(buffer)
        buffer = self.shape_outputs(buffer)
        return torch.nn.functional.softmax(buffer, dim=-1)


In [4]:
shrooms[0][0].shape[0]

22

In [20]:
model = Model(shrooms[0][0].shape[0], shrooms[0][1].shape[0])
optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.BCELoss()

In [21]:
number_for_testing = int(len(shrooms) * 0.05)
number_for_training = len(shrooms) - number_for_testing

train, test = torch.utils.data.random_split(shrooms,
                                           [number_for_training, number_for_testing])
len(test), len(train)

(406, 7718)

In [22]:
training = torch.utils.data.DataLoader(train, batch_size = 16, shuffle = True)

for epoch in range(4):
    for inputs, outputs in training:
        optimizer.zero_grad()
        results = model(inputs)
        loss = loss_function(results, outputs)
        loss.backward()
        optimizer.step()
        
    print("Loss: {0}".format(loss))

Loss: 1.1547460417205002e-05
Loss: 5.941860308666946e-06
Loss: 7.273612936842255e-06
Loss: 2.825445903908985e-07


In [26]:
import sklearn.metrics

In [27]:
testing = torch.utils.data.DataLoader(test, batch_size = len(test), shuffle=False)

for inputs, outputs in testing:
    results = model(inputs).argmax(dim=1).numpy()
    actual = outputs.argmax(dim=1).numpy()
    accuracy = sklearn.metrics.accuracy_score(actual, results)
    print(accuracy)

1.0


In [28]:
sklearn.metrics.confusion_matrix(actual, results)

array([[193,   0],
       [  0, 213]], dtype=int64)

In [29]:
print(sklearn.metrics.classification_report(actual, results))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       193
           1       1.00      1.00      1.00       213

    accuracy                           1.00       406
   macro avg       1.00      1.00      1.00       406
weighted avg       1.00      1.00      1.00       406

