In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from torch import nn, tensor, utils, optim, no_grad, Tensor
import torch

from micropyome.taxa import regression
from micropyome.datasets import normalize

DATASET_PATH = "data/averill_processed/bacteria/"
X = {}
Y = {}
for level in regression.TAXONOMIC_LEVELS:
    x = pd.read_csv(f"{DATASET_PATH}{level}/15_variables.csv")
    if level == 'fg':
        y = pd.read_csv(f"{DATASET_PATH}{level}/observed.csv")
    else:
        y = pd.read_csv(f"{DATASET_PATH}{level}/y_11groupTaxo.csv")

    x = x.drop(x.columns[0], axis=1)
    x = x.drop("longitude", axis=1)
    x = normalize(x)
    y = y.drop(y.columns[0], axis=1)

    X[level] = x
    Y[level] = y

In [73]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X["fg"].values, Y["fg"].values, test_size=0.2
)

In [74]:
class AutoEncoder(torch.nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(n_features, n_features),
            nn.ReLU(),
            nn.Linear(n_features, 9),
            nn.ReLU(),
            nn.Linear(9, 6),
            nn.LeakyReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(6, 9),
            nn.LeakyReLU(),
            nn.Linear(9, n_features),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

In [75]:
EPOCHS = 200

autoencoder = AutoEncoder(len(x_train[0]))
loss_function = nn.MSELoss()
optimizer = optim.Adagrad(autoencoder.parameters(), lr=1e-4)

trainloader = utils.data.DataLoader(
    [(x, y) for x, y in zip(x_train, y_train)], batch_size=16
)
testloader = utils.data.DataLoader(
    [(x, y) for x, y in zip(x_test, y_test)], batch_size=16
)
for epoch in range(EPOCHS):
    current_loss = 0.0
    for i, (inputs, _) in enumerate(trainloader, 0):
        inputs = inputs.float()
        optimizer.zero_grad()
        encoded, decoded = autoencoder(inputs)
        loss = loss_function(decoded, inputs)  # Auto-encoding
        loss.backward()
        optimizer.step()
        current_loss += loss.item()
        if i%100 == 0:
            print(f'Loss after mini-batch %5d: %.3f'%(i+1, current_loss))
            current_loss = 0.0
    print(f'Epoch {epoch+1} finished')

print("Training has completed")

Loss after mini-batch     1: 0.070
Epoch 1 finished
Loss after mini-batch     1: 0.070
Epoch 2 finished
Loss after mini-batch     1: 0.070
Epoch 3 finished
Loss after mini-batch     1: 0.070
Epoch 4 finished
Loss after mini-batch     1: 0.070
Epoch 5 finished
Loss after mini-batch     1: 0.070
Epoch 6 finished
Loss after mini-batch     1: 0.070
Epoch 7 finished
Loss after mini-batch     1: 0.070
Epoch 8 finished
Loss after mini-batch     1: 0.070
Epoch 9 finished
Loss after mini-batch     1: 0.070
Epoch 10 finished
Loss after mini-batch     1: 0.070
Epoch 11 finished
Loss after mini-batch     1: 0.070
Epoch 12 finished
Loss after mini-batch     1: 0.070
Epoch 13 finished
Loss after mini-batch     1: 0.070
Epoch 14 finished
Loss after mini-batch     1: 0.070
Epoch 15 finished
Loss after mini-batch     1: 0.069
Epoch 16 finished
Loss after mini-batch     1: 0.069
Epoch 17 finished
Loss after mini-batch     1: 0.069
Epoch 18 finished
Loss after mini-batch     1: 0.069
Epoch 19 finished
Lo

In [76]:
autoencoder.eval()
with no_grad():
    encoded, outputs = autoencoder(Tensor(x_test))
    predicted_labels = outputs.squeeze().tolist()

for i, o in zip(predicted_labels[0], x_test[0]):
    print(f"{i:.4}    {o:.4}        {abs(i-o):.4}")


0.4546    0.7928        0.3382
0.5561    0.2487        0.3074
0.4952    0.0        0.4952
0.4949    0.3248        0.1701
0.4357    0.5574        0.1217
0.5524    0.3769        0.1755
0.5156    0.497        0.01859
0.5013    0.09        0.4113
0.5348    0.05604        0.4787
0.4463    0.08617        0.3602
0.4573    0.5932        0.1359
0.4954    0.6735        0.1781
0.4835    0.7968        0.3133
0.5129    0.4418        0.07109


In [77]:
from torch import no_grad, Tensor
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(x_test, predicted_labels)
r2 = r2_score(x_test, predicted_labels)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)

Mean Squared Error: 0.07702980850449663
R2 Score: -1.1041962995204788


In [78]:
class Predictor(torch.nn.Module):
    def __init__(self, n_inputs, n_features):
        super().__init__()
        self.predictor = nn.Sequential(
            nn.Linear(n_inputs, 9),
            nn.ReLU(),
            nn.Linear(9, 9),
            nn.ReLU(),
            nn.Linear(9, n_features),
            nn.Softmax()
        )

    def forward(self, x):
        return self.predictor(x)

In [79]:
EPOCHS = 200

predictor = Predictor(len(encoded[0]), len(y_train[0]))
loss_function = nn.MSELoss()
optimizer = optim.Adagrad(predictor.parameters(), lr=1e-4)

trainloader = utils.data.DataLoader(
    [(x, y) for x, y in zip(encoded, y_train)], batch_size=16
)

for epoch in range(EPOCHS):
    current_loss = 0.0
    for i, (inputs, y) in enumerate(trainloader, 0):
        inputs, y = inputs.float(), y.float()
        optimizer.zero_grad()
        prediction = predictor(inputs)
        loss = loss_function(prediction, y)
        loss.backward()
        optimizer.step()
        current_loss += loss.item()
        if i%100 == 0:
            print(f'Loss after mini-batch %5d: %.3f'%(i+1, current_loss))
            current_loss = 0.0
    print(f'Epoch {epoch+1} finished')

print("Training has completed")

  return self._call_impl(*args, **kwargs)


Loss after mini-batch     1: 0.052
Epoch 1 finished
Loss after mini-batch     1: 0.052
Epoch 2 finished
Loss after mini-batch     1: 0.052
Epoch 3 finished
Loss after mini-batch     1: 0.052
Epoch 4 finished
Loss after mini-batch     1: 0.052
Epoch 5 finished
Loss after mini-batch     1: 0.052
Epoch 6 finished
Loss after mini-batch     1: 0.052
Epoch 7 finished
Loss after mini-batch     1: 0.052
Epoch 8 finished
Loss after mini-batch     1: 0.052
Epoch 9 finished
Loss after mini-batch     1: 0.052
Epoch 10 finished
Loss after mini-batch     1: 0.052
Epoch 11 finished
Loss after mini-batch     1: 0.052
Epoch 12 finished
Loss after mini-batch     1: 0.052
Epoch 13 finished
Loss after mini-batch     1: 0.052
Epoch 14 finished
Loss after mini-batch     1: 0.052
Epoch 15 finished
Loss after mini-batch     1: 0.052
Epoch 16 finished
Loss after mini-batch     1: 0.052
Epoch 17 finished
Loss after mini-batch     1: 0.052
Epoch 18 finished
Loss after mini-batch     1: 0.052
Epoch 19 finished
Lo

In [80]:
predictor.eval()
with no_grad():
    predictions = predictor(Tensor(encoded))
    predicted_labels = predictions.squeeze().tolist()

for i, o in zip(predicted_labels[0], y_test[0]):
    print(f"{i:.4}    {o:.4}        {abs(i-o):.4}")

0.06491    0.83        0.7651
0.06485    0.003182        0.06167
0.08993    0.01156        0.07837
0.05885    0.00681        0.05204
0.07233    0.003249        0.06908
0.05857    0.03732        0.02126
0.05981    0.01776        0.04205
0.07393    0.01831        0.05563
0.06874    0.02216        0.04658
0.05719    0.003617        0.05357
0.09101    0.0009327        0.09008
0.08145    0.02045        0.061
0.08368    0.001876        0.08181
0.07475    0.02274        0.052


In [81]:
mse = mean_squared_error(y_test, predicted_labels)
r2 = r2_score(y_test, predicted_labels)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)

Mean Squared Error: 0.05032434526618302
R2 Score: -2600.2380080626176
