# Importing packages

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
from google.colab import files
from google.colab import drive
# Mount Google Drive under /content/drive; FCNK loads its .npy datasets from
# a folder below /content/drive/MyDrive. force_remount=True requests a fresh
# mount even if the drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)

# Installing UGent Panno font

Before running the next cell, complete the following steps:

Download this: https://www.ugent.be/intranet/nl/op-het-werk/communicatie/huisstijl-presentaties/huisstijl/panno-text.zip
Unzip 'panno-text.zip'

Open and unzip '150831-ugentpannotext-v300-truetype.zip'

Upload 'UGentPannoText-Normal.ttf' to the working directory of this Colab session (in the left sidebar, click the folder icon, then upload the file). The next cell looks for font files in that directory.

In [None]:
# Register every font file found under the current working directory with
# matplotlib's font manager, list what was found, and make
# 'UGent Panno Text' the default font family for all later figures.
font_files = fm.findSystemFonts(fontpaths='.')

for font_file in font_files:
    fm.fontManager.addfont(font_file)
print(font_files)
plt.rcParams['font.family'] = 'UGent Panno Text'

['/content/UGentPannoText-Normal.ttf', '/content/UGentPannoText-Medium.ttf', '/content/UGentPannoText-SemiLight.ttf', '/content/UGentPannoText-SemiBold.ttf']


# Defining plot layout

In [None]:
# Start from the 'bmh' style sheet, then override text sizes and weights in
# a single update so the whole layout is visible at a glance.
plt.style.use('bmh')
plt.rcParams.update({
    'font.size': 10,
    'axes.labelsize': 10,
    'axes.labelweight': 'bold',
    'axes.titlesize': 10,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'legend.fontsize': 10,
    'figure.titlesize': 12,
})

# Model definition

In [None]:
class _KmerDataset(Dataset):
    """In-memory dataset pairing a feature matrix with integer class labels."""

    def __init__(self, x, y):
        # Features are stored as float32 and labels as int64 (torch.long),
        # both explicitly on the CPU.
        self.x = torch.tensor(x, dtype=torch.float32, device='cpu')
        self.y = torch.tensor(y, dtype=torch.long, device='cpu')
        self.length = self.x.shape[0]

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def __len__(self):
        return self.length


class _FCN(nn.Module):
    """Fully connected classifier: two ReLU hidden layers and a linear output."""

    def __init__(self, input_shape, output_shape, shape1, shape2):
        super().__init__()
        self.fc1 = nn.Linear(input_shape, shape1)
        self.fc2 = nn.Linear(shape1, shape2)
        self.fcout = nn.Linear(shape2, output_shape)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        # Raw logits are returned; nn.CrossEntropyLoss applies log-softmax itself.
        return self.fcout(hidden)


def _run_epoch(model, loader, loss_fn, optimizer=None):
    """Run one pass over `loader` and return (mean loss, accuracy in %).

    When `optimizer` is given the model is put in train mode and updated after
    every batch; otherwise the model is put in eval mode and gradients are
    disabled.
    """
    training = optimizer is not None
    model.train(training)
    summed_loss = 0.0
    correct = 0
    seen = 0
    with torch.set_grad_enabled(training):
        for x_batch, y_batch in loader:
            output = model(x_batch)
            loss = loss_fn(output, y_batch)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # loss is a batch mean, so weight it by the batch size before summing.
            summed_loss += loss.item() * x_batch.size(0)
            correct += (torch.argmax(output, dim=1) == y_batch).sum().item()
            seen += y_batch.size(0)
    return summed_loss / len(loader.dataset), 100. * correct / seen


def _plot_history(train_values, val_values, title, ylabel, filename):
    """Plot per-epoch training/validation curves, save as `filename`, download and show."""
    # A dedicated figure per plot keeps the curves from piling up on one axes
    # when the active backend does not close figures on plt.show().
    fig, ax = plt.subplots()
    # 1-based x values so the axis matches the epoch numbers printed in the log.
    epoch_axis = range(1, len(train_values) + 1)
    ax.plot(epoch_axis, train_values, label='Training', color='#1E64C8', linewidth=1)
    ax.plot(epoch_axis, val_values, label='Validation', color='black', linewidth=1)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    fig.savefig(filename)
    files.download(filename)
    plt.show()
    plt.close(fig)


def FCNK(kmer, tax, epochsize, seed=None,
         base_path='/content/drive/MyDrive/BachelorsProject/FinalModels/5AMBI/'):
    """Train, validate and test a fully connected network on k-mer features.

    Loads six .npy arrays from `base_path`
    (``{Train,Validation,Test}_X_{kmer}1A.npy`` and
    ``{Train,Validation,Test}_Y_{tax}1A.npy``), trains a three-layer network
    with Adam (lr 0.001, batch size 100) and cross-entropy loss, prints
    per-epoch training/validation statistics and a scikit-learn classification
    report for the test set, and finally saves, downloads and shows SVG plots
    of the loss and accuracy curves.

    Args:
        kmer: Feature-set tag used in the X file names (e.g. '3mer').
        tax: Label tag used in the Y file names (e.g. 'PHYLUM'). It is
            inserted into the file names verbatim, so its case must match
            the files on disk.
        epochsize: Number of training epochs.
        seed: Optional integer passed to torch.manual_seed before the model
            is built, making weight initialisation and batch shuffling
            repeatable. None (default) leaves the RNG state untouched.
        base_path: Directory that contains the .npy files.

    Returns:
        None. All results are printed, plotted or written to SVG files in the
        current working directory.
    """
    import os  # local import: only needed to build the dataset paths

    print(f'Initiating training, validation and testing on {tax} level with {kmer}.')

    if seed is not None:
        torch.manual_seed(seed)

    def load(split, axis, tag):
        return np.load(os.path.join(base_path, f'{split}_{axis}_{tag}1A.npy'))

    # Load the existing training, validation and test datasets.
    TrainX = load('Train', 'X', kmer)
    TrainY = load('Train', 'Y', tax)
    TestX = load('Test', 'X', kmer)
    TestY = load('Test', 'Y', tax)
    ValX = load('Validation', 'X', kmer)
    ValY = load('Validation', 'Y', tax)
    print('Training, test and validation datasets are loaded...')

    batches = 100
    trainloader = DataLoader(_KmerDataset(TrainX, TrainY), batch_size=batches, shuffle=True)
    valloader = DataLoader(_KmerDataset(ValX, ValY), batch_size=batches, shuffle=False)
    testloader = DataLoader(_KmerDataset(TestX, TestY), batch_size=batches, shuffle=False)
    print('Loading trainset, trainloader, testset, testloader ...')

    learning_rate = 0.001
    epochs = epochsize
    # Hidden layers shrink to 3/4 and then 1/4 of the input width.
    input_size = TrainX.shape[1]
    size1 = input_size * 3 // 4
    size2 = input_size * 1 // 4
    # NOTE(review): assumes labels are encoded as contiguous integers
    # 0..K-1 and that every class occurs in the training set; otherwise
    # CrossEntropyLoss will reject out-of-range targets - confirm.
    output_size = len(np.unique(TrainY))
    model = _FCN(input_shape=input_size, output_shape=output_size, shape1=size1, shape2=size2)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()
    print('Setting hyperparameters...')

    training_losses = []
    training_accuracies = []
    validation_losses = []
    validation_accuracies = []

    print('Training model...')
    for epoch in range(epochs):
        epoch_loss, epoch_acc = _run_epoch(model, trainloader, loss_fn, optimizer)
        print(f'Epoch [{epoch + 1}] Training Loss: {epoch_loss:.4f}, Training Accuracy: {epoch_acc:.2f}%')
        training_losses.append(epoch_loss)
        training_accuracies.append(epoch_acc)

        epoch_val_loss, epoch_val_acc = _run_epoch(model, valloader, loss_fn)
        print(f'Epoch [{epoch + 1}] Validation Loss: {epoch_val_loss:.4f}, Validation Accuracy: {epoch_val_acc:.2f}%')
        validation_losses.append(epoch_val_loss)
        validation_accuracies.append(epoch_val_acc)

    # Testing: switch to eval mode explicitly instead of relying on the state
    # left behind by the last validation pass (which never runs for 0 epochs).
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for x_test, y_test in testloader:
            y_pred += torch.argmax(model(x_test), dim=1).tolist()
            y_true += y_test.tolist()
    # classification_report returns a formatted string here, not a dict.
    report = metrics.classification_report(y_true, y_pred, digits=3)
    print(report)

    # Cross-entropy loss is not a percentage, so the axis is labelled plainly.
    _plot_history(
        training_losses, validation_losses,
        title=f'Training and Validation Loss of the FCN on {tax} level with {kmer}',
        ylabel='Loss',
        filename=f'FCNK{tax}{kmer}Loss1A.svg',
    )
    _plot_history(
        training_accuracies, validation_accuracies,
        title=f'Training and Validation Accuracy of the FCN on {tax} level with {kmer}',
        ylabel='Accuracy (in %)',
        filename=f'FCNK{tax}{kmer}Accuracy1A.svg',
    )
    print(f'Training, validation and testing on {tax} level with {kmer} is completed.')

# 3mer

In [None]:
# 3-mer features, phylum-level labels, 200 training epochs.
FCNK('3mer', 'PHYLUM', 200)

Initiating training, validation and testing on PHYLUM level with 3mer.
Training, test and validation datasets are loaded...
Loading trainset, trainloader, testset, testloader ...
Setting hyperparameters...
Training model...
Epoch [1] Training Loss: 1.0205, Training Accuracy: 59.73%
Epoch [1] Validation Loss: 0.7042, Validation Accuracy: 80.91%
Epoch [2] Training Loss: 0.4261, Training Accuracy: 86.87%
Epoch [2] Validation Loss: 0.2769, Validation Accuracy: 90.92%
Epoch [3] Training Loss: 0.2348, Training Accuracy: 92.69%
Epoch [3] Validation Loss: 0.1971, Validation Accuracy: 93.69%
Epoch [4] Training Loss: 0.1883, Training Accuracy: 93.73%
Epoch [4] Validation Loss: 0.1732, Validation Accuracy: 94.38%
Epoch [5] Training Loss: 0.1656, Training Accuracy: 94.45%
Epoch [5] Validation Loss: 0.1497, Validation Accuracy: 95.06%
Epoch [6] Training Loss: 0.1484, Training Accuracy: 95.02%
Epoch [6] Validation Loss: 0.1352, Validation Accuracy: 95.40%
Epoch [7] Training Loss: 0.1358, Training Ac

In [None]:
# 3-mer features, class-level labels, 200 training epochs.
FCNK('3mer', 'CLASS', 200)

In [None]:
# 3-mer features, order-level labels, 200 training epochs.
FCNK('3mer', 'ORDER', 200)

In [None]:
# 3-mer features, family-level labels, 200 training epochs.
FCNK('3mer', 'FAMILY', 200)

In [None]:
# 3-mer features, genus-level labels, 200 training epochs.
FCNK('3mer', 'GENUS', 200)

# 4mer

In [None]:
# 4-mer features, phylum-level labels, 200 training epochs.
# NOTE(review): the 3mer cells pass the rank in upper case ('PHYLUM') while
# this cell uses lower case. FCNK inserts `tax` into the label file names
# verbatim, so confirm the files exist under this spelling.
FCNK('4mer', 'phylum', 200)

In [None]:
# 4-mer features, class-level labels, 200 training epochs.
FCNK('4mer', 'class', 200)

In [None]:
# 4-mer features, order-level labels, 200 training epochs.
FCNK('4mer', 'order', 200)

In [None]:
# 4-mer features, family-level labels, 200 training epochs.
FCNK('4mer', 'family', 200)

In [None]:
# 4-mer features, genus-level labels, 200 training epochs.
FCNK('4mer', 'genus', 200)

# 5mer

In [None]:
# 5-mer features, phylum-level labels, 25 training epochs.
# NOTE(review): the 5mer cells train for 25 epochs whereas the 3mer/4mer
# cells use 200 - presumably to limit runtime on the wider input; confirm
# this difference is intended when comparing results across k-mer sizes.
FCNK('5mer', 'phylum', 25)

In [None]:
# 5-mer features, class-level labels, 25 training epochs.
FCNK('5mer', 'class', 25)

In [None]:
# 5-mer features, order-level labels, 25 training epochs.
FCNK('5mer', 'order', 25)

In [None]:
# 5-mer features, family-level labels, 25 training epochs.
FCNK('5mer', 'family', 25)

In [None]:
# 5-mer features, genus-level labels, 25 training epochs.
FCNK('5mer', 'genus', 25)