In [1]:
import pandas as pd
import numpy as np

import imblearn.over_sampling as osa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import (NearestNeighbors, NeighborhoodComponentsAnalysis, KNeighborsClassifier)
from sklearn.pipeline import Pipeline
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Disease cohorts; each one has its own sub-directory of training CSVs.
diseases = ['CAD', 'CKD', 'IBD', 'T2D']
# Taxonomic levels used as file-name prefixes.
# (A missing comma previously fused 'Family' and 'Genus' into 'FamilyGenus'.)
levels = ['Class', 'Family', 'Genus', 'Order', 'Species']
# Integer class code per condition; 0 is the shared "healthy" class.
nominals = {'healthy': 0, 'CAD': 3, 'CKD': 4, 'IBD': 2, 'T2D': 1}

def isct(A, B):
    """Return the elements of ``B`` that also occur in ``A``.

    The result is a list that follows the order (and repetitions) of ``B``;
    membership is decided with ``in A``.
    """
    return [item for item in B if item in A]

def setup_df(level):
    """Load the per-disease training tables for one level and stack them.

    For every entry of ``diseases`` the file
    ``<disease>/<level><disease>_train.csv`` is read (path relative to the
    current working directory). Rows whose ``label`` equals 1 are relabelled
    with that disease's code from ``nominals``; all other labels are left
    untouched. Only the columns shared by all four tables are kept.

    Parameters
    ----------
    level : str
        File-name prefix, e.g. ``'Family'``.

    Returns
    -------
    pandas.DataFrame
        The rows of all disease tables stacked in the order of ``diseases``
        and restricted to the shared columns. Each file's own row index is
        kept, so index values repeat across diseases.
    """
    # Local import: join the path with the platform's separator instead of a
    # hard-coded backslash, which only resolves on Windows.
    import os

    D = {
        disease: pd.read_csv(os.path.join(disease, level + disease + '_train.csv'))
        for disease in diseases
    }
    for disease in diseases:
        X = D[disease]
        # Turn the binary "diseased" flag into the disease-specific class code.
        X.loc[X['label'] == 1, 'label'] = nominals[disease]
    # Columns present in every table, in the order they appear in the T2D table.
    columns = isct(isct(isct(D['CAD'].columns, D['CKD'].columns), D['IBD'].columns), D['T2D'].columns)
    # Stack whole frames at once rather than rebuilding the table column by column.
    return pd.concat([D[disease][columns] for disease in diseases])

# The 'Genus' and 'Family' levels work best.
df = setup_df(level='Family').drop(columns=['sample_ID'])
df.head()

Unnamed: 0,Bacteria;Abditibacteriota;Abditibacteria;Abditibacteriales;Abditibacteriaceae,Bacteria;Acidobacteriota;Aminicenantia;Aminicenantales;uncultured bacterium,Bacteria;Acidobacteriota;Blastocatellia;Blastocatellales;Blastocatellaceae,Bacteria;Actinobacteriota;Acidimicrobiia;Microtrichales;Ilumatobacteraceae,Bacteria;Actinobacteriota;Actinobacteria;Actinomycetales;Actinomycetaceae,Bacteria;Actinobacteriota;Actinobacteria;Bifidobacteriales;Bifidobacteriaceae,Bacteria;Actinobacteriota;Actinobacteria;Corynebacteriales;Corynebacteriaceae,Bacteria;Actinobacteriota;Actinobacteria;Corynebacteriales;Dietziaceae,Bacteria;Actinobacteriota;Actinobacteria;Corynebacteriales;Nocardiaceae,Bacteria;Actinobacteriota;Actinobacteria;Frankiales;Frankiaceae,...,Bacteria;Proteobacteria;Gammaproteobacteria;Xanthomonadales;Rhodanobacteraceae,Bacteria;Proteobacteria;Gammaproteobacteria;Xanthomonadales;Xanthomonadaceae,Bacteria;Spirochaetota;Brachyspirae;Brachyspirales;Brachyspiraceae,Bacteria;Spirochaetota;Spirochaetia;Spirochaetales;Spirochaetaceae,Bacteria;Synergistota;Synergistia;Synergistales;Synergistaceae,Bacteria;Verrucomicrobiota;Lentisphaeria;Victivallales;vadinBE97,Bacteria;Verrucomicrobiota;Verrucomicrobiae;Opitutales;Puniceicoccaceae,Bacteria;Verrucomicrobiota;Verrucomicrobiae;Verrucomicrobiales;Akkermansiaceae,Bacteria;Verrucomicrobiota;Verrucomicrobiae;Verrucomicrobiales;Rubritaleaceae,label
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [3]:
def split(df):
    """Split ``df`` into train and test features and labels.

    The ``'label'`` column is the target; every other column is a feature.
    The features are selected by dropping ``'label'`` by name, so the result
    does not depend on where that column sits in the frame. (Selecting
    "all but the last column" would leak the label into the features
    whenever ``'label'`` is not last.)

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``'label'`` column.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` from ``train_test_split`` with
        a 20 % test share and ``random_state=42``.
    """
    Y = df.loc[:, 'label']
    X = df.drop(columns='label')
    return train_test_split(X, Y, test_size=0.2, random_state=42)

def oversample(X_train, Y_train, random_state=42):
    """Oversample the training data with ADASYN.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels, passed straight to ADASYN.
    random_state : int or None, default 42
        Seed for ADASYN, so repeated runs give the same synthetic samples.
        The default matches the seed used for the train/test split. Pass
        ``None`` for an unseeded run.

    Returns
    -------
    tuple
        ``(X_resampled, Y_resampled)``.
    """
    ada = osa.ADASYN(random_state=random_state)
    # ``fit_resample`` is the supported name; ``fit_sample`` was a deprecated
    # alias that newer imbalanced-learn releases no longer provide.
    return ada.fit_resample(X_train, Y_train)

def scale(X, scaler):
    """Apply an already fitted ``scaler`` to ``X`` and return a DataFrame.

    The output reuses the column labels of ``X``; no index is passed on, so
    the returned frame gets a fresh default index.
    """
    transformed = scaler.transform(X)
    return pd.DataFrame(data=transformed, columns=X.columns)

def normalise(X_resampled):
    """Fit a ``MinMaxScaler`` on ``X_resampled`` and scale that same data.

    Returns
    -------
    tuple
        ``(scaled_frame, fitted_scaler)``. The fitted scaler is returned so
        that it can be applied to other data via ``scale``.
    """
    fitted_scaler = MinMaxScaler().fit(X_resampled)
    return scale(X_resampled, fitted_scaler), fitted_scaler

In [4]:
# K-Nearest-Neighbors
# osa=True => oversampling
def knn(df, k, osa=False, metric='manhattan'):
    """Train a k-nearest-neighbours classifier on ``df`` and print its test results.

    Parameters
    ----------
    df : pandas.DataFrame
        Table handed to ``split``.
    k : int
        Number of neighbours.
    osa : bool, default False
        When true, the training part is oversampled with ``oversample``
        before fitting. (Inside this function the name shadows the
        ``imblearn.over_sampling`` alias.)
    metric : str, default 'manhattan'
        Distance metric passed to ``KNeighborsClassifier``.

    Prints the test accuracy and the confusion matrix; returns nothing.
    """
    features_fit, features_eval, labels_fit, labels_eval = split(df)
    if osa:
        features_fit, labels_fit = oversample(features_fit, labels_fit)
    classifier = KNeighborsClassifier(n_neighbors=k, metric=metric)
    classifier.fit(features_fit, labels_fit)
    accuracy = classifier.score(features_eval, labels_eval)
    print(f'Accuracy: {accuracy}')
    predicted = classifier.predict(features_eval)
    print('Confusion Matrix:')
    print(confusion_matrix(labels_eval, predicted))

def knn_nca(df, k, osa=False, metric='manhattan'):
    """Run Neighborhood Components Analysis followed by k-nearest-neighbours.

    The data is split with ``split`` and, if ``osa`` is true, the training
    part is oversampled with ``oversample``. A two-step pipeline
    (NCA with ``random_state=42``, then ``KNeighborsClassifier`` with ``k``
    neighbours and the given ``metric``) is fitted on the training part.

    Prints the test accuracy and the confusion matrix; returns nothing.
    """
    features_fit, features_eval, labels_fit, labels_eval = split(df)
    if osa:
        features_fit, labels_fit = oversample(features_fit, labels_fit)
    model = Pipeline(steps=[
        ('nca', NeighborhoodComponentsAnalysis(random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=k, metric=metric)),
    ])
    model.fit(features_fit, labels_fit)
    accuracy = model.score(features_eval, labels_eval)
    print(f'Accuracy: {accuracy}')
    predicted = model.predict(features_eval)
    print(confusion_matrix(labels_eval, predicted))

def knn_skb(df, k, osa=False, metric='manhattan', n_features='all'):
    """Run ``SelectKBest`` feature selection followed by k-nearest-neighbours.

    Parameters
    ----------
    df : pandas.DataFrame
        Table handed to ``split``.
    k : int
        Number of neighbours for the classifier.
    osa : bool, default False
        When true, the training part is oversampled with ``oversample``.
    metric : str, default 'manhattan'
        Distance metric passed to ``KNeighborsClassifier``.
    n_features : int or 'all', default 'all'
        Number of features ``SelectKBest`` keeps. The default keeps every
        feature, which is what the earlier hard-coded
        ``k=len(X_train.columns)`` did; pass a smaller integer to actually
        select features.

    Prints the test accuracy and the confusion matrix; returns nothing.
    """
    X_train, X_test, Y_train, Y_test = split(df)
    if osa:
        X_train, Y_train = oversample(X_train, Y_train)
    knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
    # 'all' needs no ``.columns`` attribute, so this also works when the
    # oversampler hands back a plain array instead of a DataFrame.
    skb = SelectKBest(k=n_features)
    pipe = Pipeline(steps=[('skb', skb), ('knn', knn)])
    pipe.fit(X_train, Y_train)
    print(f'Accuracy: {pipe.score(X_test, Y_test)}')
    Y_pred = pipe.predict(X_test)
    print(confusion_matrix(Y_test, Y_pred))

In [5]:
# k=1 or k=2 works well, with metric='manhattan' and osa=True.
for k in (1, 2):
    # Each entry: heading to print, then the experiment to run without and
    # with oversampling.
    sections = (
        (f'k={k}, K-Nearest-Neighbor', knn),
        (f'\nk={k}, K-Nearest-Neighbor + Neighborhood-Component-Analysis', knn_nca),
        (f'\nk={k}, K-Nearest-Neighbor + Select-K-Best', knn_skb),
    )
    for heading, run in sections:
        print(heading)
        run(df=df, k=k)
        run(df=df, k=k, osa=True)
    print()

k=1, K-Nearest-Neighbor
Accuracy: 0.8264758497316637
Confusion Matrix:
[[420   4   0   0   0]
 [ 52  39   4   1   1]
 [ 10   4   1   0   3]
 [  8   4   0   1   0]
 [  4   2   0   0   1]]
Accuracy: 0.8318425760286225
Confusion Matrix:
[[418   0   4   2   0]
 [ 30  42  13   6   6]
 [  6   4   3   1   4]
 [  2   6   1   1   3]
 [  3   3   0   0   1]]

k=1, K-Nearest-Neighbor + Neighborhood-Component-Analysis
Accuracy: 0.8318425760286225
[[422   2   0   0   0]
 [ 46  41   7   1   2]
 [  9   7   1   0   1]
 [  7   5   0   0   1]
 [  4   2   0   0   1]]
Accuracy: 0.8282647584973166
[[418   0   4   2   0]
 [ 30  42  12   7   6]
 [  6   5   2   1   4]
 [  2   6   3   0   2]
 [  3   2   1   0   1]]

k=1, K-Nearest-Neighbor + Select-K-Best
Accuracy: 0.8264758497316637
[[420   4   0   0   0]
 [ 52  39   4   1   1]
 [ 10   4   1   0   3]
 [  8   4   0   1   0]
 [  4   2   0   0   1]]
Accuracy: 0.8264758497316637
[[418   2   2   2   0]
 [ 28  41  16   5   7]
 [  5   5   2   1   5]
 [  3   5   3   0

In [6]:
# Support vector machines and random forests do not work as well.


# k-nearest-neighbours after removing correlated features gives no better results than without removing them.
def df_decor(df, threshold=0.5, keep=('label',)):
    """Drop columns that are strongly correlated with an earlier column.

    The absolute pairwise correlation matrix is computed and only its strict
    upper triangle is inspected, so of two correlated columns the later one
    is dropped and the earlier one is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    threshold : float, default 0.5
        A column is dropped when its absolute correlation with any earlier
        column is greater than this value.
    keep : iterable of str, default ('label',)
        Columns left out of the correlation check and never dropped. This
        stops the target column from being removed just because a feature
        correlates with it. Names not present in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the dropped columns.
    """
    protected = [col for col in keep if col in df.columns]
    cor = df.drop(columns=protected).corr().abs()
    # Built-in ``bool`` replaces the ``np.bool`` alias, which was removed
    # from NumPy and raises AttributeError there.
    upper_mask = np.triu(np.ones(cor.shape, dtype=bool), k=1)
    upper_tri = cor.where(upper_mask)
    to_drop = [column for column in upper_tri.columns if (upper_tri[column] > threshold).any()]
    return df.drop(columns=to_drop)

# df_new = df_decor(df=df)
# knn(df=df_new, k=2)