In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
import umap
import hdbscan

  from .autonotebook import tqdm as notebook_tqdm


### Wczytanie danych

In [2]:
def _first_model_dir(base_path):
    """Return the name of the alphabetically first sub-directory of ``base_path``.

    ``os.listdir`` returns entries in arbitrary order and also lists plain
    files, so the entries are filtered to directories and sorted to make the
    choice reproducible between runs and machines.

    Raises:
        FileNotFoundError: if ``base_path`` contains no sub-directory.
    """
    model_dirs = sorted(
        entry for entry in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, entry))
    )
    if not model_dirs:
        raise FileNotFoundError(f'No model directory found in {base_path!r}')
    return model_dirs[0]


# Real data: every table is transposed right after reading
# (presumably so that rows are samples and columns are genes - verify against the files)
hc_training = pd.read_csv('../../../classifier_data/HC_training.csv', sep='\t').T
hc_test = pd.read_csv('../../../classifier_data/HC_test.csv', sep='\t').T
oc_training = pd.read_csv('../../../classifier_data/OC_training.csv', sep='\t').T
oc_test = pd.read_csv('../../../classifier_data/OC_test.csv', sep='\t').T

# Synthetic HC data: read (without transposing) from the selected model directory
synthetic_hc_data_path = '../../../best_models/HC/WGANWC/50/synthetic_data/'
model_id = _first_model_dir(synthetic_hc_data_path)
hc_synthetic = pd.read_csv(
    os.path.join(synthetic_hc_data_path, model_id, 'generated_data.tsv'), sep='\t'
)

# Synthetic OC data: same layout as the HC directory
synthetic_oc_data_path = '../../../best_models/OC/WGANWC/50/synthetic_data/'
model_id = _first_model_dir(synthetic_oc_data_path)
oc_synthetic = pd.read_csv(
    os.path.join(synthetic_oc_data_path, model_id, 'generated_data.tsv'), sep='\t'
)

# Attach the class label to every real and synthetic frame
hc_training['label'] = 'HC'
hc_test['label'] = 'HC'
oc_training['label'] = 'OC'
oc_test['label'] = 'OC'

hc_synthetic['label'] = 'HC'
oc_synthetic['label'] = 'OC'

# Stack both classes into a single training frame and a single test frame
training_data = pd.concat([hc_training, oc_training])
test_data = pd.concat([hc_test, oc_test])

In [3]:
# Columns shared by all four frames: real training, real test and both synthetic sets
common_columns = (
    training_data.columns
    .intersection(test_data.columns)
    .intersection(hc_synthetic.columns)
    .intersection(oc_synthetic.columns)
)

# Restrict every frame to the shared columns so that all of them use the same column order
training_data, test_data, hc_synthetic, oc_synthetic = (
    frame[common_columns]
    for frame in (training_data, test_data, hc_synthetic, oc_synthetic)
)

common_columns

Index(['ENSG00000019582', 'ENSG00000054654', 'ENSG00000076928',
       'ENSG00000081237', 'ENSG00000085265', 'ENSG00000090382',
       'ENSG00000106066', 'ENSG00000110719', 'ENSG00000115523',
       'ENSG00000119535', 'ENSG00000120129', 'ENSG00000121316',
       'ENSG00000135046', 'ENSG00000136167', 'ENSG00000137642',
       'ENSG00000147168', 'ENSG00000160255', 'ENSG00000163131',
       'ENSG00000177359', 'ENSG00000184319', 'ENSG00000196126',
       'ENSG00000198336', 'ENSG00000211899', 'ENSG00000240356',
       'ENSG00000244734', 'ENSG00000257207', 'ENSG00000269028', 'label'],
      dtype='object')

In [4]:
# Extend the real training samples with the synthetic HC and OC samples (row-wise stack)
training_data_synthetic = pd.concat(
    objs=[training_data, hc_synthetic, oc_synthetic],
    axis=0,
)
training_data_synthetic

Unnamed: 0,ENSG00000019582,ENSG00000054654,ENSG00000076928,ENSG00000081237,ENSG00000085265,ENSG00000090382,ENSG00000106066,ENSG00000110719,ENSG00000115523,ENSG00000119535,...,ENSG00000177359,ENSG00000184319,ENSG00000196126,ENSG00000198336,ENSG00000211899,ENSG00000240356,ENSG00000244734,ENSG00000257207,ENSG00000269028,label
VUMC-HC-0033-TR2591,7.136091,5.318681,5.395822,6.121758,4.078496,4.643048,4.211676,4.169415,5.849773,4.399111,...,3.083092,3.535280,4.029213,7.670810,4.911210,3.535280,6.995484,9.847484,7.757440,HC
Vumc-HD-70-TR1062,9.178061,6.106566,6.461376,9.448584,4.926369,5.721557,4.653637,5.309622,7.717082,4.219669,...,5.079361,3.701701,5.786592,7.237932,6.633504,3.701701,10.021953,8.262227,6.645036,HC
VUMC-HC0053-DOT-HD-48h-TR3087,7.631874,5.383165,5.975680,7.317863,4.845811,4.997802,3.083092,5.151550,5.593316,3.083092,...,3.083092,6.754951,4.529534,3.720811,4.559495,7.783065,7.050183,8.831693,4.253528,HC
Vumc-HD-149-TR932,6.897237,3.647372,5.524362,5.552469,4.492627,4.275620,4.459443,3.544794,5.451417,3.083092,...,3.544794,8.271851,4.616154,3.083092,4.389800,9.503594,8.661739,3.647372,8.517492,HC
Vumc-HD-36-TR1165,8.796382,7.775699,7.918580,9.002749,6.525333,5.811822,4.590595,7.579274,7.306126,5.743631,...,4.172174,4.893418,6.813860,7.438673,6.525333,5.671874,9.877834,3.862107,7.025844,HC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,8.888708,8.090477,7.922525,9.895784,6.740036,5.119778,5.018567,8.053417,10.021500,8.747166,...,5.233642,5.039174,5.628789,3.545953,7.482005,5.901118,10.117565,6.899625,4.836991,OC
77,7.544620,5.185695,7.369085,6.359756,5.871343,5.101378,4.055549,6.362377,4.800881,5.936649,...,5.750458,4.229492,5.029769,6.885035,5.450448,5.432720,11.658381,5.551944,4.904979,OC
78,6.782947,4.485204,5.373574,5.299162,3.838662,5.202899,3.164187,4.585936,4.995075,3.850511,...,4.771267,4.585531,4.164749,3.177530,4.042806,5.599399,7.654357,5.596421,5.472971,OC
79,5.293296,3.325271,3.979018,4.388884,3.896513,4.685700,3.159561,3.450003,6.120191,3.418393,...,5.157081,5.526205,3.641196,3.101383,3.322173,6.602656,9.321873,7.935718,8.114061,OC


### Wykorzystanie klasyfikatorów

In [5]:
# classification using an SVM
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


def svm_classification(training_data, test_data):
    """Fit a linear-kernel SVM on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; all remaining columns are
    used as features. The features are standardised with a scaler fitted on
    the training frame only.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.
    """
    # Split each frame into the feature matrix and the target vector
    features_train = training_data.drop(columns='label')
    features_test = test_data.drop(columns='label')
    target_train, target_test = training_data['label'], test_data['label']

    # Standardise: fit on the training features, then apply to both sets
    standardiser = StandardScaler().fit(features_train)
    features_train = standardiser.transform(features_train)
    features_test = standardiser.transform(features_test)

    # Train the classifier and predict the labels of the test samples
    classifier = SVC(kernel='linear', random_state=42).fit(features_train, target_train)
    predicted = classifier.predict(features_test)

    # Print the accuracy and the remaining metrics
    print(f"Dokładność: {accuracy_score(target_test, predicted)}")
    print(classification_report(target_test, predicted))
    print(confusion_matrix(target_test, predicted))


# Baseline: real training data only
svm_classification(training_data=training_data, test_data=test_data)
# Real training data augmented with the synthetic samples
svm_classification(training_data=training_data_synthetic, test_data=test_data)

Dokładność: 0.8162162162162162
              precision    recall  f1-score   support

          HC       0.83      0.94      0.88       131
          OC       0.78      0.52      0.62        54

    accuracy                           0.82       185
   macro avg       0.80      0.73      0.75       185
weighted avg       0.81      0.82      0.80       185

[[123   8]
 [ 26  28]]
Dokładność: 0.8378378378378378
              precision    recall  f1-score   support

          HC       0.87      0.91      0.89       131
          OC       0.75      0.67      0.71        54

    accuracy                           0.84       185
   macro avg       0.81      0.79      0.80       185
weighted avg       0.83      0.84      0.83       185

[[119  12]
 [ 18  36]]


In [6]:
from sklearn.tree import DecisionTreeClassifier
# classification using a decision tree
def decision_tree_classification(training_data, test_data):
    """Fit a depth-limited decision tree on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; all remaining columns are
    used as features. The features are standardised with a scaler fitted on
    the training frame only.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.
    """
    # Targets first, then everything except the label as features
    labels_fit = training_data['label']
    labels_eval = test_data['label']
    inputs_fit = training_data.drop(columns=['label'])
    inputs_eval = test_data.drop(columns=['label'])

    # Standardise using statistics of the training features only
    scaler = StandardScaler()
    inputs_fit = scaler.fit_transform(inputs_fit)
    inputs_eval = scaler.transform(inputs_eval)

    # Train the tree (max depth 5) and predict the test labels in one chain
    tree = DecisionTreeClassifier(max_depth=5, random_state=42)
    predictions = tree.fit(inputs_fit, labels_fit).predict(inputs_eval)

    # Print the accuracy and the remaining metrics
    print(f"Dokładność: {accuracy_score(labels_eval, predictions)}")
    print(classification_report(labels_eval, predictions))
    print(confusion_matrix(labels_eval, predictions))

# Baseline: real training data only
decision_tree_classification(training_data=training_data, test_data=test_data)
# Real training data augmented with the synthetic samples
decision_tree_classification(training_data=training_data_synthetic, test_data=test_data)

Dokładność: 0.7567567567567568
              precision    recall  f1-score   support

          HC       0.79      0.90      0.84       131
          OC       0.63      0.41      0.49        54

    accuracy                           0.76       185
   macro avg       0.71      0.65      0.67       185
weighted avg       0.74      0.76      0.74       185

[[118  13]
 [ 32  22]]
Dokładność: 0.745945945945946
              precision    recall  f1-score   support

          HC       0.80      0.85      0.83       131
          OC       0.58      0.48      0.53        54

    accuracy                           0.75       185
   macro avg       0.69      0.67      0.68       185
weighted avg       0.74      0.75      0.74       185

[[112  19]
 [ 28  26]]


In [7]:
from sklearn.ensemble import RandomForestClassifier
# classification using a random forest
def random_forest_classification(training_data, test_data):
    """Fit a 100-tree random forest on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; all remaining columns are
    used as features. The features are standardised with a scaler fitted on
    the training frame only.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.
    """
    label_column = 'label'

    # Separate the features from the labels for both frames
    train_features = training_data.drop(columns=label_column)
    train_labels = training_data[label_column]
    test_features = test_data.drop(columns=label_column)
    test_labels = test_data[label_column]

    # Standardise: the scaler learns its statistics from the training features only
    feature_scaler = StandardScaler()
    train_features = feature_scaler.fit_transform(train_features)
    test_features = feature_scaler.transform(test_features)

    # Train the forest, then predict the labels of the test samples
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_features, train_labels)
    predicted_labels = forest.predict(test_features)

    # Print the accuracy first, then the per-class report and the confusion matrix
    print(f"Dokładność: {accuracy_score(test_labels, predicted_labels)}")
    for summarise in (classification_report, confusion_matrix):
        print(summarise(test_labels, predicted_labels))

# Baseline: real training data only
random_forest_classification(training_data=training_data, test_data=test_data)
# Real training data augmented with the synthetic samples
random_forest_classification(training_data=training_data_synthetic, test_data=test_data)

Dokładność: 0.7675675675675676
              precision    recall  f1-score   support

          HC       0.77      0.95      0.85       131
          OC       0.74      0.31      0.44        54

    accuracy                           0.77       185
   macro avg       0.76      0.63      0.65       185
weighted avg       0.76      0.77      0.73       185

[[125   6]
 [ 37  17]]
Dokładność: 0.7837837837837838
              precision    recall  f1-score   support

          HC       0.79      0.95      0.86       131
          OC       0.75      0.39      0.51        54

    accuracy                           0.78       185
   macro avg       0.77      0.67      0.69       185
weighted avg       0.78      0.78      0.76       185

[[124   7]
 [ 33  21]]


In [8]:
from xgboost import XGBClassifier

# classification using XGBoost
def xgboost_classification(training_data, test_data):
    """Fit an XGBoost classifier on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; all remaining columns are
    used as features. The features are standardised with a scaler fitted on
    the training frame only. The labels ``'HC'`` and ``'OC'`` are encoded as
    ``0`` and ``1`` before training; labels that are already numeric are kept
    unchanged.

    Prints the accuracy, the classification report and the confusion matrix
    (both expressed in the encoded 0/1 labels). Returns nothing.

    Raises:
        ValueError: if a label is neither ``'HC'``/``'OC'`` nor convertible
            to an integer.
    """
    label_codes = {'HC': 0, 'OC': 1}

    def encode(labels):
        # Explicit mapping + cast instead of chained ``replace`` calls: ``replace``
        # relied on silent object -> int downcasting, which pandas 2.2 deprecates
        # (FutureWarning), so the targets could end up with object dtype.
        return labels.map(lambda value: label_codes.get(value, value)).astype(int)

    # Separate the features from the labels
    X_train = training_data.drop('label', axis=1)
    y_train = encode(training_data['label'])
    X_test = test_data.drop('label', axis=1)
    y_test = encode(test_data['label'])

    # Standardise using statistics of the training features only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the classifier and predict the test labels
    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # Print the accuracy and the remaining metrics
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

# Baseline: real training data only
xgboost_classification(training_data=training_data, test_data=test_data)
# Real training data augmented with the synthetic samples
xgboost_classification(training_data=training_data_synthetic, test_data=test_data)

Dokładność: 0.8
              precision    recall  f1-score   support

           0       0.81      0.93      0.87       131
           1       0.74      0.48      0.58        54

    accuracy                           0.80       185
   macro avg       0.78      0.71      0.73       185
weighted avg       0.79      0.80      0.79       185

[[122   9]
 [ 28  26]]
Dokładność: 0.8162162162162162
              precision    recall  f1-score   support

           0       0.84      0.91      0.88       131
           1       0.73      0.59      0.65        54

    accuracy                           0.82       185
   macro avg       0.79      0.75      0.76       185
weighted avg       0.81      0.82      0.81       185

[[119  12]
 [ 22  32]]


In [9]:
from sklearn.neural_network import MLPClassifier

# classification using a neural network

def nn_classification(training_data, test_data):
    """Fit a multi-layer perceptron on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; all remaining columns are
    used as features. The features are standardised with a scaler fitted on
    the training frame only. The network has two hidden layers of 100 units.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.
    """
    # Split both frames into (features, labels) pairs
    train_x, train_y = training_data.drop(columns='label'), training_data['label']
    test_x, test_y = test_data.drop(columns='label'), test_data['label']

    # Standardise: fit on the training features, reuse the same scaling for the test features
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)

    # Train the network and predict the labels of the test samples
    network = MLPClassifier(hidden_layer_sizes=(100, 100), random_state=42)
    network.fit(train_x, train_y)
    predicted = network.predict(test_x)

    # Print the accuracy and the remaining metrics
    accuracy = accuracy_score(test_y, predicted)
    print(f"Dokładność: {accuracy}")
    print(classification_report(test_y, predicted))
    print(confusion_matrix(test_y, predicted))

# Baseline: real training data only
nn_classification(training_data=training_data, test_data=test_data)
# Real training data augmented with the synthetic samples
nn_classification(training_data=training_data_synthetic, test_data=test_data)



Dokładność: 0.8108108108108109
              precision    recall  f1-score   support

          HC       0.85      0.89      0.87       131
          OC       0.70      0.61      0.65        54

    accuracy                           0.81       185
   macro avg       0.77      0.75      0.76       185
weighted avg       0.81      0.81      0.81       185

[[117  14]
 [ 21  33]]
Dokładność: 0.8054054054054054
              precision    recall  f1-score   support

          HC       0.85      0.89      0.87       131
          OC       0.69      0.61      0.65        54

    accuracy                           0.81       185
   macro avg       0.77      0.75      0.76       185
weighted avg       0.80      0.81      0.80       185

[[116  15]
 [ 21  33]]
