In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
import umap
import hdbscan

  from .autonotebook import tqdm as notebook_tqdm


### Wczytanie danych

In [2]:
def _first_model_dir(base_path):
    """Return the name of the alphabetically first sub-directory of ``base_path``.

    ``os.listdir`` returns entries in arbitrary order and also lists plain
    files (e.g. ``.DS_Store``), so indexing its raw result with ``[0]`` is not
    reproducible. Only directories are considered and they are sorted first.

    Raises:
        FileNotFoundError: if ``base_path`` contains no sub-directory.
    """
    candidates = sorted(
        entry for entry in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, entry))
    )
    if not candidates:
        raise FileNotFoundError(f'No model directory found in {base_path}')
    return candidates[0]


# Load the HC / OC training and test sets. Each table is transposed after
# loading; in the frames displayed further down the rows are samples and the
# columns are gene identifiers.
hc_training = pd.read_csv('../../../classifier_data/HC_training.csv', sep='\t').T
hc_test = pd.read_csv('../../../classifier_data/HC_test.csv', sep='\t').T
oc_training = pd.read_csv('../../../classifier_data/OC_training.csv', sep='\t').T
oc_test = pd.read_csv('../../../classifier_data/OC_test.csv', sep='\t').T

# Load the synthetic HC samples generated by the selected model.
synthetic_hc_data_path = '../../../best_models/HC/WGANGP/2000/synthetic_data/'
model_id = _first_model_dir(synthetic_hc_data_path)
hc_synthetic = pd.read_csv(
    os.path.join(synthetic_hc_data_path, model_id, 'generated_data.tsv'), sep='\t'
)

# Load the synthetic OC samples generated by the selected model.
synthetic_oc_data_path = '../../../best_models/OC/WGANGP/2000/synthetic_data/'
model_id = _first_model_dir(synthetic_oc_data_path)
oc_synthetic = pd.read_csv(
    os.path.join(synthetic_oc_data_path, model_id, 'generated_data.tsv'), sep='\t'
)

# Attach the class label to every frame before the classes are combined.
hc_training['label'] = 'HC'
hc_test['label'] = 'HC'
oc_training['label'] = 'OC'
oc_test['label'] = 'OC'

hc_synthetic['label'] = 'HC'
oc_synthetic['label'] = 'OC'

# Stack the two classes into one training frame and one test frame.
training_data = pd.concat([hc_training, oc_training])
test_data = pd.concat([hc_test, oc_test])

In [3]:
# Keep only the columns present in every dataset (training, test and both
# synthetic sets) so that all frames expose the same columns in the same order.
common_columns = (
    training_data.columns
    .intersection(test_data.columns)
    .intersection(hc_synthetic.columns)
    .intersection(oc_synthetic.columns)
)

# Restrict the training and test frames to the shared columns.
training_data = training_data.loc[:, common_columns]
test_data = test_data.loc[:, common_columns]

# Restrict the synthetic frames to the shared columns.
hc_synthetic = hc_synthetic.loc[:, common_columns]
oc_synthetic = oc_synthetic.loc[:, common_columns]

common_columns

Index(['ENSG00000000419', 'ENSG00000000938', 'ENSG00000002549',
       'ENSG00000002822', 'ENSG00000003756', 'ENSG00000004487',
       'ENSG00000004534', 'ENSG00000005007', 'ENSG00000005059',
       'ENSG00000005302',
       ...
       'ENSG00000244734', 'ENSG00000249072', 'ENSG00000253729',
       'ENSG00000255823', 'ENSG00000257207', 'ENSG00000258890',
       'ENSG00000266714', 'ENSG00000269028', 'ENSG00000271043', 'label'],
      dtype='object', length=1548)

In [4]:
# Build the augmented training set: the training samples followed by the
# synthetic HC and synthetic OC samples, stacked row-wise.
training_data_synthetic = pd.concat(
    objs=[training_data, hc_synthetic, oc_synthetic],
    axis=0,
)
training_data_synthetic

Unnamed: 0,ENSG00000000419,ENSG00000000938,ENSG00000002549,ENSG00000002822,ENSG00000003756,ENSG00000004487,ENSG00000004534,ENSG00000005007,ENSG00000005059,ENSG00000005302,...,ENSG00000244734,ENSG00000249072,ENSG00000253729,ENSG00000255823,ENSG00000257207,ENSG00000258890,ENSG00000266714,ENSG00000269028,ENSG00000271043,label
VUMC-HC-0033-TR2591,3.635790,4.432779,4.954165,3.720018,4.670109,4.211676,4.722403,6.139299,4.399111,4.169415,...,6.995484,6.002046,4.866702,6.095028,9.847484,4.211676,4.465409,7.757440,6.525607,HC
Vumc-HD-70-TR1062,4.546459,5.786592,6.072522,4.884990,5.279119,5.309622,5.479209,5.309622,5.828302,5.215895,...,10.021953,5.215895,6.019849,5.215895,8.262227,4.601320,4.139052,6.645036,5.397116,HC
VUMC-HC0053-DOT-HD-48h-TR3087,3.861049,5.526849,4.724280,4.253528,4.467036,4.292393,4.253528,5.631686,4.913257,4.498734,...,7.050183,3.861049,4.724280,3.861049,8.831693,3.861049,4.030369,4.253528,3.720811,HC
Vumc-HD-149-TR932,4.727528,4.923042,4.945413,3.937986,3.647372,4.727528,3.808559,5.031082,4.673182,4.829242,...,8.661739,6.247257,5.275333,6.360425,3.647372,3.083092,3.808559,8.517492,6.826289,HC
Vumc-HD-36-TR1165,4.402386,6.484772,4.751692,4.172174,4.402386,5.811822,7.053716,4.402386,5.998264,5.241598,...,9.877834,5.515912,6.055240,5.241598,3.862107,5.339481,6.355612,7.025844,5.339481,HC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,4.511887,5.555270,5.968326,4.345580,4.892912,5.096446,4.776956,4.801851,4.822116,4.658587,...,9.347366,4.970870,5.269019,5.334289,7.760506,4.508757,4.817557,5.900880,5.679082,OC
77,4.144344,4.514948,4.412793,4.302032,4.215065,4.158838,4.369925,4.326197,4.145620,4.063216,...,8.873402,5.156154,5.347466,4.259873,8.307840,4.241163,4.615357,5.207749,5.398709,OC
78,4.303287,5.306851,4.031435,4.163367,4.179652,3.772458,4.244065,4.375218,4.202162,4.146299,...,10.415541,3.710139,5.002297,3.890450,7.758487,4.168475,5.346251,5.150106,4.042413,OC
79,5.599717,5.632432,6.248742,4.544093,5.506081,5.105107,5.358509,5.572069,5.060560,4.865747,...,10.028393,4.719908,5.171122,5.306066,6.963918,4.449301,5.503396,5.920586,4.757867,OC


### Wykorzystanie klasyfikatorów

In [5]:
# klasyfikacja z użyciem SVM
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


def svm_classification(training_data, test_data):
    """Fit a linear-kernel SVM on ``training_data`` and report its scores on ``test_data``.

    Both arguments are DataFrames whose 'label' column is the target; every
    other column is used as a feature. Accuracy, the classification report and
    the confusion matrix are printed; nothing is returned.
    """
    # Split each frame into its feature matrix and its target column.
    features_train = training_data.drop(columns='label')
    labels_train = training_data['label']
    features_test = test_data.drop(columns='label')
    labels_test = test_data['label']

    # Standardise the features; the scaler is fitted on the training set only
    # and then applied unchanged to the test set.
    scaler = StandardScaler().fit(features_train)
    scaled_train = scaler.transform(features_train)
    scaled_test = scaler.transform(features_test)

    # Train the classifier and predict the test labels.
    classifier = SVC(kernel='linear', random_state=42)
    predictions = classifier.fit(scaled_train, labels_train).predict(scaled_test)

    # Report accuracy and the remaining metrics.
    print(f"Dokładność: {accuracy_score(labels_test, predictions)}")
    print(classification_report(labels_test, predictions))
    print(confusion_matrix(labels_test, predictions))


# Baseline run: trained on training_data (HC/OC training sets, no synthetic samples).
svm_classification(training_data, test_data)
# Augmented run: trained on training_data extended with the synthetic HC/OC samples.
svm_classification(training_data_synthetic, test_data)

Dokładność: 0.9405405405405406
              precision    recall  f1-score   support

          HC       0.93      0.99      0.96       131
          OC       0.98      0.81      0.89        54

    accuracy                           0.94       185
   macro avg       0.95      0.90      0.92       185
weighted avg       0.94      0.94      0.94       185

[[130   1]
 [ 10  44]]
Dokładność: 0.9405405405405406
              precision    recall  f1-score   support

          HC       0.93      0.99      0.96       131
          OC       0.98      0.81      0.89        54

    accuracy                           0.94       185
   macro avg       0.95      0.90      0.92       185
weighted avg       0.94      0.94      0.94       185

[[130   1]
 [ 10  44]]


In [6]:
from sklearn.tree import DecisionTreeClassifier
# klasyfikacja z użyciem drzewa decyzyjnego
def decision_tree_classification(training_data, test_data):
    """Fit a depth-limited decision tree on ``training_data`` and report its scores on ``test_data``.

    Both arguments are DataFrames whose 'label' column is the target; every
    other column is used as a feature. Accuracy, the classification report and
    the confusion matrix are printed; nothing is returned.
    """
    # Split each frame into its feature matrix and its target column.
    features_train = training_data.drop(columns='label')
    labels_train = training_data['label']
    features_test = test_data.drop(columns='label')
    labels_test = test_data['label']

    # Standardise the features; the scaler is fitted on the training set only
    # and then applied unchanged to the test set.
    scaler = StandardScaler().fit(features_train)
    scaled_train = scaler.transform(features_train)
    scaled_test = scaler.transform(features_test)

    # Train the tree (maximum depth 5) and predict the test labels.
    tree = DecisionTreeClassifier(max_depth=5, random_state=42)
    predictions = tree.fit(scaled_train, labels_train).predict(scaled_test)

    # Report accuracy and the remaining metrics.
    print(f"Dokładność: {accuracy_score(labels_test, predictions)}")
    print(classification_report(labels_test, predictions))
    print(confusion_matrix(labels_test, predictions))

# Baseline run: trained on training_data (HC/OC training sets, no synthetic samples).
decision_tree_classification(training_data, test_data)
# Augmented run: trained on training_data extended with the synthetic HC/OC samples.
decision_tree_classification(training_data_synthetic, test_data)

Dokładność: 0.7891891891891892
              precision    recall  f1-score   support

          HC       0.85      0.85      0.85       131
          OC       0.64      0.63      0.64        54

    accuracy                           0.79       185
   macro avg       0.74      0.74      0.74       185
weighted avg       0.79      0.79      0.79       185

[[112  19]
 [ 20  34]]
Dokładność: 0.8054054054054054
              precision    recall  f1-score   support

          HC       0.85      0.89      0.87       131
          OC       0.69      0.61      0.65        54

    accuracy                           0.81       185
   macro avg       0.77      0.75      0.76       185
weighted avg       0.80      0.81      0.80       185

[[116  15]
 [ 21  33]]


In [7]:
from sklearn.ensemble import RandomForestClassifier
# klasyfikacja z użyciem random forest
def random_forest_classification(training_data, test_data):
    """Fit a 100-tree random forest on ``training_data`` and report its scores on ``test_data``.

    Both arguments are DataFrames whose 'label' column is the target; every
    other column is used as a feature. Accuracy, the classification report and
    the confusion matrix are printed; nothing is returned.
    """
    # Split each frame into its feature matrix and its target column.
    features_train = training_data.drop(columns='label')
    labels_train = training_data['label']
    features_test = test_data.drop(columns='label')
    labels_test = test_data['label']

    # Standardise the features; the scaler is fitted on the training set only
    # and then applied unchanged to the test set.
    scaler = StandardScaler().fit(features_train)
    scaled_train = scaler.transform(features_train)
    scaled_test = scaler.transform(features_test)

    # Train the forest and predict the test labels.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    predictions = forest.fit(scaled_train, labels_train).predict(scaled_test)

    # Report accuracy and the remaining metrics.
    print(f"Dokładność: {accuracy_score(labels_test, predictions)}")
    print(classification_report(labels_test, predictions))
    print(confusion_matrix(labels_test, predictions))

# Baseline run: trained on training_data (HC/OC training sets, no synthetic samples).
random_forest_classification(training_data, test_data)
# Augmented run: trained on training_data extended with the synthetic HC/OC samples.
random_forest_classification(training_data_synthetic, test_data)

Dokładność: 0.8378378378378378
              precision    recall  f1-score   support

          HC       0.82      0.99      0.90       131
          OC       0.96      0.46      0.62        54

    accuracy                           0.84       185
   macro avg       0.89      0.73      0.76       185
weighted avg       0.86      0.84      0.82       185

[[130   1]
 [ 29  25]]
Dokładność: 0.8486486486486486
              precision    recall  f1-score   support

          HC       0.82      1.00      0.90       131
          OC       1.00      0.48      0.65        54

    accuracy                           0.85       185
   macro avg       0.91      0.74      0.78       185
weighted avg       0.88      0.85      0.83       185

[[131   0]
 [ 28  26]]


In [8]:
from xgboost import XGBClassifier

# klasyfikacja z użyciem xgboost
def xgboost_classification(training_data, test_data):
    """Fit an XGBoost classifier on ``training_data`` and report its scores on ``test_data``.

    Both arguments are DataFrames whose 'label' column holds the class name
    ('HC' or 'OC'); every other column is used as a feature. The labels are
    encoded as integers (HC -> 0, OC -> 1) before training, so the printed
    report and confusion matrix refer to classes 0 and 1.

    Accuracy, the classification report and the confusion matrix are printed;
    nothing is returned.

    Raises:
        ValueError: if a label other than 'HC' or 'OC' is encountered.
    """
    label_codes = {'HC': 0, 'OC': 1}

    def encode_labels(labels):
        # An explicit mapping plus astype(int) yields an integer target without
        # relying on Series.replace's implicit object->int downcasting, which
        # is deprecated in recent pandas versions.
        encoded = labels.map(label_codes)
        missing = encoded.isna()
        if missing.any():
            unknown = sorted(set(labels[missing].astype(str)))
            raise ValueError(f"Unknown labels (expected 'HC' or 'OC'): {unknown}")
        return encoded.astype(int)

    # Split each frame into its feature matrix and its encoded target column.
    X_train = training_data.drop('label', axis=1)
    y_train = encode_labels(training_data['label'])
    X_test = test_data.drop('label', axis=1)
    y_test = encode_labels(test_data['label'])

    # Standardise the features; the scaler is fitted on the training set only
    # and then applied unchanged to the test set.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the classifier and predict the test labels.
    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # Report accuracy and the remaining metrics.
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

# Baseline run: trained on training_data (HC/OC training sets, no synthetic samples).
xgboost_classification(training_data, test_data)
# Augmented run: trained on training_data extended with the synthetic HC/OC samples.
xgboost_classification(training_data_synthetic, test_data)

Dokładność: 0.9243243243243243
              precision    recall  f1-score   support

           0       0.91      0.98      0.95       131
           1       0.95      0.78      0.86        54

    accuracy                           0.92       185
   macro avg       0.93      0.88      0.90       185
weighted avg       0.93      0.92      0.92       185

[[129   2]
 [ 12  42]]
Dokładność: 0.8810810810810811
              precision    recall  f1-score   support

           0       0.89      0.95      0.92       131
           1       0.85      0.72      0.78        54

    accuracy                           0.88       185
   macro avg       0.87      0.83      0.85       185
weighted avg       0.88      0.88      0.88       185

[[124   7]
 [ 15  39]]


In [9]:
from sklearn.neural_network import MLPClassifier

# klasyfikacja z użyciem sieci neuronowych

def nn_classification(training_data, test_data):
    """Fit a two-hidden-layer MLP on ``training_data`` and report its scores on ``test_data``.

    Both arguments are DataFrames whose 'label' column is the target; every
    other column is used as a feature. Accuracy, the classification report and
    the confusion matrix are printed; nothing is returned.
    """
    # Split each frame into its feature matrix and its target column.
    features_train = training_data.drop(columns='label')
    labels_train = training_data['label']
    features_test = test_data.drop(columns='label')
    labels_test = test_data['label']

    # Standardise the features; the scaler is fitted on the training set only
    # and then applied unchanged to the test set.
    scaler = StandardScaler().fit(features_train)
    scaled_train = scaler.transform(features_train)
    scaled_test = scaler.transform(features_test)

    # Train the network (two hidden layers of 100 units) and predict the test labels.
    network = MLPClassifier(hidden_layer_sizes=(100,100), random_state=42)
    predictions = network.fit(scaled_train, labels_train).predict(scaled_test)

    # Report accuracy and the remaining metrics.
    print(f"Dokładność: {accuracy_score(labels_test, predictions)}")
    print(classification_report(labels_test, predictions))
    print(confusion_matrix(labels_test, predictions))

# Baseline run: trained on training_data (HC/OC training sets, no synthetic samples).
nn_classification(training_data, test_data)
# Augmented run: trained on training_data extended with the synthetic HC/OC samples.
nn_classification(training_data_synthetic, test_data)

Dokładność: 0.9297297297297298
              precision    recall  f1-score   support

          HC       0.93      0.98      0.95       131
          OC       0.94      0.81      0.87        54

    accuracy                           0.93       185
   macro avg       0.93      0.90      0.91       185
weighted avg       0.93      0.93      0.93       185

[[128   3]
 [ 10  44]]
Dokładność: 0.9351351351351351
              precision    recall  f1-score   support

          HC       0.93      0.98      0.96       131
          OC       0.96      0.81      0.88        54

    accuracy                           0.94       185
   macro avg       0.94      0.90      0.92       185
weighted avg       0.94      0.94      0.93       185

[[129   2]
 [ 10  44]]
