In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
import umap
import hdbscan

  from .autonotebook import tqdm as notebook_tqdm


### Wczytanie danych

In [2]:
def _pick_model_id(synthetic_data_path):
    """Return the alphabetically first run directory holding generated_data.tsv.

    os.listdir returns entries in arbitrary order and may include stray
    entries (hidden files, checkpoint folders), so taking element [0] of its
    raw result is not reproducible. Only sub-directories that actually
    contain 'generated_data.tsv' are considered here.

    Raises:
        FileNotFoundError: if no sub-directory contains 'generated_data.tsv'.
    """
    candidates = sorted(
        entry
        for entry in os.listdir(synthetic_data_path)
        if os.path.isfile(os.path.join(synthetic_data_path, entry, 'generated_data.tsv'))
    )
    if not candidates:
        raise FileNotFoundError(
            f"No generated_data.tsv found under any sub-directory of {synthetic_data_path!r}"
        )
    return candidates[0]


# Real cohorts: every TSV is transposed right after loading.
hc_training = pd.read_csv('../../../classifier_data/HC_training.csv', sep='\t').T
hc_test = pd.read_csv('../../../classifier_data/HC_test.csv', sep='\t').T
oc_training = pd.read_csv('../../../classifier_data/OC_training.csv', sep='\t').T
oc_test = pd.read_csv('../../../classifier_data/OC_test.csv', sep='\t').T

# Synthetic cohorts: read as stored (no transpose), one run directory per class.
synthetic_hc_data_path = '../../../best_models/HC/WGANGP/100/synthetic_data/'
model_id = _pick_model_id(synthetic_hc_data_path)
hc_synthetic = pd.read_csv(f'{synthetic_hc_data_path}/{model_id}/generated_data.tsv', sep='\t')

synthetic_oc_data_path = '../../../best_models/OC/WGANGP/100/synthetic_data/'
# NOTE: model_id is reused, so after this cell it refers to the OC run.
model_id = _pick_model_id(synthetic_oc_data_path)
oc_synthetic = pd.read_csv(f'{synthetic_oc_data_path}/{model_id}/generated_data.tsv', sep='\t')

# Tag every sample with its class label.
hc_training['label'] = 'HC'
hc_test['label'] = 'HC'
oc_training['label'] = 'OC'
oc_test['label'] = 'OC'

hc_synthetic['label'] = 'HC'
oc_synthetic['label'] = 'OC'

# Stack the HC and OC cohorts into one training frame and one test frame.
training_data = pd.concat([hc_training, oc_training])
test_data = pd.concat([hc_test, oc_test])

In [3]:
# Find the columns shared by all four frames: real training, real test,
# synthetic HC and synthetic OC.
common_columns = (
    training_data.columns
    .intersection(test_data.columns)
    .intersection(hc_synthetic.columns)
    .intersection(oc_synthetic.columns)
)

# Restrict every frame to the shared columns so they all use the same
# column set in the same order.
training_data, test_data, hc_synthetic, oc_synthetic = (
    frame[common_columns]
    for frame in (training_data, test_data, hc_synthetic, oc_synthetic)
)

# Show the shared columns as the cell output.
common_columns

Index(['ENSG00000019582', 'ENSG00000028137', 'ENSG00000038427',
       'ENSG00000044574', 'ENSG00000049860', 'ENSG00000051523',
       'ENSG00000054654', 'ENSG00000076928', 'ENSG00000081237',
       'ENSG00000085265', 'ENSG00000090382', 'ENSG00000100242',
       'ENSG00000100906', 'ENSG00000106066', 'ENSG00000108654',
       'ENSG00000110719', 'ENSG00000115232', 'ENSG00000115523',
       'ENSG00000117289', 'ENSG00000119535', 'ENSG00000120129',
       'ENSG00000121316', 'ENSG00000126759', 'ENSG00000134516',
       'ENSG00000135046', 'ENSG00000136167', 'ENSG00000136250',
       'ENSG00000137642', 'ENSG00000140575', 'ENSG00000141524',
       'ENSG00000142347', 'ENSG00000147168', 'ENSG00000155657',
       'ENSG00000160255', 'ENSG00000163131', 'ENSG00000163682',
       'ENSG00000164733', 'ENSG00000166598', 'ENSG00000175606',
       'ENSG00000177359', 'ENSG00000184319', 'ENSG00000188404',
       'ENSG00000196126', 'ENSG00000197249', 'ENSG00000198336',
       'ENSG00000204287', 'ENSG000002118

In [4]:
# Augment the real training set with the synthetic HC and OC samples;
# the original row indices of each frame are kept as they are.
frames_to_stack = [training_data, hc_synthetic, oc_synthetic]
training_data_synthetic = pd.concat(frames_to_stack)
training_data_synthetic

Unnamed: 0,ENSG00000019582,ENSG00000028137,ENSG00000038427,ENSG00000044574,ENSG00000049860,ENSG00000051523,ENSG00000054654,ENSG00000076928,ENSG00000081237,ENSG00000085265,...,ENSG00000197249,ENSG00000198336,ENSG00000204287,ENSG00000211899,ENSG00000223553,ENSG00000240356,ENSG00000244734,ENSG00000257207,ENSG00000269028,label
VUMC-HC-0033-TR2591,7.136091,3.920710,3.720018,4.252125,4.432779,4.643048,5.318681,5.395822,6.121758,4.078496,...,4.328301,7.670810,5.318681,4.911210,5.112603,3.535280,6.995484,9.847484,7.757440,HC
Vumc-HD-70-TR1062,9.178061,4.546459,3.951486,5.653279,5.505492,6.338547,6.106566,6.461376,9.448584,4.926369,...,5.339434,7.237932,7.435968,6.633504,3.522177,3.701701,10.021953,8.262227,6.645036,HC
VUMC-HC0053-DOT-HD-48h-TR3087,7.631874,4.292393,3.535847,5.114817,5.038021,5.593316,5.383165,5.975680,7.317863,4.845811,...,4.292393,3.720811,5.631686,4.559495,4.030369,7.783065,7.050183,8.831693,4.253528,HC
Vumc-HD-149-TR932,6.897237,3.808559,3.647372,4.492627,3.083092,4.853377,3.647372,5.524362,5.552469,4.492627,...,3.937986,3.083092,5.606988,4.389800,5.839804,9.503594,8.661739,3.647372,8.517492,HC
Vumc-HD-36-TR1165,8.796382,5.339481,5.020424,3.862107,4.172174,7.306126,7.775699,7.918580,9.002749,6.525333,...,5.339481,7.438673,6.780833,6.525333,3.862107,5.671874,9.877834,3.862107,7.025844,HC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,9.321984,5.842310,4.530571,7.167894,3.934385,5.858116,4.620558,7.138732,6.475227,5.854874,...,6.083918,4.657970,5.847219,6.173883,6.581078,8.587284,7.525491,6.575850,5.236631,OC
77,8.170986,4.893297,3.815314,6.195567,5.575869,6.101062,6.370518,5.245416,7.411808,4.479786,...,6.384458,5.836070,7.033962,4.637765,5.456245,9.182244,7.172677,8.613216,6.202492,OC
78,9.519388,4.545824,4.252022,6.747844,4.823868,7.191055,5.644127,5.500541,8.513735,6.520533,...,5.148388,4.679676,6.702365,5.766850,6.031638,7.800298,8.957249,7.964325,8.317481,OC
79,9.223013,4.267719,4.191545,5.295183,5.973308,7.125298,4.804429,6.633883,7.651279,6.758712,...,5.069712,4.518939,6.093646,5.615087,5.007359,6.873875,7.969035,8.908411,4.929279,OC


### Wykorzystanie klasyfikatorów

In [5]:
# Classification with a support vector machine (SVM)
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


def svm_classification(training_data, test_data):
    """Fit a linear-kernel SVM on ``training_data`` and print test-set metrics.

    Both frames must contain a 'label' column; every other column is used as
    a feature. Features are standardized with statistics fitted on the
    training frame only. Prints the accuracy, the classification report and
    the confusion matrix computed on ``test_data``; returns nothing.
    """
    label_column = 'label'

    # Separate the target from the feature columns.
    features_train = training_data.drop(columns=label_column)
    y_train = training_data[label_column]
    features_test = test_data.drop(columns=label_column)
    y_test = test_data[label_column]

    # Standardize: fit on the training features, reuse the fit for the test set.
    scaler = StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    # Train the classifier and predict the held-out labels.
    classifier = SVC(kernel='linear', random_state=42)
    y_pred = classifier.fit(features_train, y_train).predict(features_test)

    # Report accuracy first, then the per-class report and confusion matrix.
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    for metric in (classification_report, confusion_matrix):
        print(metric(y_test, y_pred))


svm_classification(training_data, test_data)
svm_classification(training_data_synthetic, test_data)

Dokładność: 0.8972972972972973
              precision    recall  f1-score   support

          HC       0.90      0.96      0.93       131
          OC       0.89      0.74      0.81        54

    accuracy                           0.90       185
   macro avg       0.89      0.85      0.87       185
weighted avg       0.90      0.90      0.89       185

[[126   5]
 [ 14  40]]
Dokładność: 0.8864864864864865
              precision    recall  f1-score   support

          HC       0.89      0.96      0.92       131
          OC       0.88      0.70      0.78        54

    accuracy                           0.89       185
   macro avg       0.89      0.83      0.85       185
weighted avg       0.89      0.89      0.88       185

[[126   5]
 [ 16  38]]


In [6]:
from sklearn.tree import DecisionTreeClassifier
# Classification with a decision tree
def decision_tree_classification(training_data, test_data):
    """Fit a depth-limited decision tree and print its test-set metrics.

    Both frames must contain a 'label' column; the remaining columns are the
    features. Features are standardized with a scaler fitted on the training
    frame. The tree uses ``max_depth=5`` and ``random_state=42``. Prints the
    accuracy, the classification report and the confusion matrix; returns
    nothing.
    """
    # Split each frame into its target series and its feature matrix.
    y_train, y_test = training_data['label'], test_data['label']
    train_features = training_data.drop(columns='label')
    test_features = test_data.drop(columns='label')

    # Scale with training statistics only, then apply the same scaling to test.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    # Train the tree and predict the test labels.
    tree = DecisionTreeClassifier(max_depth=5, random_state=42)
    tree.fit(train_scaled, y_train)
    predictions = tree.predict(test_scaled)

    # Print accuracy, per-class report and confusion matrix.
    accuracy = accuracy_score(y_test, predictions)
    print(f"Dokładność: {accuracy}")
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

decision_tree_classification(training_data, test_data)
decision_tree_classification(training_data_synthetic, test_data)

Dokładność: 0.772972972972973
              precision    recall  f1-score   support

          HC       0.81      0.89      0.85       131
          OC       0.64      0.50      0.56        54

    accuracy                           0.77       185
   macro avg       0.73      0.69      0.70       185
weighted avg       0.76      0.77      0.76       185

[[116  15]
 [ 27  27]]
Dokładność: 0.8162162162162162
              precision    recall  f1-score   support

          HC       0.85      0.90      0.87       131
          OC       0.72      0.61      0.66        54

    accuracy                           0.82       185
   macro avg       0.78      0.76      0.77       185
weighted avg       0.81      0.82      0.81       185

[[118  13]
 [ 21  33]]


In [7]:
from sklearn.ensemble import RandomForestClassifier
# Classification with a random forest
def random_forest_classification(training_data, test_data):
    """Fit a 100-tree random forest and print its test-set metrics.

    Both frames must contain a 'label' column; all other columns are treated
    as features. Features are standardized with a scaler fitted on the
    training frame. Prints the accuracy, the classification report and the
    confusion matrix for ``test_data``; returns nothing.
    """
    target = 'label'

    # Feature matrices and target vectors for both splits.
    train_X, train_y = training_data.drop(columns=target), training_data[target]
    test_X, test_y = test_data.drop(columns=target), test_data[target]

    # The scaler learns mean/variance from the training split only.
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    test_X = scaler.transform(test_X)

    # Train the forest with a fixed seed, then predict on the test split.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_X, train_y)
    predicted = forest.predict(test_X)

    # Emit the three metrics in the same order as the other classifiers.
    print(f"Dokładność: {accuracy_score(test_y, predicted)}")
    for summarize in (classification_report, confusion_matrix):
        print(summarize(test_y, predicted))

random_forest_classification(training_data, test_data)
random_forest_classification(training_data_synthetic, test_data)

Dokładność: 0.8432432432432433
              precision    recall  f1-score   support

          HC       0.83      0.98      0.90       131
          OC       0.93      0.50      0.65        54

    accuracy                           0.84       185
   macro avg       0.88      0.74      0.77       185
weighted avg       0.86      0.84      0.83       185

[[129   2]
 [ 27  27]]
Dokładność: 0.8162162162162162
              precision    recall  f1-score   support

          HC       0.81      0.96      0.88       131
          OC       0.83      0.46      0.60        54

    accuracy                           0.82       185
   macro avg       0.82      0.71      0.74       185
weighted avg       0.82      0.82      0.80       185

[[126   5]
 [ 29  25]]


In [8]:
from xgboost import XGBClassifier

# Classification with XGBoost
def xgboost_classification(training_data, test_data):
    """Fit an XGBoost classifier and print its test-set metrics.

    Both frames must contain a 'label' column holding 'HC' or 'OC'; all other
    columns are used as features. Features are standardized with a scaler
    fitted on the training frame. Labels are encoded as HC -> 0, OC -> 1
    before being passed to XGBoost. Prints the accuracy, the classification
    report (with HC/OC class names) and the confusion matrix; returns nothing.

    Raises:
        ValueError: if either frame contains a label other than 'HC' or 'OC'.
    """
    # Separate the labels from the features.
    X_train = training_data.drop('label', axis=1)
    X_test = test_data.drop('label', axis=1)

    # Encode labels with an explicit mapping. Chained Series.replace relies on
    # implicit object -> int downcasting (deprecated in recent pandas) and
    # would silently leave unexpected labels as strings.
    label_codes = {'HC': 0, 'OC': 1}
    y_train = training_data['label'].map(label_codes)
    y_test = test_data['label'].map(label_codes)
    unknown_labels = (
        set(training_data['label'][y_train.isna()])
        | set(test_data['label'][y_test.isna()])
    )
    if unknown_labels:
        raise ValueError(f"Unexpected labels (expected 'HC' or 'OC'): {sorted(map(str, unknown_labels))}")
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    # Standardize with statistics fitted on the training features only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and predict.
    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # Print accuracy and the other metrics; the report uses the HC/OC names
    # so it reads the same as the reports of the other classifiers.
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['HC', 'OC']))
    print(confusion_matrix(y_test, y_pred))

xgboost_classification(training_data, test_data)
xgboost_classification(training_data_synthetic, test_data)

Dokładność: 0.8540540540540541
              precision    recall  f1-score   support

           0       0.86      0.95      0.90       131
           1       0.83      0.63      0.72        54

    accuracy                           0.85       185
   macro avg       0.85      0.79      0.81       185
weighted avg       0.85      0.85      0.85       185

[[124   7]
 [ 20  34]]
Dokładność: 0.7891891891891892
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       131
           1       0.70      0.48      0.57        54

    accuracy                           0.79       185
   macro avg       0.76      0.70      0.72       185
weighted avg       0.78      0.79      0.78       185

[[120  11]
 [ 28  26]]


In [9]:
from sklearn.neural_network import MLPClassifier

# Classification with a neural network (multi-layer perceptron)

def nn_classification(training_data, test_data):
    """Fit a two-hidden-layer MLP and print its test-set metrics.

    Both frames must contain a 'label' column; every other column is used as
    a feature. Features are standardized with a scaler fitted on the training
    frame. The network has hidden layers of sizes (100, 100) and
    ``random_state=42``. Prints the accuracy, the classification report and
    the confusion matrix for ``test_data``; returns nothing.
    """
    # Pull out features and targets for each split.
    inputs_train = training_data.drop(columns='label')
    inputs_test = test_data.drop(columns='label')
    y_train = training_data['label']
    y_test = test_data['label']

    # Fit the scaler on the training inputs and apply it to both splits.
    scaler = StandardScaler()
    inputs_train = scaler.fit_transform(inputs_train)
    inputs_test = scaler.transform(inputs_test)

    # Train the network and predict the test labels.
    network = MLPClassifier(hidden_layer_sizes=(100,100), random_state=42)
    y_pred = network.fit(inputs_train, y_train).predict(inputs_test)

    # Print accuracy, then the per-class report, then the confusion matrix.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Dokładność: {accuracy}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

nn_classification(training_data, test_data)
nn_classification(training_data_synthetic, test_data)

Dokładność: 0.8756756756756757
              precision    recall  f1-score   support

          HC       0.90      0.93      0.91       131
          OC       0.82      0.74      0.78        54

    accuracy                           0.88       185
   macro avg       0.86      0.84      0.85       185
weighted avg       0.87      0.88      0.87       185

[[122   9]
 [ 14  40]]
Dokładność: 0.8540540540540541
              precision    recall  f1-score   support

          HC       0.90      0.89      0.90       131
          OC       0.75      0.76      0.75        54

    accuracy                           0.85       185
   macro avg       0.82      0.83      0.82       185
weighted avg       0.85      0.85      0.85       185

[[117  14]
 [ 13  41]]
