In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
import umap
import hdbscan

  from .autonotebook import tqdm as notebook_tqdm


### Wczytanie danych

In [2]:
def _first_model_id(synthetic_dir):
    """Return the name of the alphabetically first sub-directory of ``synthetic_dir``.

    ``os.listdir`` yields entries in arbitrary order and also lists plain
    files, so taking element ``[0]`` directly could select a different run
    (or a stray file) from one machine to the next. Keeping only directories
    and sorting them makes the choice reproducible.

    Raises:
        FileNotFoundError: if ``synthetic_dir`` contains no sub-directory.
    """
    model_ids = sorted(
        entry for entry in os.listdir(synthetic_dir)
        if os.path.isdir(os.path.join(synthetic_dir, entry))
    )
    if not model_ids:
        raise FileNotFoundError(f'No model directory found in {synthetic_dir!r}')
    return model_ids[0]


# Real HC / OC tables (tab-separated); each one is transposed right after loading.
hc_training = pd.read_csv('../../../classifier_data/HC_training.csv', sep='\t').T
hc_test = pd.read_csv('../../../classifier_data/HC_test.csv', sep='\t').T
oc_training = pd.read_csv('../../../classifier_data/OC_training.csv', sep='\t').T
oc_test = pd.read_csv('../../../classifier_data/OC_test.csv', sep='\t').T

# Synthetic HC samples: read generated_data.tsv from the selected model directory.
synthetic_hc_data_path = '../../../best_models/HC/WGANWC/25/synthetic_data/'
model_id = _first_model_id(synthetic_hc_data_path)
hc_synthetic = pd.read_csv(f'{synthetic_hc_data_path}/{model_id}/generated_data.tsv', sep='\t')

# Synthetic OC samples: same layout; `model_id` is reused and ends up holding the OC id.
synthetic_oc_data_path = '../../../best_models/OC/WGANWC/25/synthetic_data/'
model_id = _first_model_id(synthetic_oc_data_path)
oc_synthetic = pd.read_csv(f'{synthetic_oc_data_path}/{model_id}/generated_data.tsv', sep='\t')

# Tag every frame with its class so the classes can be stacked into one table.
hc_training['label'] = 'HC'
hc_test['label'] = 'HC'
oc_training['label'] = 'OC'
oc_test['label'] = 'OC'

hc_synthetic['label'] = 'HC'
oc_synthetic['label'] = 'OC'

# Stack both classes into a single training frame and a single test frame.
training_data = pd.concat([hc_training, oc_training])
test_data = pd.concat([hc_test, oc_test])

In [3]:
# Columns shared by every frame: real training, real test and both synthetic sets.
common_columns = (
    training_data.columns
    .intersection(test_data.columns)
    .intersection(hc_synthetic.columns)
    .intersection(oc_synthetic.columns)
)

# Restrict the real training and test frames to the shared columns.
training_data = training_data[common_columns]
test_data = test_data[common_columns]

# Do the same for both synthetic frames so all four share one column layout.
hc_synthetic, oc_synthetic = hc_synthetic[common_columns], oc_synthetic[common_columns]

# Show the shared columns as the cell output.
common_columns

Index(['ENSG00000081237', 'ENSG00000085265', 'ENSG00000090382',
       'ENSG00000110719', 'ENSG00000115523', 'ENSG00000119535',
       'ENSG00000137642', 'ENSG00000160255', 'ENSG00000177359',
       'ENSG00000198336', 'ENSG00000240356', 'ENSG00000244734',
       'ENSG00000257207', 'label'],
      dtype='object')

In [4]:
# Augmented training set: the real training rows followed by the synthetic
# HC rows and then the synthetic OC rows.
training_data_synthetic = pd.concat(
    objs=(training_data, hc_synthetic, oc_synthetic),
    axis=0,
)
training_data_synthetic

Unnamed: 0,ENSG00000081237,ENSG00000085265,ENSG00000090382,ENSG00000110719,ENSG00000115523,ENSG00000119535,ENSG00000137642,ENSG00000160255,ENSG00000177359,ENSG00000198336,ENSG00000240356,ENSG00000244734,ENSG00000257207,label
VUMC-HC-0033-TR2591,6.121758,4.078496,4.643048,4.169415,5.849773,4.399111,4.643048,4.796726,3.083092,7.670810,3.535280,6.995484,9.847484,HC
Vumc-HD-70-TR1062,9.448584,4.926369,5.721557,5.309622,7.717082,4.219669,6.106566,7.143903,5.079361,7.237932,3.701701,10.021953,8.262227,HC
VUMC-HC0053-DOT-HD-48h-TR3087,7.317863,4.845811,4.997802,5.151550,5.593316,3.083092,4.559495,5.935819,3.083092,3.720811,7.783065,7.050183,8.831693,HC
Vumc-HD-149-TR932,5.552469,4.492627,4.275620,3.544794,5.451417,3.083092,3.083092,4.753784,3.544794,3.083092,9.503594,8.661739,3.647372,HC
Vumc-HD-36-TR1165,9.002749,6.525333,5.811822,7.579274,7.306126,5.743631,5.998264,6.846139,4.172174,7.438673,5.671874,9.877834,3.862107,HC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,5.856607,4.410924,4.315591,5.810232,7.536771,6.670653,5.664708,5.206303,4.227684,4.449888,6.764823,11.605247,7.149803,OC
77,5.814561,4.558090,5.991567,4.578639,4.600465,3.926800,3.425197,6.537225,7.171918,5.027024,7.257698,9.665682,5.575982,OC
78,4.547694,5.901170,4.458024,4.300801,4.234932,6.650621,3.424514,3.814380,6.615067,8.153437,8.829521,10.122170,11.657073,OC
79,7.498326,5.560193,4.722758,7.025603,6.098729,7.446591,4.285898,6.165845,4.846074,3.510326,9.963324,11.372280,4.666277,OC


### Wykorzystanie klasyfikatorów

In [5]:
# Classification using an SVM
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


def svm_classification(training_data, test_data):
    """Fit a linear-kernel SVM on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; every other column is used
    as a feature. Features are standardised with statistics learned from the
    training frame only.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.
    """
    target_column = 'label'

    # Split each frame into its feature matrix and its label vector.
    train_features = training_data.drop(columns=target_column)
    train_labels = training_data[target_column]
    test_features = test_data.drop(columns=target_column)
    test_labels = test_data[target_column]

    # Learn the scaling on the training features, then reuse it for the test features.
    feature_scaler = StandardScaler()
    train_scaled = feature_scaler.fit_transform(train_features)
    test_scaled = feature_scaler.transform(test_features)

    # Train the classifier and predict the test labels.
    classifier = SVC(kernel='linear', random_state=42)
    classifier.fit(train_scaled, train_labels)
    predicted_labels = classifier.predict(test_scaled)

    # Report accuracy, per-class metrics and the confusion matrix.
    accuracy = accuracy_score(test_labels, predicted_labels)
    print(f"Dokładność: {accuracy}")
    print(classification_report(test_labels, predicted_labels))
    print(confusion_matrix(test_labels, predicted_labels))


# First run: real training data only; second run: real + synthetic training data.
# Both runs are scored on the same real test set.
for train_frame in (training_data, training_data_synthetic):
    svm_classification(train_frame, test_data)

Dokładność: 0.7513513513513513
              precision    recall  f1-score   support

          HC       0.79      0.89      0.84       131
          OC       0.61      0.41      0.49        54

    accuracy                           0.75       185
   macro avg       0.70      0.65      0.66       185
weighted avg       0.73      0.75      0.73       185

[[117  14]
 [ 32  22]]
Dokładność: 0.7513513513513513
              precision    recall  f1-score   support

          HC       0.79      0.88      0.83       131
          OC       0.60      0.44      0.51        54

    accuracy                           0.75       185
   macro avg       0.70      0.66      0.67       185
weighted avg       0.74      0.75      0.74       185

[[115  16]
 [ 30  24]]


In [6]:
from sklearn.tree import DecisionTreeClassifier
# Classification using a decision tree
def decision_tree_classification(training_data, test_data):
    """Fit a depth-5 decision tree on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; the remaining columns are
    the features. The scaler is fitted on the training features and then
    applied to the test features.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.
    """
    # Features and labels for each split.
    features_train, labels_train = training_data.drop('label', axis=1), training_data['label']
    features_test, labels_test = test_data.drop('label', axis=1), test_data['label']

    # Standardise using the training statistics only.
    scaler = StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)

    # Fit the tree and predict the test labels.
    tree = DecisionTreeClassifier(max_depth=5, random_state=42)
    tree.fit(features_train, labels_train)
    predictions = tree.predict(features_test)

    # Print the metrics.
    print(f"Dokładność: {accuracy_score(labels_test, predictions)}")
    print(classification_report(labels_test, predictions))
    print(confusion_matrix(labels_test, predictions))

# First run: real training data only; second run: real + synthetic training data.
for train_frame in (training_data, training_data_synthetic):
    decision_tree_classification(train_frame, test_data)

Dokładność: 0.7027027027027027
              precision    recall  f1-score   support

          HC       0.81      0.76      0.78       131
          OC       0.49      0.56      0.52        54

    accuracy                           0.70       185
   macro avg       0.65      0.66      0.65       185
weighted avg       0.71      0.70      0.71       185

[[100  31]
 [ 24  30]]
Dokładność: 0.6702702702702703
              precision    recall  f1-score   support

          HC       0.71      0.89      0.79       131
          OC       0.33      0.13      0.19        54

    accuracy                           0.67       185
   macro avg       0.52      0.51      0.49       185
weighted avg       0.60      0.67      0.62       185

[[117  14]
 [ 47   7]]


In [7]:
from sklearn.ensemble import RandomForestClassifier
# Classification using a random forest
def random_forest_classification(training_data, test_data):
    """Fit a 100-tree random forest on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; all other columns are
    treated as features and standardised with a scaler fitted on the
    training frame.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.
    """
    # Labels come straight from the 'label' column of each frame.
    y_train, y_test = training_data['label'], test_data['label']

    # Everything except 'label' is a feature; scale with training statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(training_data.drop(columns='label'))
    X_test = scaler.transform(test_data.drop(columns='label'))

    # Fit the forest and predict the test labels.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)

    # Print accuracy first, then the detailed report and the confusion matrix.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Dokładność: {accuracy}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

# First run: real training data only; second run: real + synthetic training data.
for train_frame in (training_data, training_data_synthetic):
    random_forest_classification(train_frame, test_data)

Dokładność: 0.7621621621621621
              precision    recall  f1-score   support

          HC       0.76      0.96      0.85       131
          OC       0.75      0.28      0.41        54

    accuracy                           0.76       185
   macro avg       0.76      0.62      0.63       185
weighted avg       0.76      0.76      0.72       185

[[126   5]
 [ 39  15]]


Dokładność: 0.745945945945946
              precision    recall  f1-score   support

          HC       0.75      0.96      0.84       131
          OC       0.71      0.22      0.34        54

    accuracy                           0.75       185
   macro avg       0.73      0.59      0.59       185
weighted avg       0.74      0.75      0.70       185

[[126   5]
 [ 42  12]]


In [8]:
from xgboost import XGBClassifier

# Classification using XGBoost
def xgboost_classification(training_data, test_data):
    """Fit an XGBoost classifier on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column holding ``'HC'`` or ``'OC'``;
    all other columns are used as features. Labels are encoded as
    ``HC -> 0`` and ``OC -> 1`` before training. Features are standardised
    with a scaler fitted on the training frame only.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.

    Raises:
        ValueError: if a label other than ``'HC'`` / ``'OC'`` is present.
    """
    label_codes = {'HC': 0, 'OC': 1}

    # Separate the features from the labels.
    X_train = training_data.drop('label', axis=1)
    X_test = test_data.drop('label', axis=1)

    # Encode the labels with a single explicit mapping. Chained
    # Series.replace calls only produce integers through pandas' implicit
    # downcasting of object columns, which is deprecated.
    y_train = training_data['label'].map(label_codes)
    y_test = test_data['label'].map(label_codes)
    if y_train.isna().any() or y_test.isna().any():
        raise ValueError(f"Unexpected label found; expected one of {sorted(label_codes)}")
    y_train = y_train.astype(int)
    y_test = y_test.astype(int)

    # Standardise using the training statistics only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit the model and predict the test labels.
    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # Print the metrics. The report shows HC / OC rather than 0 / 1 so it
    # reads like the reports of the other classifiers.
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(
        y_test,
        y_pred,
        labels=list(label_codes.values()),
        target_names=list(label_codes),
    ))
    print(confusion_matrix(y_test, y_pred))

# First run: real training data only; second run: real + synthetic training data.
for train_frame in (training_data, training_data_synthetic):
    xgboost_classification(train_frame, test_data)

Dokładność: 0.7675675675675676
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       131
           1       0.64      0.46      0.54        54

    accuracy                           0.77       185
   macro avg       0.72      0.68      0.69       185
weighted avg       0.75      0.77      0.76       185

[[117  14]
 [ 29  25]]
Dokładność: 0.7675675675675676
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       131
           1       0.67      0.41      0.51        54

    accuracy                           0.77       185
   macro avg       0.73      0.66      0.68       185
weighted avg       0.75      0.77      0.75       185

[[120  11]
 [ 32  22]]


In [9]:
from sklearn.neural_network import MLPClassifier

# Classification using a neural network

def nn_classification(training_data, test_data):
    """Fit a two-hidden-layer MLP on ``training_data`` and print its scores on ``test_data``.

    Both frames must contain a ``'label'`` column; the remaining columns are
    the features. The network has two hidden layers of 100 units each.
    Features are standardised with a scaler fitted on the training frame.

    Prints the accuracy, the classification report and the confusion matrix.
    Returns nothing.
    """
    def split_features_and_labels(frame):
        # Everything except 'label' is a feature.
        return frame.drop('label', axis=1), frame['label']

    train_inputs, train_targets = split_features_and_labels(training_data)
    test_inputs, test_targets = split_features_and_labels(test_data)

    # Standardise using the training statistics only.
    scaler = StandardScaler()
    train_inputs = scaler.fit_transform(train_inputs)
    test_inputs = scaler.transform(test_inputs)

    # Fit the network and predict the test labels.
    network = MLPClassifier(hidden_layer_sizes=(100,100), random_state=42)
    network.fit(train_inputs, train_targets)
    test_predictions = network.predict(test_inputs)

    # Print the metrics.
    accuracy = accuracy_score(test_targets, test_predictions)
    print(f"Dokładność: {accuracy}")
    print(classification_report(test_targets, test_predictions))
    print(confusion_matrix(test_targets, test_predictions))

# First run: real training data only; second run: real + synthetic training data.
for train_frame in (training_data, training_data_synthetic):
    nn_classification(train_frame, test_data)



Dokładność: 0.7621621621621621
              precision    recall  f1-score   support

          HC       0.84      0.82      0.83       131
          OC       0.59      0.63      0.61        54

    accuracy                           0.76       185
   macro avg       0.71      0.72      0.72       185
weighted avg       0.77      0.76      0.76       185

[[107  24]
 [ 20  34]]
Dokładność: 0.7513513513513513
              precision    recall  f1-score   support

          HC       0.82      0.82      0.82       131
          OC       0.57      0.57      0.57        54

    accuracy                           0.75       185
   macro avg       0.70      0.70      0.70       185
weighted avg       0.75      0.75      0.75       185

[[108  23]
 [ 23  31]]


