In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
import umap
import hdbscan

  from .autonotebook import tqdm as notebook_tqdm


### Wczytanie danych

In [2]:
training_data = pd.read_csv('../classifier_data/HC_OC_training.csv', sep='\t', index_col=0)


print(training_data)

test_data = pd.read_csv('../classifier_data/HC_OC_test.csv', sep='\t', index_col=0)


print(test_data)

                               ENSG00000000419  ENSG00000000460  \
VUMC-HC-0033-TR2591                   3.635790         4.078496   
Vumc-HD-70-TR1062                     4.546459         4.363044   
VUMC-HC0053-DOT-HD-48h-TR3087         3.861049         4.079708   
Vumc-HD-149-TR932                     4.727528         3.937986   
Vumc-HD-36-TR1165                     4.402386         3.862107   
...                                        ...              ...   
TR3544-OVA-LUMC                       4.745738         4.289456   
TR4341-OVA-LUMC                       4.092927         4.020636   
Cath-Ova-CZE-022-TR2770               3.083092         4.274754   
TR3947-OVA-CATH                       3.576816         3.930204   
Cath-Ova-CZE-049-TR2731               3.779636         4.717135   

                               ENSG00000000938  ENSG00000001036  \
VUMC-HC-0033-TR2591                   4.432779         5.074815   
Vumc-HD-70-TR1062                     5.786592         4.4887

In [3]:
# keep only the columns that are in both datasets
# Znalezienie wspólnych kolumn
common_columns = training_data.columns.intersection(test_data.columns)

# Uporządkowanie kolumn w real_data
training_data = training_data[common_columns]

# Uporządkowanie kolumn w synthetic_data
test_data = test_data[common_columns]
# synthetic_data = real_data
# # # # add noise to synthetic data
# synthetic_data = synthetic_data + np.random.normal(0, 0.001, synthetic_data.shape)

In [4]:
training_data.describe()

Unnamed: 0,ENSG00000000419,ENSG00000000460,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001497,ENSG00000001629,ENSG00000001631,ENSG00000002330,ENSG00000002549,...,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168,Group
count,278.0,278.0,278.0,278.0,278.0,278.0,278.0,278.0,278.0,278.0,...,278.0,278.0,278.0,278.0,278.0,278.0,278.0,278.0,278.0,278.0
mean,4.312911,4.110307,5.515457,4.703468,4.535794,4.039753,4.695119,4.58342,4.79615,5.159978,...,7.628701,4.409439,4.577222,4.483288,5.348049,4.459864,6.200487,5.199333,4.72809,0.291367
std,0.805121,0.57853,1.247236,0.522821,0.661682,0.727049,0.558479,0.669908,0.368203,0.909917,...,0.692858,0.861622,0.349168,0.561457,0.591832,1.117826,1.506739,1.177985,0.555509,0.455212
min,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,...,5.105966,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,0.0
25%,3.802979,3.762311,4.696203,4.481264,4.144286,3.506609,4.307304,4.248171,4.584386,4.701116,...,7.308699,3.86972,4.395317,4.167531,4.99807,3.571123,5.030074,4.313747,4.436789,0.0
50%,4.251197,4.136669,5.30562,4.705545,4.600594,4.03658,4.69956,4.561179,4.793947,5.082831,...,7.710763,4.34539,4.572896,4.483368,5.375563,4.179792,6.253969,5.206753,4.73442,0.0
75%,4.74767,4.472794,5.975199,4.986121,4.936263,4.497808,5.062279,4.977241,5.009427,5.572723,...,8.105805,4.882616,4.758044,4.817125,5.680205,5.377742,7.366203,6.003445,5.054724,1.0
max,6.767679,6.067968,9.793131,6.592116,6.372149,6.076003,6.545705,6.873423,6.065469,8.897936,...,8.954455,7.585211,6.557157,6.21178,7.131028,7.817975,11.225842,9.988463,6.606258,1.0


In [5]:
test_data.describe()

Unnamed: 0,ENSG00000000419,ENSG00000000460,ENSG00000000938,ENSG00000001036,ENSG00000001461,ENSG00000001497,ENSG00000001629,ENSG00000001631,ENSG00000002330,ENSG00000002549,...,ENSG00000257923,ENSG00000258890,ENSG00000263563,ENSG00000264538,ENSG00000266356,ENSG00000266714,ENSG00000269028,ENSG00000271043,ENSG00000272168,Group
count,185.0,185.0,185.0,185.0,185.0,185.0,185.0,185.0,185.0,185.0,...,185.0,185.0,185.0,185.0,185.0,185.0,185.0,185.0,185.0,185.0
mean,4.279268,4.120439,5.407516,4.739876,4.582939,3.986007,4.606486,4.573577,4.779196,5.151998,...,7.711113,4.373899,4.555756,4.432652,5.390868,4.350106,6.060914,5.100829,4.7381,0.291892
std,0.721486,0.515487,1.092092,0.470632,0.592127,0.682464,0.504905,0.667676,0.324951,0.771326,...,0.676603,0.735602,0.327084,0.52033,0.442905,0.980247,1.561162,1.192287,0.53748,0.455867
min,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,3.083092,...,5.445328,3.083092,3.083092,3.083092,4.388405,3.083092,3.083092,3.083092,3.083092,0.0
25%,3.811984,3.82352,4.645284,4.529107,4.254529,3.456565,4.320739,4.24296,4.603469,4.718681,...,7.392252,3.950087,4.379889,4.090836,5.059676,3.618149,4.7833,4.274964,4.45336,0.0
50%,4.187932,4.129796,5.259546,4.804204,4.647815,4.049833,4.597338,4.594354,4.772618,5.05978,...,7.755541,4.292373,4.614126,4.41395,5.371258,4.081793,5.955549,4.959521,4.765672,0.0
75%,4.736884,4.439268,6.027215,5.033146,4.942248,4.367981,4.935051,4.998213,4.984027,5.639733,...,8.162565,4.807953,4.749269,4.728381,5.684068,5.013587,7.306161,5.920427,5.028659,1.0
max,6.206412,5.417481,8.477129,5.696946,6.031846,6.363055,5.831794,6.546365,5.639733,7.268793,...,9.436642,6.827322,5.239932,6.250128,6.736777,6.989681,10.265634,9.488314,6.526054,1.0


### Wykorzystanie klasyfikatorów

In [6]:
# klasyfikacja z uyciem svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


def svm_classification(real_data, synthetic_data):
    # Połączenie danych rzeczywistych i syntetycznych
    combined_data = np.vstack((real_data, synthetic_data))

    # Normalizacja danych
    #scaler = StandardScaler()
    #combined_data = scaler.fit_transform(combined_data)

    # Stworzenie etykiet
    labels = np.hstack((np.ones(len(real_data)), np.zeros(len(synthetic_data))))

    # Podział na zbiór treningowy i testowy
    X_train, X_test, y_train, y_test = train_test_split(combined_data, labels, test_size=0.4, random_state=42)

    # Klasyfikacja przy użyciu SVM
    svm = SVC(kernel='linear', random_state=42)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)

    # Wypisanie dokładności i innych metryk
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))


svm_classification(training_data, test_data)

Dokładność: 0.543010752688172
              precision    recall  f1-score   support

         0.0       0.45      0.37      0.41        78
         1.0       0.60      0.67      0.63       108

    accuracy                           0.54       186
   macro avg       0.52      0.52      0.52       186
weighted avg       0.53      0.54      0.54       186

[[29 49]
 [36 72]]


In [8]:
from sklearn.tree import DecisionTreeClassifier
# klasyfikacja z uyciem random forest
def decision_tree_classification(real_data, synthetic_data):
    # Połączenie danych rzeczywistych i syntetycznych
    combined_data = np.vstack((real_data, synthetic_data))

    # Normalizacja danych
    scaler = StandardScaler()
    combined_data = scaler.fit_transform(combined_data)

    labels = combined_data['Group'].values
    combined_data.drop('Group', axis=1)

    print(combined_data)

    # Stworzenie etykiet
    # labels = np.hstack((np.ones(len(real_data)), np.zeros(len(synthetic_data))))

    # Podział na zbiór treningowy i testowy
    X_train, X_test, y_train, y_test = train_test_split(combined_data, labels, test_size=0.4, random_state=42)

    # Klasyfikacja
    dt = DecisionTreeClassifier(max_depth=5, random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)

    # Wypisanie dokładności i innych metryk
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

decision_tree_classification(training_data, test_data)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
from sklearn.ensemble import RandomForestClassifier
# klasyfikacja z uyciem random forest
def random_forest_classification(real_data, synthetic_data):
    # Połączenie danych rzeczywistych i syntetycznych
    combined_data = np.vstack((real_data, synthetic_data))

    # Normalizacja danych
    scaler = StandardScaler()
    combined_data = scaler.fit_transform(combined_data)

    # Stworzenie etykiet
    labels = np.hstack((np.ones(len(real_data)), np.zeros(len(synthetic_data))))

    # Podział na zbiór treningowy i testowy
    X_train, X_test, y_train, y_test = train_test_split(combined_data, labels, test_size=0.4, random_state=42)

    # Klasyfikacja
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Wypisanie dokładności i innych metryk
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

random_forest_classification(training_data, test_data)

Dokładność: 0.5698924731182796
              precision    recall  f1-score   support

         0.0       0.44      0.09      0.15        78
         1.0       0.58      0.92      0.71       108

    accuracy                           0.57       186
   macro avg       0.51      0.50      0.43       186
weighted avg       0.52      0.57      0.48       186

[[ 7 71]
 [ 9 99]]


In [None]:
from xgboost import XGBClassifier

# klasyfikacja z uyciem xgboost
def xgboost_classification(real_data, synthetic_data):
    # Połączenie danych rzeczywistych i syntetycznych
    combined_data = np.vstack((real_data, synthetic_data))
    
    # Normalizacja danych
    scaler = StandardScaler()
    combined_data = scaler.fit_transform(combined_data)

    # Stworzenie etykiet
    labels = np.hstack((np.ones(len(real_data)), np.zeros(len(synthetic_data))))

    # Podział na zbiór treningowy i testowy
    X_train, X_test, y_train, y_test = train_test_split(combined_data, labels, test_size=0.4, random_state=42)

    # Klasyfikacja
    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # Wypisanie dokładności i innych metryk
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

xgboost_classification(training_data, test_data)   

Dokładność: 0.5591397849462365
              precision    recall  f1-score   support

         0.0       0.45      0.26      0.33        78
         1.0       0.59      0.78      0.67       108

    accuracy                           0.56       186
   macro avg       0.52      0.52      0.50       186
weighted avg       0.53      0.56      0.53       186

[[20 58]
 [24 84]]


In [None]:
from sklearn.neural_network import MLPClassifier

# klasyfikacja z uzyciem sieci neuronowych

def nn_classification(real_data, synthetic_data):
    # Połączenie danych rzeczywistych i syntetycznych
    combined_data = np.vstack((real_data, synthetic_data))

    # Normalizacja danych
    scaler = StandardScaler()
    combined_data = scaler.fit_transform(combined_data)

    # Stworzenie etykiet
    labels = np.hstack((np.ones(len(real_data)), np.zeros(len(synthetic_data))))

    # Podział na zbiór treningowy i testowy
    X_train, X_test, y_train, y_test = train_test_split(combined_data, labels, test_size=0.4, random_state=42)

    # Klasyfikacja
    nn = MLPClassifier(hidden_layer_sizes=(100,100), random_state=42)
    nn.fit(X_train, y_train)
    y_pred = nn.predict(X_test)

    # Wypisanie dokładności i innych metryk
    print(f"Dokładność: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

nn_classification(training_data, test_data)

Dokładność: 0.5591397849462365
              precision    recall  f1-score   support

         0.0       0.47      0.36      0.41        78
         1.0       0.60      0.70      0.65       108

    accuracy                           0.56       186
   macro avg       0.53      0.53      0.53       186
weighted avg       0.55      0.56      0.55       186

[[28 50]
 [32 76]]
