In [1]:
!pip install wandb -qU

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.5/258.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score
from sklearn.preprocessing import normalize
from sklearn import metrics

from google.colab import drive

import numpy as np
import pandas as pd
import time
import os   # This module is used for interacting with the operating system. It provides a way to work with files and directories.

import tensorflow as tf
import wandb
from wandb.keras import WandbMetricsLogger, WandbModelCheckpoint

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [7]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# --- Load data and setup directories ---
drive.mount('/content/gdrive')
!unzip gdrive/MyDrive/TCGA-BRCA_1079.zip

Mounted at /content/gdrive
Archive:  gdrive/MyDrive/TCGA-BRCA_1079.zip
  inflating: TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv  
  inflating: TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv  
  inflating: TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv  


In [5]:
# --- Prepare labels ---
subtype_labels_frame = pd.read_csv("TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv", sep="\t")
subtype_label_dataset = subtype_labels_frame.values
print ("Shape of subtype label dataset: ", subtype_label_dataset.shape)
print(subtype_label_dataset)
subtype_labels = subtype_label_dataset[:,1]
print ("Shape of subtype labels: ", subtype_labels.shape)
print(subtype_labels)

subtype_label_set = set(subtype_labels)
print(subtype_label_set)

subtype_labels = [1 if item == "Basal" else item for item in subtype_labels]
subtype_labels = [2 if item == "Her2" else item for item in subtype_labels]
subtype_labels = [3 if item == "LumA" else item for item in subtype_labels]
subtype_labels = [4 if item == "LumB" else item for item in subtype_labels]
subtype_labels = [5 if item == "Normal" else item for item in subtype_labels]
subtype_labels = np.array(subtype_labels)
print(subtype_labels.shape)
print(subtype_labels)

Shape of subtype label dataset:  (1079, 34)
[['TCGA-3C-AAAU-01' 'LumB' 0 ... 2 1 0]
 ['TCGA-3C-AALI-01' 'Her2' 0 ... 3 0 0]
 ['TCGA-3C-AALJ-01' 'LumB' 0 ... 3 0 0]
 ...
 ['TCGA-XX-A89A-01' 'LumA' 0 ... 1 0 0]
 ['TCGA-Z7-A8R5-01' 'LumA' 0 ... 1 0 0]
 ['TCGA-Z7-A8R6-01' 'LumB' 0 ... 3 0 0]]
Shape of subtype labels:  (1079,)
['LumB' 'Her2' 'LumB' ... 'LumA' 'LumA' 'LumB']
{'Normal', 'LumA', 'Her2', 'Basal', 'LumB'}
(1079,)
[4 2 4 ... 3 3 4]


In [8]:
# --- Prepare input data ---
input_data_frame = pd.read_csv("TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv", sep="\t")   # read .tsv file into memory
input_data = input_data_frame.values  # retrieve values as numpy array
print ("Shape of input data: ", input_data.shape) # rows are genes and columns are samples
print(input_data)

input_data = np.transpose(input_data) # rows are samples and columns are genes
input_data = np.delete(input_data, (0), axis=0) # delete first row of the data matrix
print ("Shape of prepared input data: ", input_data.shape)
print(input_data)

input_data = np.asarray(input_data).astype("float32") # Every character/number encoded with 32 Bit
print("--- Converted to Float32 ---")
print(input_data)

data = input_data
std_scaler = StandardScaler() # Standardize the dataset features so that each feature has mean=0 and standard_deviation=1
scaled_embeddings = std_scaler.fit_transform(data)
pca = PCA(n_components=10)
projected_embeddings = pca.fit_transform(scaled_embeddings)

np.savetxt("PCA_EM.txt", projected_embeddings)

# --- initialize a new run (<=> single unit of computation) ---
run = wandb.init(project="PCA-projected-BRCA-clustering", # Set the project where this run will be logged
                 name="10_PCAs_new_subtype_comparison", # We pass a run name otherwise it’ll be randomly assigned
                 # Track hyperparameters and run metadata
                 config={
                     "dataset": "BRCA_pca_projected",
                     "number_of_pca": 10,
                     "clustering_algorithm": "k-Means",
                     "centroid_initializations": 20,
                     "performance_aggregation": "mean of all initializations"
                 })

for typenum in range(2, 7, 1):
    all_silhouette = []
    all_DBI = []
    all_adj_rand = []
    for i in range(20):
        clf = KMeans(n_clusters=typenum)
        clf.fit(projected_embeddings)
        labels = clf.labels_
        silhouetteScore = silhouette_score(projected_embeddings, labels, metric='euclidean')
        all_silhouette.append(silhouetteScore)
        davies_bouldinScore = davies_bouldin_score(projected_embeddings, labels)
        all_DBI.append(davies_bouldinScore)
        adj_rand = adjusted_rand_score(subtype_labels, labels)
        all_adj_rand.append(adj_rand)
    avg_silhouette = np.mean(all_silhouette)
    std_silhouette = np.std(all_silhouette)
    avg_DBI = np.mean(all_DBI)
    std_DBI = np.std(all_DBI)
    avg_adj_rand = np.mean(all_adj_rand)
    std_adj_rand = np.std(all_adj_rand)
    print("Silhouette: ", avg_silhouette)
    print("DBI: ", avg_DBI)
    print("Adjusted Rand: ", avg_adj_rand)
    # Log performance metrics to W&B
    wandb.log({
              "mean_silhouette_score_k_equals_{k}".format(k=typenum): avg_silhouette,
              "mean_davis_bouldin_score_k_equals_{k}".format(k=typenum): avg_DBI,
              "std_silhouette_score_k_equals_{k}".format(k=typenum): std_silhouette,
              "std_davis_bouldin_score_k_equals_{k}".format(k=typenum): std_DBI
          })
    if typenum==5:
      wandb.log({
          "mean_adj_rand_index_k_equals_{k}".format(k=typenum): avg_adj_rand,
          "std_adj_rand_index_k_equals_{k}".format(k=typenum): std_adj_rand
      })

# Mark the run as finished
wandb.finish()

Shape of input data:  (17162, 1080)
[['A1BG' 0.1719444908272692 0.7300083449184781 ... 0.614468050540426
  1.3458846686508648 0.7186659277414379]
 ['A1CF' -0.814924951368837 2.386837959210663 ... 3.016626468680079
  -0.814924951368837 0.2356343444931301]
 ['A2M' -0.946474342215696 -0.5321448682087864 ... 0.5601782905367024
  1.512226893819039 -0.6446442624050204]
 ...
 ['DHCR7' 0.9104445032812696 0.3842357893348481 ... 0.7718469491250562
  -0.4217551872682497 -0.970509103357415]
 ['TMEM45B' -1.9635765960806173 1.268061022702539 ... -0.3477150909879061
  0.4043269887748967 -0.5444141861092432]
 ['GPR160' 0.6427457224400778 1.759067645972089 ... 0.3169998035848742
  -0.7956005603903231 -0.1338926768376846]]
Shape of prepared input data:  (1079, 17162)
[[0.1719444908272692 -0.814924951368837 -0.946474342215696 ...
  0.9104445032812696 -1.9635765960806173 0.6427457224400778]
 [0.7300083449184781 2.386837959210663 -0.5321448682087864 ...
  0.3842357893348481 1.268061022702539 1.759067645972

[34m[1mwandb[0m: Currently logged in as: [33mcosybio-compsysmed[0m. Use [1m`wandb login --relogin`[0m to force relogin




Silhouette:  0.22463596
DBI:  1.9297896767332987
Adjusted Rand:  0.2626025415337848




Silhouette:  0.19449611
DBI:  1.736621919444622
Adjusted Rand:  0.24951801231155096




Silhouette:  0.16023172
DBI:  1.7224705470384838
Adjusted Rand:  0.296049182232616




Silhouette:  0.14838943
DBI:  1.6826989790537206
Adjusted Rand:  0.23220504722567167




Silhouette:  0.14408751
DBI:  1.7106436785602486
Adjusted Rand:  0.2669278972164624


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
mean_adj_rand_index_k_equals_5,▁
mean_davis_bouldin_score_k_equals_2,▁
mean_davis_bouldin_score_k_equals_3,▁
mean_davis_bouldin_score_k_equals_4,▁
mean_davis_bouldin_score_k_equals_5,▁
mean_davis_bouldin_score_k_equals_6,▁
mean_silhouette_score_k_equals_2,▁
mean_silhouette_score_k_equals_3,▁
mean_silhouette_score_k_equals_4,▁
mean_silhouette_score_k_equals_5,▁

0,1
mean_adj_rand_index_k_equals_5,0.23221
mean_davis_bouldin_score_k_equals_2,1.92979
mean_davis_bouldin_score_k_equals_3,1.73662
mean_davis_bouldin_score_k_equals_4,1.72247
mean_davis_bouldin_score_k_equals_5,1.6827
mean_davis_bouldin_score_k_equals_6,1.71064
mean_silhouette_score_k_equals_2,0.22464
mean_silhouette_score_k_equals_3,0.1945
mean_silhouette_score_k_equals_4,0.16023
mean_silhouette_score_k_equals_5,0.14839
