### More results on the HIPC data with CytOpt

In this notebook, we estimate the class proportions in every data set of the HIPC collection, using the Stanford1A data set as the source. Note that estimating the class proportions for the 61 data sets can take up to 2 hours. Instead of running this notebook, one can use the estimation results provided in the file `Res_Estimation_Stan1A.txt` and display them with the notebook `Bland_Altman_Full_Target_HIPC`.

##### Import modules and functions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import sys
sys.path.append("../Functions")
from Tools_CytOpt_Descent_Ascent import *
from Tools_CytOpt_MinMax_Swapping import *

In [2]:
# Fix NumPy's global random seed so that repeated runs draw the same random numbers.
SEED = 2
np.random.seed(SEED)

##### Import data

In [3]:
def _read_hipc_sample(file_prefix, file_index):
    """Load the two CSV files of one HIPC sample from ``../Data``.

    Parameters
    ----------
    file_prefix : str
        Prefix of the sample's file names (for instance ``'W2'``).
    file_index : int
        Number that follows the prefix in the file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(values, clust)`` where ``values`` holds columns 1 to 7 of
        ``<prefix>_<index>_values.csv`` and ``clust`` holds column 1 of
        ``<prefix>_<index>_clust.csv``.
    """
    values = pd.read_csv('../Data/{}_{}_values.csv'.format(file_prefix, file_index),
                         usecols = np.arange(1,8))
    clust = pd.read_csv('../Data/{}_{}_clust.csv'.format(file_prefix, file_index),
                        usecols = [1])
    return values, clust


# File prefixes: Stanford -> W2, Yale -> FTV, Ucla -> IU, Nhlbi -> D54,
# Cimr -> O0, Baylor -> pw, Miami -> pM.

#Patient 1 replicate A

Stanford1A_values, Stanford1A_clust = _read_hipc_sample('W2', 1)
Yale1A_values, Yale1A_clust = _read_hipc_sample('FTV', 1)
Ucla1A_values, Ucla1A_clust = _read_hipc_sample('IU', 1)
Nhlbi1A_values, Nhlbi1A_clust = _read_hipc_sample('D54', 1)
Cimr1A_values, Cimr1A_clust = _read_hipc_sample('O0', 1)
# NOTE(review): Baylor1A reads the 'pw_2' files, i.e. the same files as Baylor1B
# below, whereas every other sample of this group uses index 1. Kept unchanged;
# confirm whether 'pw_1' was intended (and whether that file exists).
Baylor1A_values, Baylor1A_clust = _read_hipc_sample('pw', 2)
Miami1A_values, Miami1A_clust = _read_hipc_sample('pM', 1)

#Patient 1 replicate B

Stanford1B_values, Stanford1B_clust = _read_hipc_sample('W2', 2)
Yale1B_values, Yale1B_clust = _read_hipc_sample('FTV', 2)
Ucla1B_values, Ucla1B_clust = _read_hipc_sample('IU', 2)
Nhlbi1B_values, Nhlbi1B_clust = _read_hipc_sample('D54', 2)
Cimr1B_values, Cimr1B_clust = _read_hipc_sample('O0', 2)
Baylor1B_values, Baylor1B_clust = _read_hipc_sample('pw', 2)
Miami1B_values, Miami1B_clust = _read_hipc_sample('pM', 2)

#Patient 1 replicate C

Stanford1C_values, Stanford1C_clust = _read_hipc_sample('W2', 3)
Yale1C_values, Yale1C_clust = _read_hipc_sample('FTV', 3)
Ucla1C_values, Ucla1C_clust = _read_hipc_sample('IU', 3)
Nhlbi1C_values, Nhlbi1C_clust = _read_hipc_sample('D54', 3)
Cimr1C_values, Cimr1C_clust = _read_hipc_sample('O0', 3)
Baylor1C_values, Baylor1C_clust = _read_hipc_sample('pw', 3)
Miami1C_values, Miami1C_clust = _read_hipc_sample('pM', 3)

#Patient 2 replicate A

Stanford2A_values, Stanford2A_clust = _read_hipc_sample('W2', 4)
Yale2A_values, Yale2A_clust = _read_hipc_sample('FTV', 4)
Ucla2A_values, Ucla2A_clust = _read_hipc_sample('IU', 4)
Nhlbi2A_values, Nhlbi2A_clust = _read_hipc_sample('D54', 4)
Cimr2A_values, Cimr2A_clust = _read_hipc_sample('O0', 4)
Baylor2A_values, Baylor2A_clust = _read_hipc_sample('pw', 4)
Miami2A_values, Miami2A_clust = _read_hipc_sample('pM', 4)

#Patient 2 replicate B

Stanford2B_values, Stanford2B_clust = _read_hipc_sample('W2', 5)
Yale2B_values, Yale2B_clust = _read_hipc_sample('FTV', 5)
Ucla2B_values, Ucla2B_clust = _read_hipc_sample('IU', 5)
Nhlbi2B_values, Nhlbi2B_clust = _read_hipc_sample('D54', 5)
Cimr2B_values, Cimr2B_clust = _read_hipc_sample('O0', 5)
Baylor2B_values, Baylor2B_clust = _read_hipc_sample('pw', 5)
Miami2B_values, Miami2B_clust = _read_hipc_sample('pM', 5)

#Patient 2 replicate C

Stanford2C_values, Stanford2C_clust = _read_hipc_sample('W2', 6)
Yale2C_values, Yale2C_clust = _read_hipc_sample('FTV', 6)
Ucla2C_values, Ucla2C_clust = _read_hipc_sample('IU', 6)
Nhlbi2C_values, Nhlbi2C_clust = _read_hipc_sample('D54', 6)
Cimr2C_values, Cimr2C_clust = _read_hipc_sample('O0', 6)
Baylor2C_values, Baylor2C_clust = _read_hipc_sample('pw', 6)
Miami2C_values, Miami2C_clust = _read_hipc_sample('pM', 6)

#Patient 3 replicate A

Stanford3A_values, Stanford3A_clust = _read_hipc_sample('W2', 7)
Yale3A_values, Yale3A_clust = _read_hipc_sample('FTV', 7)
Ucla3A_values, Ucla3A_clust = _read_hipc_sample('IU', 7)
Nhlbi3A_values, Nhlbi3A_clust = _read_hipc_sample('D54', 7)
Cimr3A_values, Cimr3A_clust = _read_hipc_sample('O0', 7)
Baylor3A_values, Baylor3A_clust = _read_hipc_sample('pw', 7)
Miami3A_values, Miami3A_clust = _read_hipc_sample('pM', 7)

#Patient 3 replicate B

Stanford3B_values, Stanford3B_clust = _read_hipc_sample('W2', 8)
Yale3B_values, Yale3B_clust = _read_hipc_sample('FTV', 8)
Ucla3B_values, Ucla3B_clust = _read_hipc_sample('IU', 8)
Nhlbi3B_values, Nhlbi3B_clust = _read_hipc_sample('D54', 8)
Cimr3B_values, Cimr3B_clust = _read_hipc_sample('O0', 8)
Baylor3B_values, Baylor3B_clust = _read_hipc_sample('pw', 8)
Miami3B_values, Miami3B_clust = _read_hipc_sample('pM', 8)

#Patient 3 replicate C

Stanford3C_values, Stanford3C_clust = _read_hipc_sample('W2', 9)
Yale3C_values, Yale3C_clust = _read_hipc_sample('FTV', 9)
Ucla3C_values, Ucla3C_clust = _read_hipc_sample('IU', 9)
Nhlbi3C_values, Nhlbi3C_clust = _read_hipc_sample('D54', 9)
Cimr3C_values, Cimr3C_clust = _read_hipc_sample('O0', 9)
Baylor3C_values, Baylor3C_clust = _read_hipc_sample('pw', 9)
Miami3C_values, Miami3C_clust = _read_hipc_sample('pM', 9)

In [4]:
# Sample names follow the pattern '<center><patient><replicate>'. They are listed
# replicate by replicate (A, B, C), then patient by patient (1, 2, 3), then center
# by center.
centers = ['Stanford', 'Yale', 'Ucla', 'Nhlbi', 'Cimr', 'Baylor', 'Miami']

Names = ['{}{}{}'.format(center, patient, replicate)
         for replicate in 'ABC'
         for patient in (1, 2, 3)
         for center in centers]

# Gather the previously loaded '<name>_values' and '<name>_clust' data frames,
# in the same order as Names.
Data_Target = [globals()[name + '_values'] for name in Names]

Label_Target = [globals()[name + '_clust'] for name in Names]

Name_class = ['Classe {}'.format(k) for k in range(1, 11)]

In [5]:
# Position, in Names / Data_Target / Label_Target, of the data set used as source.
# Position 0 is the first data set: Stanford1A.
d=0

# The targets are all the data sets but the source one. We work on shallow copies
# so that the full lists stay untouched.
Current_Data_Tar = list(Data_Target)
Current_Lab_Tar = list(Label_Target)
Current_Names = list(Names)
for remaining in (Current_Data_Tar, Current_Lab_Tar, Current_Names):
    remaining.pop(d)

# Preprocessing of the source data set: entries that are not strictly positive are
# multiplied by 0, then the array is rescaled with a default MinMaxScaler.
X_source = np.asarray(Data_Target[d])
X_source = np.multiply(X_source, X_source > 0)
scaler = MinMaxScaler()
X_source = scaler.fit_transform(X_source)

# Labels of the source observations, read from column 'x'.
Lab_source = np.asarray(Label_Target[d]['x'])

# Number of target data sets.
N = len(Current_Data_Tar)

##### Setting of the parameters for the descent-ascent procedure

In [12]:
# Settings passed to cytopt_minmax in the estimation loop
# (as eps, lbd, n_iter, step and power respectively).
eps, lbd = 0.0001, 0.0001
n_iter = 10000
step_grad = 5
power = 0.99

# Result containers: one row per target data set, one column per class.
storage_shape = (N, 10)
h_hat_storage = np.zeros(storage_shape)
h_true_storage = np.zeros(storage_shape)

In [13]:
# Estimate the class proportions of every target data set with CytOpt and store,
# row by row, the estimate (h_hat_storage) and the benchmark (h_true_storage).
t0 = time.time()

# Number of classes, read from the width of the storage arrays.
n_classes = h_true_storage.shape[1]

for it, (X_tar, Lab_tar) in enumerate(zip(Current_Data_Tar, Current_Lab_Tar)):

    # Current_Names is aligned with Current_Data_Tar (the source was removed from
    # both), so this prints the right name whatever the source index d is.
    print(Current_Names[it])

    #Preprocessing of the target data set, same steps as for the source.
    #A fresh scaler is fitted on each target so that the `scaler` object fitted
    #on the source data is not silently re-fitted.
    X_tar = np.asarray(X_tar)
    X_tar = X_tar * (X_tar > 0)
    X_tar = MinMaxScaler().fit_transform(X_tar)

    #Computation of the benchmark proportions in the target data set:
    #share of the observations labelled k, for k = 1, ..., n_classes.
    Lab_tar = np.asarray(Lab_tar['x'])
    h_true = np.array([np.mean(Lab_tar == k) for k in range(1, n_classes + 1)])
    h_true_storage[it,:] = h_true

    #Class proportions estimation with CytOpt
    #(only the first element returned by cytopt_minmax is kept).
    h_hat = cytopt_minmax(X_source, X_tar, Lab_source, eps=eps, lbd=lbd, n_iter=n_iter,
                  step=step_grad, power=power, monitoring=False)[0]
    h_hat_storage[it,:] = h_hat

# Wall-clock duration of the whole loop, in seconds.
elapsed_time = time.time() - t0

Yale1A
Ucla1A
Nhlbi1A
Cimr1A
Baylor1A
Miami1A


KeyboardInterrupt: 

In [14]:
# Report the duration of the estimation loop, converted from seconds to minutes.
elapsed_minutes = elapsed_time/60
print('Elapsed time :', elapsed_minutes, ' Mins')

NameError: name 'elapsed_time' is not defined

Storage of the estimation results in dataframes.

In [15]:
# Both tables share the same layout: one row per target data set (labelled with
# its name) and one column per class.
class_columns = ['Classe {}'.format(k) for k in range(1, 11)]

# Proportions estimated with CytOpt.
h_hat_data_frame = pd.DataFrame(h_hat_storage, index = Current_Names, columns = class_columns)

# Benchmark proportions computed from the labels of the target data sets.
h_true_data_frame = pd.DataFrame(h_true_storage, index = Current_Names, columns = class_columns)

##### Storage of the results in csv files

In [20]:
# Write both tables as CSV in the working directory; the file names carry the name
# of the source data set.
for results_frame, file_template in ((h_hat_data_frame, 'Res_Estimation_Source_{}.txt'),
                                     (h_true_data_frame, 'True_proportion_Source_{}.txt')):
    results_frame.to_csv(file_template.format(Names[d]))