## **Install requirements**

In [None]:
!pip install vantage6==3.7.3

**Imports**

In [None]:
import getpass
import time

import numpy as np
import pandas as pd
from vantage6.client import Client

## **Vantage6 Server Connection**

Fill in the Vantage6 server information and your credentials here.

In [None]:
# Connection settings for the Vantage6 server; edit them for your deployment.

server_url = "http://35.157.139.38"
# NOTE(review): 443 is normally the HTTPS port while server_url uses http:// —
# confirm that this scheme/port pair is what the server expects.
server_port = 443  # This is specified when you first created the server
server_api = "/api"  # This is specified when you first created the server

username = "Varsha"
# Ask for the password at run time so it is never stored in the notebook.
password = getpass.getpass(f"Vantage6 password for {username}: ")

# Key handed to client.setup_encryption(); None when you have no organization key.
organization_key = None

Authenticate with the server

In [None]:
# Create the client object and log in with the credentials from the
# configuration cell.
client = Client(server_url, server_port, server_api, verbose=True)
client.authenticate(username, password)

# This call always runs, also when organization_key is None (its value in the
# configuration cell).
# NOTE(review): presumably passing None selects unencrypted communication —
# confirm against the vantage6 client documentation.
client.setup_encryption(organization_key)

Check the node status

In [None]:
# Show the nodes returned by the server for the authenticated user.
# NOTE(review): the variable is named `org` but holds the output of
# client.node.list(), i.e. a node listing rather than an organization.
org = client.node.list()
org

In [None]:
# WARNING: destructive. Deletes every task returned by the listing below.
# Clear the list of tasks (any tasks that cannot be completed will constantly run otherwise)
# Only a single page of at most 1000 tasks is requested, so more tasks than
# that would need the cell to be run again.
tasks = client.task.list(per_page = 1000)

t = tasks['data']
for task in t:
    # Each listed task is removed by its id.
    client.task.delete(id_=task['id'])

In [None]:
# Fetch one result record by its id and print it.
# NOTE(review): 74 is a hard-coded result id, presumably left over from an
# earlier session — replace it with the id you want to inspect.
result = client.result.get(74)
print(result)

## **Step 1 - Query required data**


In [None]:
# Step 1: submit the SPARQL querying task to the aggregator node and block
# until the server reports it as complete.

time_col_name = 'metastasisdays'
event_col_name = 'metastasis'

input_ = {
    'master': True,
    'method': 'master',
    'kwargs': {
        'feature_type': 'Radiomics',  # Clinical, Radiomics or Combined
        'time_col': time_col_name,
        'outcome_col': event_col_name,
        # Required for the Radiomics and Combined models; write '_' wherever
        # the feature name contains '.'.
        'expl_vars': ['Fszm_sze', 'Fdzm_lgze', 'Fstat_skew', 'Fcm_info_corr_1'],
        'organization_ids': [2, 3, 4],  # ids of the participating organizations
    },
}

task = client.task.create(
    name="Querying task",
    description="Send SPARQL queries to fetch required data based on the given feature type",
    image="varshagouthamchand/dcr_sparql_query",
    collaboration=1,
    input=input_,
    organizations=[2],  # aggregator node
    database='rdf',
)

# Poll silently: fetch the task, then sleep three seconds, until it is complete.
task_id = task['id']
task_info = client.task.get(task_id)
while True:
    if task_info.get("complete"):
        break
    task_info = client.task.get(task_id, include_results=True)
    time.sleep(3)

print("Results are ready!")

# Despite its name, result_id carries the task id used to look up the results.
result_id = task_info['id']
result_info = client.result.list(task=result_id)

# Only the first result record is used.
result = result_info['data'][0]['result']
print(result)

In [None]:
# Fetch one result record by its id and print it; this rebinds `result`,
# which the Step 1 cell also assigns.
# NOTE(review): 423 is a hard-coded result id, presumably left over from an
# earlier session — replace it with the id you want to inspect.
result = client.result.get(423)
print(result)

## **Step 2 - Distributed Feature Selection**

Apply **Correlation-based Feature Selection** (Step 2.1) and a **Cox Proportional Hazards Regression Model with LASSO regularization** (Step 2.2) to the distributed training set.

### **Step 2.1 - Correlation-based Feature Selection**


In [None]:
# Step 2.1: run the correlation-based feature selection task and wait for it.
# Note: time_col_name is set here but is not part of the kwargs sent below.
time_col_name = 'recurrencedays'
event_col_name = 'recurrence'

input_ = {
    'master': True,
    'method': 'master',
    'kwargs': {
        # Leave empty to fetch all the common features, or list the features
        # to be considered for the correlation.
        'expl_vars': [],
        'outcome_col': event_col_name,
        'organization_ids': [2, 7, 8],
        'roitype': "Primary",  # Primary, Node or Combined
        'feature_type': 'Radiomics',  # Radiomics or Combined
        'oropharynx': 'yes',
    },
}

task = client.task.create(
    name="Correlation-based Feature Selection",  # fixed typo: was "Correlation-base"
    description="testing corr",
    image="varshagouthamchand/dcr_sparql_corr",
    collaboration=3,
    input=input_,
    organizations=[2],
    database='rdf',
)

print("Waiting for results")
task_id = task['id']
task_info = client.task.get(task_id)
while not task_info.get("complete"):
    print("Waiting for results")
    # Sleep before the next poll, so no 3 s delay follows the poll that
    # reports completion.
    time.sleep(3)
    task_info = client.task.get(task_id, include_results=True)

print("Results are ready!")

# Despite its name, result_id carries the task id used to look up the results.
result_id = task_info['id']
result_info = client.result.list(task=result_id)

# Only the first result record is used.
corr_result = result_info['data'][0]['result']
print(corr_result)
# Keep the correlation matrix on disk and the selected subset in memory.
corr_result['correlationMatrix'].to_csv('corr_result_csv.csv')
best_subset = corr_result['best_subset']
print(best_subset)

### **Step 2.2.1 - Cox Regression with LASSO**

Train a Cox Proportional Hazards Regression Model with LASSO regularization on the training set of each node, using the best subset returned by the CFS (Step 2.1) as predictors.


In [None]:
# Step 2.2.1: submit the LASSO-regularised Cox regression task, wait for it
# and store the returned 'model' entry as lasso_path.csv.
time_col_name = 'metastasisdays'
event_col_name = 'metastasis'

input_ = {
    'master': 1,
    'method': 'master',
    'kwargs': {
        'expl_vars': ['Frlm_rlnu_norm', 'Fmorph_sph_sphericity', 'Fszm_zlnu_norm', 'Fcm_corr'],
        'time_col': time_col_name,
        'outcome_col': event_col_name,
        'feature_type': 'Radiomics',  # Clinical, Radiomics or Combined
        'oropharynx': 'yes',  # yes or no
        'roitype': "Node",  # Primary or Node
        'n_lambda': 50,
        'lambda_range': (0, 50),
        'beta_start': None,
        'epsilon': 0.00000001,
        'epochs': 150,
        'organization_ids': [2, 7, 8],
    },
}

task = client.task.create(
    name="Lasso Cox model",
    description="Train a Cox Regression Proportional Hazard Model with LASSO Regularization on the training set of each node using as predictor the best subset in output from the CFS",
    image="varshagouthamchand/dcr_sparql_lasso",
    collaboration=3,
    input=input_,
    organizations=[2],
    database='rdf',
)

# Poll: fetch the task, report, sleep three seconds, until it is complete.
print("Waiting for results")
task_id = task['id']
task_info = client.task.get(task_id)
while True:
    if task_info.get("complete"):
        break
    task_info = client.task.get(task_id, include_results=True)
    print("Waiting for results")
    time.sleep(3)

print("Results are ready!")

# Despite its name, result_id carries the task id used to look up the results.
result_id = task_info['id']
result_info = client.result.list(task=result_id)

# Only the first result record is used.
lasso_results = result_info['data'][0]['result']
lasso_path = lasso_results['model']
print(lasso_path)
lasso_path.to_csv('lasso_path.csv')


In [None]:
# Re-load a stored LASSO result by its id instead of re-running the task.
# NOTE(review): 216 is a hard-coded result id, presumably left over from an
# earlier session — replace it with the id you want to inspect.
result = client.result.get(216)
print(result)
# The algorithm output sits under the 'result' key of the record.
lasso_results = result['result']
print(lasso_results)
# Its 'model' entry is written to lasso_path.csv (the same file the LASSO
# task cell writes).
lasso_path = lasso_results['model']
print(lasso_path)
lasso_path.to_csv('lasso_path.csv')

In [None]:
## **Coxph** (if needed to run individually)

In [None]:
# Stand-alone Cox proportional hazards run: submit the task, wait for it and
# turn the 'Coef' entry of the result into a plain dict.
time_col_name = 'overallsurvivaldays'
event_col_name = 'survival'
expl_subset = ['lp_Clinical_all', 'lp_Radiomics_Primary', 'lp_Radiomics_Node']
# Alternative predictor sets that were used with this cell:
#expl_subset = ['treatment_Chemotherapy', 'N1orLower', 'hpv_HPV Negative', 'hpv_HPV Positive']
#expl_subset = ['Fszm_lgze', 'Fcm_corr']

input_ = {
    'master': 1,
    'method': 'master',
    'kwargs': {
        # e.g. best_subset or combined_strings
        'expl_vars': expl_subset,
        'time_col': time_col_name,
        'outcome_col': event_col_name,
        'feature_type': 'LP',
        'oropharynx': 'yes',
        'roitype': "Primary",
        'organization_ids': [2, 7, 8],
    },
}

# NOTE(review): collaboration=1 is used here with organizations [2, 7, 8],
# while the correlation and LASSO cells use collaboration=3 for the same
# organizations — confirm which collaboration is intended.
task = client.task.create(
    name="Coxph model",
    description="Train a Cox Regression Proportional Hazard Model using as predictor the subsets in output from LASSO",
    image="varshagouthamchand/dcr_sparql_coxph",
    collaboration=1,
    input=input_,
    organizations=[2],
    database='rdf',
)

# Poll: fetch the task, report, sleep three seconds, until it is complete.
print("Waiting for results")
task_id = task['id']
task_info = client.task.get(task_id)
while True:
    if task_info.get("complete"):
        break
    task_info = client.task.get(task_id, include_results=True)
    print("Waiting for results")
    time.sleep(3)

print("Results are ready!")

# Despite its name, result_id carries the task id used to look up the results.
result_id = task_info['id']
result_info = client.result.list(task=result_id)

# Only the first result record is used.
cox_results = result_info['data'][0]['result']
print(cox_results)
coeff = cox_results['Coef'].to_dict()
print(coeff)


In [None]:
## **Leave one out cross validation for optimal feature selection**

In [None]:
# coxph task 

def coxph(organization_ids, column_names, time_col='overallsurvivaldays',
          outcome_col='survival', feature_type='Radiomics', oropharynx='yes',
          roitype='Primary', collaboration=1, aggregator_org=2,
          poll_interval=3):
    """Run the distributed Cox proportional hazards task and return its coefficients.

    Creates a task for the ``varshagouthamchand/dcr_sparql_coxph`` image through
    the notebook-level ``client``, polls the server until the task is reported
    complete and reads the first result record.

    Parameters
    ----------
    organization_ids : list
        Organization ids forwarded to the algorithm as ``organization_ids``.
    column_names : list
        Feature names forwarded to the algorithm as ``expl_vars``.
    time_col, outcome_col : str
        Forwarded to the algorithm as ``time_col`` and ``outcome_col``.
    feature_type, oropharynx, roitype : str
        Forwarded unchanged to the algorithm under the same names.
    collaboration : int
        Id of the collaboration the task is created in.
        NOTE(review): the default 1 is kept from the original code, but other
        cells in this notebook use collaboration=3 with organizations
        [2, 7, 8] — confirm which one the cross validation needs.
    aggregator_org : int
        Id of the organization the task is sent to.
    poll_interval : int or float
        Seconds to wait between two status checks.

    Returns
    -------
    tuple
        ``(coeff, cox_results)``: ``cox_results`` is the ``'result'`` entry of
        the first result record and ``coeff`` is
        ``cox_results['Coef'].to_dict()``.

    Raises
    ------
    RuntimeError
        If the completed task has no result record or its result is ``None``.
    """
    input_ = {
        'master': 1,
        'method': 'master',
        'kwargs': {
            'expl_vars': column_names,
            'time_col': time_col,
            'outcome_col': outcome_col,
            'feature_type': feature_type,
            'oropharynx': oropharynx,
            'roitype': roitype,
            'organization_ids': organization_ids,
        },
    }

    task = client.task.create(
        name="Coxph model",
        description="Train a Cox Regression Proportional Hazard Model using as predictor the subsets in output from LASSO",
        image="varshagouthamchand/dcr_sparql_coxph",
        collaboration=collaboration,
        input=input_,
        organizations=[aggregator_org],
        database='rdf',
    )

    print("Waiting for results")
    task_id = task['id']
    task_info = client.task.get(task_id)
    while not task_info.get("complete"):
        print("Waiting for results")
        # Sleep before the next poll, so no delay follows the poll that
        # reports completion.
        time.sleep(poll_interval)
        task_info = client.task.get(task_id, include_results=True)

    print("Results are ready!")

    records = client.result.list(task=task_id)['data']
    # Fail with a clear message instead of an IndexError/TypeError further down.
    if not records or records[0]['result'] is None:
        raise RuntimeError(
            f"Task {task_id} completed without a usable result; check the node logs"
        )

    cox_results = records[0]['result']
    coeff = cox_results['Coef'].to_dict()
    return coeff, cox_results
    

In [None]:
# validate task

def validation(output_data, organization_ids, time_col='overallsurvivaldays',
               outcome_col='survival', feature_type='Radiomics',
               oropharynx='yes', roitype='Primary', collaboration=1,
               aggregator_org=2, poll_interval=3):
    """Run the validation task for the given coefficients and return its c-index value.

    Creates a task for the ``varshagouthamchand/dcr_sparql_validation`` image
    through the notebook-level ``client``, polls the server until the task is
    reported complete and reads the first result record.

    Parameters
    ----------
    output_data : list
        Forwarded to the algorithm as ``coefficients``; the cross-validation
        cell passes a one-element list holding the coefficient dict.
    organization_ids : list
        Organization ids forwarded to the algorithm as ``organization_ids``.
    time_col, outcome_col : str
        Forwarded to the algorithm as ``time_col`` and ``outcome_col``.
    feature_type, oropharynx, roitype : str
        Forwarded unchanged to the algorithm under the same names.
    collaboration : int
        Id of the collaboration the task is created in.
        NOTE(review): the default 1 is kept from the original code, but other
        cells in this notebook use collaboration=3 with organizations
        [2, 7, 8] — confirm which one the cross validation needs.
    aggregator_org : int
        Id of the organization the task is sent to.
    poll_interval : int or float
        Seconds to wait between two status checks.

    Returns
    -------
    object
        ``validation_result['cindex'][0]``, where ``validation_result`` is the
        ``'result'`` entry of the first result record.

    Raises
    ------
    RuntimeError
        If the completed task has no result record or its result is ``None``.
    """
    input_ = {
        "master": True,
        "method": "master",
        # kwargs which are inserted into the algorithm
        'kwargs': {
            'coefficients': output_data,
            'time_col': time_col,
            'outcome_col': outcome_col,
            'feature_type': feature_type,
            'oropharynx': oropharynx,
            'roitype': roitype,
            'organization_ids': organization_ids,
        },
    }

    # Send the task to the central server
    task = client.task.create(
        name='validation',
        description="test validation",
        collaboration=collaboration,
        organizations=[aggregator_org],
        image="varshagouthamchand/dcr_sparql_validation",
        input=input_,
        database='rdf',
    )

    task_id = task['id']
    task_info = client.task.get(task_id)
    while not task_info.get("complete"):
        print("Waiting for results")
        # Sleep before the next poll, so no delay follows the poll that
        # reports completion.
        time.sleep(poll_interval)
        task_info = client.task.get(task_id, include_results=True)

    print("Results are ready!")

    records = client.result.list(task=task_id)['data']
    # Fail with a clear message instead of an IndexError/TypeError further down.
    if not records or records[0]['result'] is None:
        raise RuntimeError(
            f"Task {task_id} completed without a usable result; check the node logs"
        )

    validation_result = records[0]['result']
    return validation_result['cindex'][0]

In [None]:
# Cox regression with the selected lambda values, evaluated by leaving one
# organization out per round. The outcome of every step is appended to
# LOOCV.txt as soon as it is available.
# NOTE: this cell expects `validation` to return a single value; a later cell
# in this notebook redefines `validation` to return a tuple, so re-run the
# earlier definition before re-running this cell.
def append_log(*chunks):
    """Append each chunk to LOOCV.txt and close the file again."""
    with open('LOOCV.txt', 'a') as f:
        for chunk in chunks:
            f.write(chunk)


append_log('\nCross Validation results\n')
organization_list = [2, 7, 8]

c_index = []
global_cindex = 0
for val_id in organization_list:
    # The held-out organization validates, all the others train.
    print(val_id)
    append_log(f'Val_id:{val_id}\n')

    train_ids = list(organization_list)
    train_ids.remove(val_id)
    print(train_ids)
    append_log(f'train_ids:{train_ids}\n')

    # Fit the Cox model on the training organizations.
    coeff, cox_results = coxph(organization_ids=train_ids, column_names=['Fmorph_pca_elongation', 'Fcm_inv_var'])
    print(coeff, cox_results)
    append_log(f'coeff:{coeff}\n', f'cox_results:{cox_results}\n')

    # Validate the fitted coefficients on the held-out organization.
    output = validation(output_data=[coeff], organization_ids=[val_id])
    c_index.append(output)
    print(c_index)
    append_log(f'c_index:{c_index}\n\n')

# Average of the per-organization values.
global_cindex = np.mean(c_index)
print(global_cindex)
append_log(f'global_cindex:{global_cindex}\n')

In [None]:
## **Step 2.2.2 - validation** (if needed to run individually)

In [None]:
# Stand-alone validation run: send fixed coefficients to a single
# organization and print the first entry of the returned 'cindex'.
time_col_name = 'recurrencedays'
event_col_name = 'recurrence'
output_data = {'lp_Clinical_all': 0.58609, 'lp_Radiomics_Primary': 0.11941, 'lp_Radiomics_Node': 1.02146}

input_ = {
    "master": True,
    "method": "master",
    # kwargs which are inserted into the algorithm
    'kwargs': {
        'coefficients': [output_data],
        'time_col': time_col_name,
        'outcome_col': event_col_name,
        'feature_type': 'LP',
        'oropharynx': 'yes',
        'roitype': "Primary",
        'organization_ids': [7],
    },
}

# Send the task to the central server
task = client.task.create(
    name='validation',
    description="test validation",
    collaboration=3,
    organizations=[7],
    image="varshagouthamchand/dcr_sparql_validation",
    input=input_,
    database='rdf',
)

# Poll: fetch the task, report, sleep three seconds, until it is complete.
task_id = task['id']
task_info = client.task.get(task_id)
while True:
    if task_info.get("complete"):
        break
    task_info = client.task.get(task_id, include_results=True)
    print("Waiting for results")
    time.sleep(3)

print("Results are ready!")

# Despite its name, result_id carries the task id used to look up the results.
result_id = task_info['id']
result_info = client.result.list(task=result_id)

# Only the first result record is used.
validation_result = result_info['data'][0]['result']
output = validation_result['cindex'][0]
print(output)

In [None]:
# External validate task to run on multiple organizations and get the global c-index

def validation(output_data, organization_ids, time_col='metastasisdays',
               outcome_col='metastasis', feature_type='LP', oropharynx='yes',
               roitype='Primary', collaboration=1, aggregator_org=5,
               poll_interval=3):
    """Run the external validation task and return its c-index value with the full result.

    NOTE: this definition replaces the ``validation`` function defined earlier
    in the notebook, which returns a single value instead of a tuple. Re-run
    that earlier cell before re-running the leave-one-out cross validation.

    Creates a task for the ``varshagouthamchand/dcr_sparql_validation`` image
    through the notebook-level ``client``, polls the server until the task is
    reported complete and reads the first result record.

    Parameters
    ----------
    output_data : list
        Forwarded to the algorithm as ``coefficients``; the calling cell
        passes a one-element list holding the coefficient dict.
    organization_ids : list
        Organization ids forwarded to the algorithm as ``organization_ids``.
    time_col, outcome_col : str
        Forwarded to the algorithm as ``time_col`` and ``outcome_col``.
    feature_type, oropharynx, roitype : str
        Forwarded unchanged to the algorithm under the same names.
    collaboration : int
        Id of the collaboration the task is created in.
    aggregator_org : int
        Id of the organization the task is sent to.
        NOTE(review): the default 5 is kept from the original code and is
        used for every validated organization — confirm this is intended.
    poll_interval : int or float
        Seconds to wait between two status checks.

    Returns
    -------
    tuple
        ``(output, validation_result)``: ``validation_result`` is the
        ``'result'`` entry of the first result record and ``output`` is
        ``validation_result['cindex'][0]``.

    Raises
    ------
    RuntimeError
        If the completed task has no result record or its result is ``None``.
    """
    input_ = {
        "master": True,
        "method": "master",
        # kwargs which are inserted into the algorithm
        'kwargs': {
            'coefficients': output_data,
            'time_col': time_col,
            'outcome_col': outcome_col,
            'feature_type': feature_type,
            'oropharynx': oropharynx,
            'roitype': roitype,
            'organization_ids': organization_ids,
        },
    }

    # Send the task to the central server
    task = client.task.create(
        name='validation',
        description="test validation",
        collaboration=collaboration,
        organizations=[aggregator_org],
        image="varshagouthamchand/dcr_sparql_validation",
        input=input_,
        database='rdf',
    )

    task_id = task['id']
    task_info = client.task.get(task_id)
    while not task_info.get("complete"):
        print("Waiting for results")
        # Sleep before the next poll, so no delay follows the poll that
        # reports completion.
        time.sleep(poll_interval)
        task_info = client.task.get(task_id, include_results=True)

    print("Results are ready!")

    records = client.result.list(task=task_id)['data']
    # Fail with a clear message instead of an IndexError/TypeError further down.
    if not records or records[0]['result'] is None:
        raise RuntimeError(
            f"Task {task_id} completed without a usable result; check the node logs"
        )

    validation_result = records[0]['result']
    output = validation_result['cindex'][0]
    return output, validation_result

In [None]:
# External validation: apply one fixed set of coefficients to each
# organization in turn and average the returned values.
organization_list = [3, 4, 5]
coeff = {'lp_Clinical_all': 0.28912, 'lp_Radiomics_Primary': 0.26349, 'lp_Radiomics_Node': 0.4964}

global_cindex = 0
c_index = []
for val_id in organization_list:
    # Uses the tuple-returning `validation` defined in the cell above.
    output, validation_result = validation(output_data=[coeff], organization_ids=[val_id])
    print(val_id, validation_result, output)
    c_index.append(output)

global_cindex = np.mean(c_index)
print(c_index)
print(global_cindex)

In [None]:
# Submit the task that computes the linear predictors in the individual
# organizations. The coefficients of all three models are combined in one
# dictionary (keyed by model) and passed as input.

time_col_name = 'overallsurvivaldays'
event_col_name = 'survival'
coefficients = {
    'Clinical_all': {'treatment_Chemotherapy': -0.13098, 'N1orLower': -0.94049, 'hpv_HPV Negative': -0.4653, 'hpv_HPV Positive': -2.16508},
    'Radiomics_Node': {'Fszm_lgze': -0.14667, 'Fcm_corr': 0.48828},
    'Radiomics_Primary': {'Fmorph_pca_elongation': 0.04885, 'Fcm_inv_var': -0.08079},
}

input_ = {
    'master': True,
    'method': 'master',
    'kwargs': {
        'oropharynx': 'yes',
        'time_col': time_col_name,
        'outcome_col': event_col_name,
        'coefficients': coefficients,
        'organization_ids': [2, 7, 8],  # ids of the participating organizations
    },
}

# The task name and description were copied from the querying task; they now
# describe what this task does, so it can be told apart on the server.
# NOTE(review): collaboration=1 is used here with organizations [2, 7, 8],
# while the correlation and LASSO cells use collaboration=3 for the same
# organizations — confirm which collaboration is intended.
task = client.task.create(
    name="Linear predictor task",
    description="Compute the linear predictors in each organization from the given model coefficients",
    image="varshagouthamchand/dcr_sparql_lp",
    collaboration=1,
    input=input_,
    organizations=[2],
    database='rdf',
)

task_id = task['id']
task_info = client.task.get(task_id)
while not task_info.get("complete"):
    # Sleep before the next poll, so no 3 s delay follows the poll that
    # reports completion.
    time.sleep(3)
    task_info = client.task.get(task_id, include_results=True)

print("Results are ready!")

# Despite its name, result_id carries the task id used to look up the results.
result_id = task_info['id']
result_info = client.result.list(task=result_id)

# Only the first result record is used.
result = result_info['data'][0]['result']
print(result)