Run
1. Import
2. Select Seed Method
3. Single attribute test sets
4. Double attribute test sets
5. Multiple attribute test sets

Use the "Create test sets" methods (the `crate_test_set*` functions below) to create customized test sets.

## 1. Import

In [6]:

import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, HTML

# Feature-name constants. Each lowercase fN holds the "... continuous" variant
# of a label; the matching uppercase FN holds the same label without that suffix.
f1 = 'EOD 10 - positive lymph nodes examined continuous'
F1 = 'EOD 10 - positive lymph nodes examined'

f2 = 'EOD 10 - number of lymph nodes examined continuous'
F2 = 'EOD 10 - number of lymph nodes examined'

f3 = 'CS Tumor size continuous'
F3 = 'CS Tumor size'

# f4 and y hold the same string.
# NOTE(review): presumably the name of the target/label column - confirm.
f4 = 'Survived cancer for 60 months'
y = 'Survived cancer for 60 months'

# Grade column names; the test-set builders in this notebook reset these
# columns to 0 and set exactly one of them to 1 (one-hot encoding).
grades = ['Grade 1', 'Grade 2', 'Grade 3', 'Grade 4', 'Grade 9']

%pwd
%cd '/home/tanmoysarkar/Trustworthiness/SEER/seer_data'

/raid/home/tanmoysarkar/Trustworthiness/SEER/seer_data


## 2. Select Seed Method

In [14]:
def select_seeds(count = 3):
    """Load the validation split and return the first `count` positive rows as seeds.

    Reads, relative to the current working directory:
      - ../processed_data_TS_2/y_valid.npy             (labels)
      - ../processed_data_TS_2/X_valid_normalized.npy  (feature matrix)
      - ../processed_data_TS_2/input_columns.txt       (one column name per line)

    The last name listed in input_columns.txt is discarded; the remaining names
    become the feature column names.
    NOTE(review): presumably that last entry is the target column - confirm.

    Parameters
    ----------
    count : int
        Number of seed rows to take.

    Returns
    -------
    (seeds, X_y_valid_df)
        X_y_valid_df : the feature columns plus a 'true y' label column.
        seeds : a copy of the first `count` rows whose 'true y' equals 1.
    """
    # Labels are loaded first, then the normalized features.
    labels = np.load('../processed_data_TS_2/y_valid.npy')
    features = np.load('../processed_data_TS_2/X_valid_normalized.npy')

    with open('../processed_data_TS_2/input_columns.txt', 'r') as file:
        column_names = file.read().splitlines()
    column_names.pop()  # drop the last listed name

    # Features and labels side by side in a single frame.
    X_y_valid_df = pd.concat(
        [
            pd.DataFrame(features, columns=column_names),
            pd.DataFrame(labels, columns=['true y']),
        ],
        axis=1,
    )

    is_positive = X_y_valid_df['true y'] == 1
    seeds = X_y_valid_df.loc[is_positive].iloc[:count].copy()
    return seeds, X_y_valid_df
    

## 3. Create test sets Method: Single attribute

This code segment creates a test set for a feature `f` (for example, CS Tumor size) by varying its value across the min-to-max range of the validation data, in normalized form.

> start_value = min(attribute_val_list) <br>
> end_value = max(attribute_val_list) + 1.2 <br>
> increment = 0.01 <br>

In [3]:
def crate_test_set(save_path = '../Test2/Pos_lymph', 
                   file_name = 'pos_lymph_valid_norm_range_0_25_seed_',
                   f = 'EOD 10 - positive lymph nodes examined continuous', 
                   seed_count = 3, increment = 0.01, end_padding = 1.2):
    """Build and save one single-attribute sweep test set per seed.

    For each of the first `seed_count` seeds returned by `select_seeds`, the
    seed row is repeated once per sweep value and column `f` is overwritten
    with the sweep values. The sweep runs from min(f) to max(f) + end_padding
    (both taken over the whole validation frame) in steps of `increment`.

    Files written for seed i, with prefix = save_path + "/" + file_name + str(i):
        <prefix>_x.csv   generated rows (including the index column)
        <prefix>_x.npy   the same rows as an array
        <prefix>_y.npy   labels: 0 for the first row, 1 for every other row

    Parameters
    ----------
    save_path : str
        Output directory; created if it does not exist.
    file_name : str
        File-name prefix; the seed index is appended to it.
    f : str
        Name of the column to sweep.
    seed_count : int
        Number of seeds, and therefore of test sets, to generate.
    increment : float
        Step between consecutive sweep values; must be positive.
    end_padding : float
        Amount added to max(f) before sweeping. Defaults to 1.2, the value
        that was previously hard-coded for every feature.

    Raises
    ------
    ValueError
        If `increment` is not positive or the sweep range is empty.
    """
    if increment <= 0:
        raise ValueError("increment must be positive, got {!r}".format(increment))

    # select_seeds loads the validation set and returns it together with the seeds.
    seeds, X_y_valid_df = select_seeds(seed_count)

    start_value = X_y_valid_df[f].min()
    end_value = X_y_valid_df[f].max() + end_padding

    # np.arange instead of pd.np.arange: the pd.np alias was removed in pandas 2.0.
    sweep_values = np.arange(start_value, end_value + increment, increment)
    n_rows = len(sweep_values)
    if n_rows == 0:
        raise ValueError("empty sweep range for column {!r}".format(f))

    # The labels are identical for every seed: first row 0, remaining rows 1.
    labels = np.array([0] + [1] * (n_rows - 1))

    # Create the directory if it doesn't exist
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        print("new directory created: ", save_path)

    for seed_idx in range(seed_count):
        # Seed features without the label column, repeated once per sweep value.
        seed_features = seeds.iloc[seed_idx].drop('true y')
        result_df = pd.DataFrame(
            np.tile(seed_features.to_numpy(), (n_rows, 1)),
            columns=seed_features.index,
        )
        result_df[f] = sweep_values

        print("seed: ", seed_idx)
        display(result_df)  # Display the result
        print(result_df.shape)

        full_file_name = save_path + "/" + file_name + str(seed_idx)

        result_df.to_csv(full_file_name + '_x.csv')
        result_np = np.array(result_df)
        print(result_np.shape)
        np.save(full_file_name + '_x.npy', result_np)

        print(labels.shape)
        np.save(full_file_name + '_y.npy', labels)
        print("File saved at: ", full_file_name)
        

### CS_Tumor_Size_valid


In [None]:
# Sweep 'CS Tumor size continuous' in steps of 0.01 for 3 seeds; files are
# written under ../Test2/CS_Tumor_Size.
# NOTE(review): the file prefix is spelled 'CS_Turmor_Size'; anything that loads
# these files must use the same spelling.
crate_test_set(save_path = '../Test2/CS_Tumor_Size', 
                         file_name = 'CS_Turmor_Size_valid_norm_range_0_42_seed_',
                         f = 'CS Tumor size continuous', 
                         seed_count = 3, increment = 0.01
                        )


### EOD 10 - positive lymph nodes examined continuous

In [None]:
# Sweep 'EOD 10 - positive lymph nodes examined continuous' in steps of 0.01
# for 3 seeds; files are written under ../Test2/Pos_lymph.
crate_test_set(save_path = '../Test2/Pos_lymph', 
                         file_name = 'pos_lymph_valid_norm_range_0_25_seed_',
                         f = 'EOD 10 - positive lymph nodes examined continuous', 
                         seed_count = 3, increment = 0.01
                        )


### EOD 10 - number of lymph nodes examined continuous

In [None]:
# Sweep 'EOD 10 - number of lymph nodes examined continuous' in steps of 0.01
# for 3 seeds; files are written under ../Test2/Num_lymph.
crate_test_set(save_path = '../Test2/Num_lymph', 
                         file_name = 'num_lymph_valid_norm_range_0_10_seed_',
                         f = 'EOD 10 - number of lymph nodes examined continuous', 
                         seed_count = 3, increment = 0.01
                        )


## 4: Create Test Set: Double attribute (3D)

In [65]:
def crate_test_set_3D(save_path = '../Test2/Pos_lymph', 
                      file_name = 'pos_lymph_valid_norm_range_0_25_seed_',
                      f1 = 'EOD 10 - positive lymph nodes examined continuous', 
                      f2 = 'EOD 10 - number of lymph nodes examined continuous', 
                      seed_count = 3, increment_f1 = 0.2, increment_f2 = 0.2):
    """Build and save one two-attribute grid test set per seed.

    For each of the first `seed_count` seeds returned by `select_seeds`, the
    seed row is repeated over the full grid of (f1, f2) values. Each feature
    is swept from its minimum to its maximum over the validation frame, using
    its own increment. Rows are ordered with f1 as the outer (slow) axis and
    f2 as the inner (fast) axis.

    Files written for seed i, with prefix = save_path + "/" + file_name + str(i):
        <prefix>_x.csv   generated rows (including the index column)
        <prefix>_x.npy   the same rows as an array
        <prefix>_y.npy   labels: 0 for the first row, 1 for every other row

    Parameters
    ----------
    save_path : str
        Where to save the test set (directory); created if missing.
    file_name : str
        Prefix of the saved file name; the seed index is appended to it.
    f1 : str
        Feature 1 (outer axis of the grid).
    f2 : str
        Feature 2 (inner axis of the grid).
    seed_count : int
        The number of seeds. Each seed is used to create a new test set.
    increment_f1 : float
        Increment of feature 1; must be positive.
    increment_f2 : float
        Increment of feature 2; must be positive.

    Raises
    ------
    ValueError
        If either increment is not positive.
    """
    if increment_f1 <= 0 or increment_f2 <= 0:
        raise ValueError(
            "increments must be positive, got {!r} and {!r}".format(increment_f1, increment_f2)
        )

    # select_seeds loads the validation set and returns it together with the seeds.
    seeds, X_y_valid_df = select_seeds(seed_count)

    # np.arange instead of pd.np.arange: the pd.np alias was removed in pandas 2.0.
    values_f1 = np.arange(X_y_valid_df[f1].min(),
                          X_y_valid_df[f1].max() + increment_f1,
                          increment_f1)
    values_f2 = np.arange(X_y_valid_df[f2].min(),
                          X_y_valid_df[f2].max() + increment_f2,
                          increment_f2)

    # Flattened grid: every f1 value is paired with every f2 value, f1 outermost.
    grid_f1 = np.repeat(values_f1, len(values_f2))
    grid_f2 = np.tile(values_f2, len(values_f1))
    n_rows = len(grid_f1)

    # The labels are identical for every seed: first row 0, remaining rows 1.
    labels = np.array([0] + [1] * (n_rows - 1))

    # Create the directory if it doesn't exist
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        print("new directory created: ", save_path)

    for seed_idx in range(seed_count):
        # Seed features without the label column, repeated once per grid point.
        seed_features = seeds.iloc[seed_idx].drop('true y')
        result_df = pd.DataFrame(
            np.tile(seed_features.to_numpy(), (n_rows, 1)),
            columns=seed_features.index,
        )
        result_df[f1] = grid_f1
        result_df[f2] = grid_f2

        print("seed: ", seed_idx)
        display(result_df)  # Display the result
        print(result_df.shape)

        full_file_name = save_path + "/" + file_name + str(seed_idx)

        result_df.to_csv(full_file_name + '_x.csv')
        result_np = np.array(result_df)
        print(result_np.shape)
        np.save(full_file_name + '_x.npy', result_np)

        print(labels.shape)
        np.save(full_file_name + '_y.npy', labels)
        print("File saved at: ", full_file_name)
        

In [None]:
# Grid over tumor size (step 0.5) x number of lymph nodes examined (step 0.2)
# for 3 seeds; files are written under ../Test2/CS_Tumor_Num_lymph.
crate_test_set_3D(save_path = '../Test2/CS_Tumor_Num_lymph', 
                             file_name = 'CS_Tumor_Num_lymph_valid_norm_range_seed_',
                             f1 = 'CS Tumor size continuous', 
                             f2 = 'EOD 10 - number of lymph nodes examined continuous', 
                             seed_count = 3, increment_f1 = 0.5, increment_f2 = 0.2)

In [None]:
# Grid over tumor size (step 0.5) x positive lymph nodes examined (step 0.2)
# for 3 seeds; files are written under ../Test2/CS_Tumor_Pos_lymph.
crate_test_set_3D(save_path = '../Test2/CS_Tumor_Pos_lymph', 
                             file_name = 'CS_Tumor_Pos_lymph_valid_norm_range_seed_',
                             f1 = 'CS Tumor size continuous', 
                             f2 = 'EOD 10 - positive lymph nodes examined continuous', 
                             seed_count = 3, increment_f1 = 0.5, increment_f2 = 0.2)

In [None]:
# Same feature pair as the previous cell, with coarser steps (0.6 and 0.3).
# NOTE(review): save_path and file_name are identical to the previous cell, so
# running this cell overwrites the files that cell produced - confirm intended.
crate_test_set_3D(save_path = '../Test2/CS_Tumor_Pos_lymph', 
                             file_name = 'CS_Tumor_Pos_lymph_valid_norm_range_seed_',
                             f1 = 'CS Tumor size continuous', 
                             f2 = 'EOD 10 - positive lymph nodes examined continuous', 
                             seed_count = 3, increment_f1 = 0.6, increment_f2 = 0.3)

### Grade test set 

In [42]:
def crate_test_set_grades(save_path = '../Test2/Grades', 
                          file_name = 'Grades_valid_norm_',
                          f = "Grade 1", group = "all"):
    """Build and save a test set in which every validation row has grade `f`.

    Takes the validation frame returned by `select_seeds`, optionally keeps
    only one class, resets all grade columns to 0 and sets column `f` to 1
    for every row.

    Files written, with prefix = save_path + "/" + f + "/" + file_name:
        <prefix>_x.csv   generated rows (including the index column)
        <prefix>_x.npy   the same rows as an array
        <prefix>_y.npy   labels: 0 for the first row, 1 for every other row
    NOTE(review): the saved labels are not the rows' true labels - confirm
    that this placeholder pattern is what the consumer expects.

    Parameters
    ----------
    save_path : str
        Base output directory; a sub-directory named after `f` is created in it.
    file_name : str
        Prefix of the saved file names.
    f : str
        Grade column to switch on; must be one of the known grade columns.
    group : int or str
        1 keeps only rows with 'true y' == 1, 0 keeps only rows with
        'true y' == 0, any other value (default "all") keeps every row.

    Raises
    ------
    ValueError
        If `f` is not a known grade column or no rows remain after filtering.
    """
    grades = ['Grade 1', 'Grade 2', 'Grade 3', 'Grade 4', 'Grade 9']
    if f not in grades:
        # An unknown name would otherwise silently append a new feature column.
        raise ValueError("f must be one of {!r}, got {!r}".format(grades, f))

    # Only the full validation frame is needed here; the seeds are ignored.
    _, X_y_valid_df = select_seeds(1)

    if group == 1: # survived class
        X_y_valid_df = X_y_valid_df[X_y_valid_df['true y'] == 1]
    elif group == 0:
        X_y_valid_df = X_y_valid_df[X_y_valid_df['true y'] == 0]

    if X_y_valid_df.shape[0] == 0:
        raise ValueError("no validation rows for group {!r}".format(group))

    # Whole-frame assignment instead of building one DataFrame per row.
    result_df = X_y_valid_df.drop(columns='true y').reset_index(drop=True)
    for g in grades:
        result_df[g] = 0 # reset all one-hot encoded grades to 0
    result_df[f] = 1 # set selected grade to 1

    display(result_df)  # Display the result
    print(result_df.shape)

    # Create the directory if it doesn't exist
    save_path = save_path + "/" + f + "/"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        print("new directory created: ", save_path)

    full_file_name = save_path + file_name

    result_df.to_csv(full_file_name + '_x.csv')
    result_np = np.array(result_df)
    print(result_np.shape)
    np.save(full_file_name + '_x.npy', result_np)

    # First row labelled 0, every remaining row labelled 1.
    y_valid = np.array([0] + [1] * (result_np.shape[0] - 1))
    print(y_valid.shape)

    np.save(full_file_name + '_y.npy', y_valid)
    print("File saved at: ", full_file_name)
        

In [None]:
# Generate one test set per grade column. `group` is left at its default, so
# every validation row is used; each grade gets its own sub-directory under
# ../Test2/Grades.
grades = ['Grade 1', 'Grade 2', 'Grade 3', 'Grade 4', 'Grade 9']

for g in grades:
    crate_test_set_grades(save_path = '../Test2/Grades', 
                                 file_name = 'Grades_valid_norm',
                                 f = g)

## 5. Multi-attribute test sets

In [None]:

def crate_test_set_multi(save_path = '../Test2/Multi', 
                         file_name = 'Multi_valid_norm_',
                         class_ = 2, seed = None):
    """Build and save a test set with several attributes perturbed at once.

    For every (optionally class-filtered) validation row returned by
    `select_seeds`:
      * the positive-lymph-node column gets (randrange(11, 84) - 1.2) / 3.28
      * the tumor-size column gets (randrange(51, 984) - 21.39) / 22.958
      * all grade columns are reset to 0 and 'Grade 4' is set to 1

    Files written, with prefix = save_path + "/" + file_name:
        <prefix>_x.csv   generated rows (including the index column)
        <prefix>_x.npy   the same rows as an array
        <prefix>_y.npy   labels: 0 for the first row, 1 for every other row

    Parameters
    ----------
    save_path : str
        Output directory; created if it does not exist.
    file_name : str
        Prefix of the saved file names.
    class_ : int
        1 keeps only rows with 'true y' == 1, 0 keeps only rows with
        'true y' == 0, any other value (default 2) keeps every row.
    seed : int or None
        When given, the random draws come from a private `random.Random(seed)`
        so the generated set is reproducible. When None, the module-level
        `random` generator is used.

    Raises
    ------
    ValueError
        If no rows remain after filtering.
    """
    f1 = 'EOD 10 - positive lymph nodes examined continuous'
    f3 = 'CS Tumor size continuous'
    grades = ['Grade 1', 'Grade 2', 'Grade 3', 'Grade 4', 'Grade 9']
    target_grade = 'Grade 4'

    # Denormalization noted by the original author (raw = normalized * scale + offset):
    #   positive lymph nodes:  * 3.281532853931098 + 1.206322788652106
    #   number of lymph nodes: * 7.364361918247256 + 6.743024148646638
    #   tumor size:            * 22.95831813662134 + 21.392468773994878
    # The draws below apply the inverse with rounded constants.
    # NOTE(review): presumably randrange picks a raw-scale value that is then
    # normalized - confirm the intended raw ranges (11..83 and 51..983).

    rng = random if seed is None else random.Random(seed)

    # Only the full validation frame is needed here; the seeds are ignored.
    _, X_y_valid_df = select_seeds(1)

    if class_ == 1: # survived class
        X_y_valid_df = X_y_valid_df[X_y_valid_df['true y'] == 1]
    elif class_ == 0:
        X_y_valid_df = X_y_valid_df[X_y_valid_df['true y'] == 0]

    n_rows = X_y_valid_df.shape[0]
    if n_rows == 0:
        raise ValueError("no validation rows for class_ {!r}".format(class_))

    # Draw per row, lymph-node value first and tumor size second.
    pos_lymph_values = []
    tumor_size_values = []
    for _ in range(n_rows):
        pos_lymph_values.append((rng.randrange(11, 84) - 1.2) / 3.28)
        tumor_size_values.append((rng.randrange(51, 984) - 21.39) / 22.958)

    # Whole-frame assignment instead of building one DataFrame per row.
    result_df = X_y_valid_df.drop(columns='true y').reset_index(drop=True)
    result_df[f3] = tumor_size_values
    result_df[f1] = pos_lymph_values
    for g in grades:
        result_df[g] = 0
    result_df[target_grade] = 1

    display(result_df)  # Display the result
    print(result_df.shape)

    # Create the directory if it doesn't exist
    save_path = save_path + "/"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        print("new directory created: ", save_path)

    full_file_name = save_path + file_name

    result_df.to_csv(full_file_name + '_x.csv')
    result_np = np.array(result_df)
    print(result_np.shape)
    np.save(full_file_name + '_x.npy', result_np)

    # First row labelled 0, every remaining row labelled 1.
    y_valid = np.array([0] + [1] * (result_np.shape[0] - 1))
    print(y_valid.shape)

    np.save(full_file_name + '_y.npy', y_valid)
    print("File saved at: ", full_file_name)
        


In [None]:
# class_ is left at its default (2), which matches neither class filter, so
# every validation row is used. Files are written under ../Test2/Multi.
crate_test_set_multi(save_path = '../Test2/Multi', 
                    file_name = 'Multi_valid_norm')
