# Create simulated replication data for original HCC dataset

In [1]:
import pandas as pd
import numpy as np

# Local import so this cell stays self-contained.
from pathlib import Path

random_seed = 42
np.random.seed(seed=random_seed)

# Specify the path to your CSV dataset
csv_file = "/content/hcc-data_example.csv"

# Identifier column: it is tagged with a "_random" suffix instead of being resampled
column_to_exclude = 'InstanceID'

# Specify the proportion of instances to replace (between 0 and 1)
replace_proportion = 0.3

# Split the input path into its directory and its extension-less file name.
# pathlib handles inputs with no directory component or no extension, which
# manual rfind()-based slicing does not.
csv_path = Path(csv_file)
file_path = str(csv_path.parent)
file_name = csv_path.stem

# Print the extracted file path and file name
print("File Path:", file_path)
print("File Name:", file_name)

# Load the CSV dataset
data = pd.read_csv(csv_file)

# Select random instances (row positions) to replace, without repetition
replace_indices = np.random.choice(len(data), size=int(len(data) * replace_proportion), replace=False)

# Iterate over the selected indices and replace feature values
for index in replace_indices:
    # Snapshot of the current instance, taken before any of its values change
    instance_values = data.iloc[index, :]

    # Iterate over the features
    for feature in instance_values.index:
        if feature == column_to_exclude:
            # Keep the identifier recognisable but mark the row as randomized
            # NOTE(review): assigning a string into a numeric ID column relies on
            # pandas upcasting the column - confirm against the pandas version in use.
            data.at[index, feature] = str(instance_values[feature]) + "_random"
        else:
            # Values of this feature in every other row. Dropping the row from the
            # single column avoids copying the whole frame for each cell.
            feature_distribution = data[feature].drop(index)

            # Draw one observed value (NaN included) so the new value resembles the rest of the dataset
            new_value = np.random.choice(feature_distribution)

            # Assign the new feature value to the current instance
            data.at[index, feature] = new_value

# Save the updated dataset next to the input file, with a "_rep" suffix
output_file = str(csv_path.with_name(file_name + '_rep.csv'))
data.to_csv(output_file, index=False)
print(f"Updated dataset saved to {output_file}.")


File Path: /content
File Name: hcc-data_example
Updated dataset saved to /content/hcc-data_example_rep.csv.


# Create Custom HCC dataset for STREAMLINE Testing (with some text variables)
With randomly simulated instances and features to test specific data challenges that may be encountered.
* Instances with a missing class label
* Instances with a high percent of missingness
* Simulate numerically encoded categorical features (with 2, 3, or 4 state values)
* Simulate text-value categorical features (with 2, 3, or 4 state values)
* Simulate quantitative features with high missingness
* Simulate pairs of features with high correlations between them
  * Both positive and negative correlations

In [2]:
import pandas as pd
import numpy as np

# Fix the global NumPy seed so the simulated dataset is reproducible
random_seed = 42
np.random.seed(seed=random_seed)

# Name of the outcome column and of the instance identifier column
class_label = 'Class'
instance_ID_label = 'InstanceID'
# Columns dropped from the loaded dataset before anything is simulated
features_to_remove = ['Gender','Age at diagnosis']
# One 'Sim_Cat_<n>' column is added per entry, holding integer codes 1..n
cat_feature_list = [2,3,4]
# One 'Sim_Text_Cat_<n>' column is added per entry, holding "Category k" labels
cat_feature_list_text = [2,3,4]
# One 'Sim_Miss_<p>' column is added per entry; p is the per-value probability of being missing
miss_feature_list = [0.6,0.7]
# One 'Sim_Cor_<r>_A' / 'Sim_Cor_<r>_B' column pair is added per entry; r is the correlation passed to generate_correlated_values
corr_feature_list = [-1.0,0.9,1.0]
# Number of resampled instances appended with the class label set to NaN
num_nolabel_instances = 2
# One resampled instance is appended per entry; the entry is the fraction of columns picked to be blanked
miss_instance_list = [0.7,0.8]

# Specify the path to your existing CSV dataset
csv_file = "/content/hcc-data_example.csv"

def remove_features(data, features_to_remove):
    """Return a new DataFrame without the columns named in ``features_to_remove``.

    The input frame is left untouched; a missing column name raises ``KeyError``.
    """
    return data.drop(columns=features_to_remove)

def generate_categorical_feature(num_categories, num_rows):
    """Draw ``num_rows`` text labels uniformly from "Category 1" .. "Category <num_categories>".

    Uses the global NumPy random state, so results depend on ``np.random.seed``.
    """
    labels = ["Category " + str(k) for k in range(1, num_categories + 1)]
    return np.random.choice(labels, size=num_rows)

def generate_categorical_feature_numerical_encode(num_categories, num_rows):
    """Draw ``num_rows`` integer codes uniformly from 1 .. ``num_categories`` (inclusive).

    Uses the global NumPy random state, so results depend on ``np.random.seed``.
    """
    codes = np.arange(num_categories) + 1
    return np.random.choice(codes, size=num_rows)

def generate_quantitative_feature(num_rows, missing_percentage):
    """Return ``num_rows`` uniform [0, 1) values with entries randomly set to NaN.

    Each entry is independently blanked with probability ``missing_percentage``,
    so the realised share of missing values varies around that figure.
    Uses the global NumPy random state.
    """
    values = np.random.rand(num_rows)
    keep_probability = 1 - missing_percentage
    is_missing = np.random.choice(
        [False, True],
        size=num_rows,
        p=[keep_probability, missing_percentage],
    )
    return np.where(is_missing, np.nan, values)

def generate_correlated_values(feature1, correlation):
    """Generate a feature whose expected Pearson correlation with ``feature1`` is ``correlation``.

    The result is ``correlation * feature1`` plus Gaussian noise. The noise is
    scaled by the spread of ``feature1``: using a fixed ``sqrt(1 - r**2)``
    standard deviation only gives the requested correlation when ``feature1``
    has unit variance, which is not the case for e.g. uniform [0, 1) inputs.

    Parameters:
        feature1: 1-D array-like of numeric values.
        correlation: target correlation coefficient, in [-1, 1].

    Returns:
        Array-like of the same length as ``feature1``. With ``correlation`` of
        1.0 or -1.0 the result is exactly ``feature1`` or ``-feature1``.

    Raises:
        ValueError: if ``correlation`` lies outside [-1, 1].
    """
    if not -1.0 <= correlation <= 1.0:
        raise ValueError(f"correlation must be within [-1, 1], got {correlation}")

    num_values = len(feature1)
    # Standard deviation of the input, ignoring NaNs; empty input needs no scale
    spread = np.nanstd(np.asarray(feature1, dtype=float)) if num_values else 0.0

    # Unit-variance noise, rescaled so that corr(feature1, feature2) == correlation
    noise = np.random.normal(0, 1, num_values)
    feature2 = correlation * feature1 + np.sqrt(1 - correlation**2) * spread * noise
    return feature2

# Local import so this cell stays self-contained.
from pathlib import Path

# Split the input path into its directory and its extension-less file name.
# pathlib handles inputs with no directory component or no extension, which
# manual rfind()-based slicing does not.
csv_path = Path(csv_file)
file_path = str(csv_path.parent)
file_name = csv_path.stem

# Print the extracted file path and file name
print("File Path:", file_path)
print("File Name:", file_name)

# Load the existing CSV dataset
data = pd.read_csv(csv_file)

# Remove specified features from original dataset
data = remove_features(data, features_to_remove)

# Append resampled copies of existing instances whose class label is missing
for i in range(num_nolabel_instances):
    random_instance = data.sample(n=1, replace=True)
    random_instance[class_label] = np.nan
    random_instance[instance_ID_label] = 'no_class_'+str(i)
    data = pd.concat([data, random_instance], ignore_index=True)

# Append resampled copies of existing instances with a given share of columns blanked
for i, miss in enumerate(miss_instance_list):
    random_instance = data.sample(n=1, replace=True)
    num_features = len(data.columns)
    num_missing_values = int(num_features * miss)
    # NOTE(review): the draw is over all columns, so the ID and class columns can
    # be picked; both are overwritten below, so the realised missingness can be
    # slightly lower than `miss`.
    random_features = np.random.choice(data.columns, size=num_missing_values, replace=False)
    random_instance[random_features] = np.nan
    random_instance[instance_ID_label] = 'miss_'+str(i)+'_'+str(miss)
    # Randomly choose a class value of 0 or 1
    value = np.random.choice([0, 1])
    random_instance[class_label] = value
    data = pd.concat([data, random_instance], ignore_index=True)

# All simulated columns below span every row, including the instances appended above
num_rows = len(data)

# Simulate the numerically encoded categorical features and add them to the dataset
for cat in cat_feature_list:
    simulated_categorical_feature = generate_categorical_feature_numerical_encode(cat, num_rows)
    data['Sim_Cat_'+str(cat)] = simulated_categorical_feature

# Simulate the text-based categorical features and add them to the dataset
for cat in cat_feature_list_text:
    simulated_categorical_feature = generate_categorical_feature(cat, num_rows)
    data['Sim_Text_Cat_'+str(cat)] = simulated_categorical_feature

# Simulate the quantitative features with missing values and add them to the dataset
for miss in miss_feature_list:
    simulated_quant_feature = generate_quantitative_feature(num_rows, miss)
    data['Sim_Miss_'+str(miss)] = simulated_quant_feature

# Simulate the correlated variable pairs and add them to the dataset
for corr in corr_feature_list:
    # Generate a new random quantitative feature
    new_feature = np.random.rand(len(data))

    # Generate the correlated feature based on the new feature
    correlated_feature = generate_correlated_values(new_feature, corr)

    # Add the new features to the dataset
    data['Sim_Cor_'+str(corr)+'_A'] = new_feature
    data['Sim_Cor_'+str(corr)+'_B'] = correlated_feature

# Save the updated dataset next to the input file, with a "_custom" suffix
output_file = str(csv_path.with_name(file_name + '_custom.csv'))
data.to_csv(output_file, index=False)
print(f"Updated dataset saved to {output_file}.")

File Path: /content
File Name: hcc-data_example
Updated dataset saved to /content/hcc-data_example_custom.csv.


# Create replication dataset for the custom HCC dataset



In [3]:
import pandas as pd
import numpy as np

# Local import so this cell stays self-contained.
from pathlib import Path

random_seed = 42
np.random.seed(seed=random_seed)

# Specify the path to your CSV dataset
csv_file = "/content/hcc-data_example_custom.csv"

# Identifier column: it is tagged with a "_random" suffix instead of being resampled
column_to_exclude = 'InstanceID'

# Column names used when appending the novel-category instances below.
# Defined here so this cell runs without relying on names left over from another cell.
class_label = 'Class'
instance_ID_label = 'InstanceID'

# Number of made-up instances at the end of the custom dataset. These are not
# randomized, to preserve them as examples of data challenges.
num_sim_instances = 4

# Specify the proportion of instances to replace (between 0 and 1)
replace_proportion = 0.3

# Split the input path into its directory and its extension-less file name.
# pathlib handles inputs with no directory component or no extension, which
# manual rfind()-based slicing does not.
csv_path = Path(csv_file)
file_path = str(csv_path.parent)
file_name = csv_path.stem

# Print the extracted file path and file name
print("File Path:", file_path)
print("File Name:", file_name)

# Load the CSV dataset
data = pd.read_csv(csv_file)

# Select random instances to replace, drawn only from the rows before the
# trailing simulated instances. The requested count is capped at the number of
# eligible rows so a large proportion cannot make the draw fail.
num_eligible = max(len(data) - num_sim_instances, 0)
num_to_replace = min(int(len(data) * replace_proportion), num_eligible)
replace_indices = np.random.choice(num_eligible, size=num_to_replace, replace=False)

# Iterate over the selected indices and replace feature values
for index in replace_indices:
    # Snapshot of the current instance, taken before any of its values change
    instance_values = data.iloc[index, :]

    # Iterate over the features
    for feature in instance_values.index:
        if feature == column_to_exclude:
            # Keep the identifier recognisable but mark the row as randomized
            data.at[index, feature] = str(instance_values[feature]) + "_random"
        else:
            # Values of this feature in every other row. Dropping the row from the
            # single column avoids copying the whole frame for each cell.
            feature_distribution = data[feature].drop(index)

            # Draw one observed value (NaN included) so the new value resembles the rest of the dataset
            new_value = np.random.choice(feature_distribution)

            # Assign the new feature value to the current instance
            data.at[index, feature] = new_value

# Append one resampled instance per entry. Each carries a value for the named
# categorical column that generate_categorical_feature* never produce for it.
# Entries are (instance ID, categorical column, novel value).
novel_category_instances = [
    ('new_val_cat_text_binary', 'Sim_Text_Cat_2', 'Category 5'),
    ('new_val_cat_text_3', 'Sim_Text_Cat_3', 'Category 7'),
    ('new_val_cat_binary', 'Sim_Cat_2', 7),
    ('new_val_cat_3', 'Sim_Cat_3', 16),
]
for new_instance_id, categorical_column, novel_value in novel_category_instances:
    random_instance = data.sample(n=1, replace=True)
    # Randomly choose a class value of 0 or 1
    value = np.random.choice([0, 1])
    random_instance[class_label] = value
    random_instance[instance_ID_label] = new_instance_id
    random_instance[categorical_column] = novel_value  # assign new value
    data = pd.concat([data, random_instance], ignore_index=True)

# Save the updated dataset next to the input file, with a "_rep" suffix
output_file = str(csv_path.with_name(file_name + '_rep.csv'))
data.to_csv(output_file, index=False)
print(f"Updated dataset saved to {output_file}.")

File Path: /content
File Name: hcc-data_example_custom
Updated dataset saved to /content/hcc-data_example_custom_rep.csv.
