## Normalization Script

Author: Kristina Preuer

This script shows how the data was split and how the features were normalized. The data is then saved in a pickle file, which will be loaded during the cross-validation procedure.


In [77]:
import numpy as np
import pandas as pd
import pickle
import gzip

##### Define the parameters for data generation: folds for testing and validation and normalization strategy


In [78]:
# Data-generation settings.
# norm: normalization strategy applied to the features (tanh in this example).
# test_fold: fold held out for testing (fold 0).
# val_fold: fold held out for validation, i.e. hyperparameter selection (fold 1).
norm = "tanh"
test_fold, val_fold = 0, 1

#### Define normalization function

It normalizes the input data X. If X is used for training, the mean and the standard deviation are calculated during normalization. If X is used for validation or testing, the previously calculated mean and standard deviation of the training data should be used. If "tanh_norm" is used as the normalization strategy, the mean and standard deviation are calculated twice (once before and once after applying tanh). Features with a standard deviation of 0 are filtered out.


#### Load features and labels


In [79]:
# X contains the data in both feature orderings:
#   first half of the rows:  drug A - drug B - cell line
#   second half of the rows: drug B - drug A - cell line
# The context manager closes the archive even if unpickling raises.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
with gzip.open(
    "/hpc2hdd/home/mgong081/Projects/DeepSynergy/raw_data/X.p.gz", "rb"
) as file:
    X = pickle.load(file)

In [80]:
# labels holds the synergy values and the fold assignment (numbers 0-4);
# the first CSV column is used as the index.
# The labels are deliberately NOT duplicated for the second feature ordering
# (the former `pd.concat([labels, labels])` step stays disabled).
labels = pd.read_csv(
    filepath_or_buffer="/hpc2hdd/home/mgong081/Projects/DeepSynergy/raw_data/labels.csv",
    index_col=0,
)

#### Extract drug A, drug B and cell line features


In [81]:
import numpy as np

# Examine with the known data. Find the first non-identical column from the end of two matrices


# Split the rows of X at the midpoint:
#   X1 -> first half  (drug A - drug B - cell line ordering)
#   X2 -> second half (drug B - drug A - cell line ordering)
X1, X2 = X[: len(X) // 2], X[len(X) // 2 :]


def find_consecutive_identical_columns(X1, X2):
    """Return the trailing run of columns on which X1 and X2 agree.

    Columns are compared pairwise with ``np.allclose``, walking from the last
    column towards the first. The walk stops at the first column that differs;
    everything to the right of it is returned for both matrices.

    Parameters
    ----------
    X1, X2 : 2-D arrays compared column by column; the column count is taken
        from ``X1``.

    Returns
    -------
    tuple
        ``(X1[:, k:], X2[:, k:])`` where ``k`` is one past the right-most
        differing column. If no column differs, ``X1`` and ``X2`` themselves
        are returned unchanged.
    """
    for col in reversed(range(X1.shape[1])):
        if not np.allclose(X1[:, col], X2[:, col]):
            # Right-most mismatch found; the identical suffix starts after it.
            break
    else:
        # Loop ran to completion: every column matched.
        return X1, X2

    suffix_start = col + 1
    return X1[:, suffix_start:], X2[:, suffix_start:]


# Trailing columns shared by both feature orderings of the loaded data.
(
    identical_columns_X1,
    identical_columns_X2,
) = find_consecutive_identical_columns(X1=X1, X2=X2)

# Column layout of one row: [ drug A | drug B | cell line ].
# Both drug groups have the same width (three sub-groups of 1309, 802 and 2276).
drug_A_feature_count = drug_B_feature_count = sum((1309, 802, 2276))
cell_line_feature_count = 3984

# Slice boundaries: each group starts where the previous one ends.
drug_A_end = drug_B_start = drug_A_feature_count
drug_B_end = cell_line_start = drug_B_start + drug_B_feature_count

# # Extract features
# drug_A_features = X1[:, 0:drug_A_end]
# drug_B_features = X1[:, drug_B_start:drug_B_end]
# cell_line_features = X1[:, cell_line_start:]

#### Define indices for splitting


In [82]:
# Hyperparameter-selection split.
# Training rows: every fold that is neither the test nor the validation fold
# (folds 2, 3, 4 with the settings above).
idx_tr = np.where((labels["fold"] != test_fold) & (labels["fold"] != val_fold))
# Validation rows: the validation fold only (fold 1).
idx_val = np.where(labels["fold"] == val_fold)

In [83]:
# Model-testing split.
# Training rows: everything outside the test fold (folds 1, 2, 3, 4).
idx_train = np.where(labels["fold"].ne(test_fold))
# Test rows: the test fold only (fold 0).
idx_test = np.where(labels["fold"].eq(test_fold))

#### Split data


In [84]:
# Row-select the four splits from X1 (the first half of X).
X_tr, X_val = X1[idx_tr], X1[idx_val]
X_train, X_test = X1[idx_train], X1[idx_test]

In [85]:
# Synergy targets for each split, taken at the same row positions as the features.
y_tr = labels["synergy"].values[idx_tr]
y_val = labels["synergy"].values[idx_val]
y_train = labels["synergy"].values[idx_train]
y_test = labels["synergy"].values[idx_test]

In [86]:
# Cut every split into its three column groups: drug A, drug B and cell line.
X_tr_drug_A, X_tr_drug_B, X_tr_cell_line = (
    X_tr[:, :drug_A_end],
    X_tr[:, drug_B_start:drug_B_end],
    X_tr[:, cell_line_start:],
)

X_val_drug_A, X_val_drug_B, X_val_cell_line = (
    X_val[:, :drug_A_end],
    X_val[:, drug_B_start:drug_B_end],
    X_val[:, cell_line_start:],
)

X_train_drug_A, X_train_drug_B, X_train_cell_line = (
    X_train[:, :drug_A_end],
    X_train[:, drug_B_start:drug_B_end],
    X_train[:, cell_line_start:],
)

X_test_drug_A, X_test_drug_B, X_test_cell_line = (
    X_test[:, :drug_A_end],
    X_test[:, drug_B_start:drug_B_end],
    X_test[:, cell_line_start:],
)

#### Normalize training and validation data for hyperparameter selection


In [87]:
def normalize(
    X, means1=None, std1=None, means2=None, std2=None, feat_filt=None, norm="tanh_norm"
):
    """Standardize the columns of X, optionally followed by tanh squashing.

    Any statistic left as ``None`` is computed from ``X`` itself (use this for
    training data). To normalize validation or test data, pass the values
    returned for the training data.

    Parameters
    ----------
    X : 2-D array, rows are samples and columns are features.
    means1 : per-feature means of the *kept* columns (length = number of
        features selected by ``feat_filt``), or None to compute them.
    std1 : per-feature standard deviations of *all* columns of X (unfiltered
        length), or None to compute them with ``np.nanstd``.
    means2, std2 : statistics of the tanh-transformed data; only used when
        ``norm == "tanh_norm"``.
    feat_filt : boolean column mask; defaults to ``std1 != 0`` so that
        constant features are dropped.
    norm : one of ``"norm"`` (standardize), ``"tanh"`` (standardize, then
        tanh) or ``"tanh_norm"`` (standardize, tanh, standardize again).

    Returns
    -------
    tuple
        ``(X, means1, std1, feat_filt)`` for ``"norm"`` and ``"tanh"``;
        ``(X, means1, std1, means2, std2, feat_filt)`` for ``"tanh_norm"``.

    Raises
    ------
    ValueError
        If ``norm`` is not one of the supported strategies (previously the
        function silently returned ``None`` in that case).
    """
    if norm not in ("norm", "tanh", "tanh_norm"):
        raise ValueError(
            "unknown normalization strategy %r; expected 'norm', 'tanh' or 'tanh_norm'"
            % (norm,)
        )

    if std1 is None:
        std1 = np.nanstd(X, axis=0)
    if feat_filt is None:
        feat_filt = std1 != 0
    # Drop the filtered-out columns before computing the means, so means1 has
    # the filtered length while std1 keeps the full, unfiltered length.
    X = np.ascontiguousarray(X[:, feat_filt])
    if means1 is None:
        means1 = np.mean(X, axis=0)
    X = (X - means1) / std1[feat_filt]

    if norm == "norm":
        return (X, means1, std1, feat_filt)

    X = np.tanh(X)
    if norm == "tanh":
        return (X, means1, std1, feat_filt)

    # norm == "tanh_norm": standardize a second time after the tanh.
    if means2 is None:
        means2 = np.mean(X, axis=0)
    if std2 is None:
        std2 = np.std(X, axis=0)
    # Columns with std2 == 0 divide by zero here; they are overwritten with 0
    # on the next line, so the warnings carry no information.
    with np.errstate(divide="ignore", invalid="ignore"):
        X = (X - means2) / std2
    X[:, std2 == 0] = 0
    return (X, means1, std1, means2, std2, feat_filt)

#### Normalize


In [88]:
# Normalize drug A, drug B and cell-line features of every split.
#
# The statistics (mean, std) and the feature filter are fitted once on the
# hyperparameter-selection training split (X_tr) and reused for all other
# splits. The feature filter is passed by keyword: normalize()'s fourth
# positional parameter is `means2`, not `feat_filt`.
#
# NOTE: the 4-value unpacking below matches norm="norm" and norm="tanh";
# norm="tanh_norm" makes normalize() return six values and is not handled here.
# NOTE(review): X_train (folds 1-4) and X_test are normalized with statistics
# fitted on X_tr (folds 2-4) rather than on X_train itself - confirm this is
# intended.

# Tr (fit)
X_tr_drug_A, mean_drug_A, std_drug_A, feat_filt_drug_A = normalize(
    X_tr_drug_A, norm=norm
)
X_tr_drug_B, mean_drug_B, std_drug_B, feat_filt_drug_B = normalize(
    X_tr_drug_B, norm=norm
)
X_tr_cell_line, mean_cell_line, std_cell_line, feat_filt_cell_line = normalize(
    X_tr_cell_line, norm=norm
)

# Val (apply the fitted statistics; only the transformed matrix is kept)
X_val_drug_A = normalize(
    X_val_drug_A, mean_drug_A, std_drug_A, feat_filt=feat_filt_drug_A, norm=norm
)[0]
X_val_drug_B = normalize(
    X_val_drug_B, mean_drug_B, std_drug_B, feat_filt=feat_filt_drug_B, norm=norm
)[0]
X_val_cell_line = normalize(
    X_val_cell_line,
    mean_cell_line,
    std_cell_line,
    feat_filt=feat_filt_cell_line,
    norm=norm,
)[0]

# Train
X_train_drug_A = normalize(
    X_train_drug_A, mean_drug_A, std_drug_A, feat_filt=feat_filt_drug_A, norm=norm
)[0]
X_train_drug_B = normalize(
    X_train_drug_B, mean_drug_B, std_drug_B, feat_filt=feat_filt_drug_B, norm=norm
)[0]
X_train_cell_line = normalize(
    X_train_cell_line,
    mean_cell_line,
    std_cell_line,
    feat_filt=feat_filt_cell_line,
    norm=norm,
)[0]

# Test
X_test_drug_A = normalize(
    X_test_drug_A, mean_drug_A, std_drug_A, feat_filt=feat_filt_drug_A, norm=norm
)[0]
X_test_drug_B = normalize(
    X_test_drug_B, mean_drug_B, std_drug_B, feat_filt=feat_filt_drug_B, norm=norm
)[0]
X_test_cell_line = normalize(
    X_test_cell_line,
    mean_cell_line,
    std_cell_line,
    feat_filt=feat_filt_cell_line,
    norm=norm,
)[0]

In [89]:
# Shapes of the three normalized feature groups of the X_tr split.
print(*(part.shape for part in (X_tr_drug_A, X_tr_drug_B, X_tr_cell_line)))
# Total number of feature columns kept across the three validation groups.
sum(part.shape[1] for part in (X_val_drug_A, X_val_drug_B, X_val_cell_line))

(13884, 2353) (13884, 2334) (13884, 3984)


8671

In [90]:
# Save all normalized feature groups and targets as one pickled tuple.
# The file name encodes the test fold and the normalization strategy.
# The context manager flushes and closes the file even if pickling fails.
with open("data/3mlp_data_test_fold%d_%s.p" % (test_fold, norm), "wb") as out_file:
    pickle.dump(
        (
            X_tr_drug_A,
            X_tr_drug_B,
            X_tr_cell_line,
            X_val_drug_A,
            X_val_drug_B,
            X_val_cell_line,
            X_train_drug_A,
            X_train_drug_B,
            X_train_cell_line,
            X_test_drug_A,
            X_test_drug_B,
            X_test_cell_line,
            y_tr,
            y_val,
            y_train,
            y_test,
        ),
        out_file,
    )