In [1]:
import csv
import itertools
import multiprocessing
import os
import sys
import time

import numpy as np

import optimiser

sys.path.append('../moma')
import moma_data_gateway


In [2]:
# Folder with the MOMA input files, relative to this notebook.
DATA_DIR = "../../../data/models/moma/"

# The three input files handed to the data gateway.
testing_data_indices_file_path = DATA_DIR + "indices_for_testing_data.csv"
complete_data_file_path = DATA_DIR + "complete_dataset.RDS"
gene_expression_data_file_path = DATA_DIR + "gene_expression_dataset.RDS"

# The gateway returns one object bundling every array used below.
# NOTE(review): presumably the indices file determines the train/test split -
# confirm in moma_data_gateway.
preprocessed_data: moma_data_gateway.MomaDataclass = (
    moma_data_gateway.get_preprocessed_data(
        testing_data_indices_file_path=testing_data_indices_file_path,
        complete_data_file_path=complete_data_file_path,
        gene_expression_data_file_path=gene_expression_data_file_path,
    )
)

# Unpack the bundle into notebook-level names: a (training, testing) pair for
# the gene expression features, the flux features and the targets.
scaled_gene_expression_training_data, scaled_gene_expression_testing_data = (
    preprocessed_data.scaled_gene_expression_training_data,
    preprocessed_data.scaled_gene_expression_testing_data,
)
scaled_flux_training_data, scaled_flux_testing_data = (
    preprocessed_data.scaled_flux_training_data,
    preprocessed_data.scaled_flux_testing_data,
)
target_training_data, target_testing_data = (
    preprocessed_data.target_training_data,
    preprocessed_data.target_testing_data,
)

Success in loading complete data
Shape of the complete data: (1143, 9666)
Success in loading gene expression data
Shape of the gene expression data: (1143, 6171)


In [3]:
# Design matrices: the gene expression columns come first, followed by the
# flux columns.
X_train = np.concatenate(
    [scaled_gene_expression_training_data, scaled_flux_training_data], axis=1
)
X_test = np.concatenate(
    [scaled_gene_expression_testing_data, scaled_flux_testing_data], axis=1
)

Y_train: np.ndarray = target_training_data
Y_test: np.ndarray = target_testing_data

# Index of the last column of each feature group (column count minus one).
flux_dimensions_index = scaled_flux_training_data.shape[1] - 1  # 1847
gene_dimensions_index = scaled_gene_expression_training_data.shape[1] - 1  # 6169

# Index lists for the two feature groups, passed on to the optimiser.
# NOTE(review): the flux list covers positions 0..flux_dimensions_index, but in
# X_train those positions hold gene expression columns (gene expression is
# concatenated first). The gene expression list starts right after the flux
# list and therefore has gene_dimensions_index - flux_dimensions_index entries
# rather than one per gene expression column. Confirm against
# optimiser.optimise how these lists are used.
indices_fluxes = list(np.arange(flux_dimensions_index + 1))
indices_gene_expression = list(
    np.arange(flux_dimensions_index + 1, gene_dimensions_index + 1)
)

In [4]:
# Hyperparameter candidates for the tuning run: regularisation strengths
# (lambda) and intercepts, each sampled on a logarithmic scale.

# 12 lambda candidates between 10**-0.6 and 10**0.7.
lambda_values: np.ndarray = np.logspace(start=-0.6, stop=0.7, num=12)

# 12 intercept candidates between 10**-1.6 and 10**0.04.
intercept_values: np.ndarray = np.logspace(start=-1.6, stop=0.04, num=12)

# Every repetition factor below is currently 1, so no full grid is built: each
# of the three arrays is a 12-element copy of its candidate vector.
lambdas_gene_expression: np.ndarray = lambda_values.repeat(1 * 1)
print(lambdas_gene_expression.shape)

# Repeat each lambda once, then tile the whole sequence once.
lambdas_fluxes: np.ndarray = np.tile(lambda_values.repeat(1), 1)
print(lambdas_fluxes.shape)

# Tile the intercept candidates 1 ** 2 = 1 time.
intercepts: np.ndarray = np.tile(intercept_values, 1 ** 2)
print(intercepts.shape)

(12,)
(12,)
(12,)


In [5]:
# No hyperparameter search is run here: a single (lambda, intercept) pair is
# picked from the same logarithmic candidate ranges and used for training.
# Both picks are arbitrary.

# Regularisation strength: candidate number 7 of 12.
lambda_value = np.logspace(start=-0.6, stop=0.7, num=12)[7]

# Intercept: candidate number 4 of 12.
intercept_value = np.logspace(start=-1.6, stop=0.04, num=12)[4]

In [10]:
if __name__ == "__main__":
    # Run `optimiser.optimise` once per repeat in a worker pool and store the
    # returned result dictionaries in a CSV file.

    start_time = time.time()
    print("Starting process...")

    results = []
    repeats = 1

    # Make sure the output folder exists *before* the optimisation starts, so
    # a missing directory cannot discard the results of a long run.
    data_lasso_dir = "../../../data/models/lasso/"
    os.makedirs(data_lasso_dir, exist_ok=True)

    # There are only `repeats` tasks, so more workers than tasks would just
    # add process start-up cost. Keep at least one worker.
    n_workers = max(1, min(repeats, multiprocessing.cpu_count()))

    # One argument tuple per repeat; everything except the process id is the
    # same for every task.
    task_arguments = zip(
        range(repeats),  # process id
        itertools.repeat(lambda_value),  # presumably the gene expression lambda - TODO confirm
        itertools.repeat(lambda_value),  # presumably the flux lambda - TODO confirm
        itertools.repeat(intercept_value),
        itertools.repeat(X_train),
        itertools.repeat(Y_train),
        itertools.repeat(X_test),
        itertools.repeat(Y_test),
        itertools.repeat(flux_dimensions_index),
        itertools.repeat(gene_dimensions_index),
        itertools.repeat(indices_fluxes),
        itertools.repeat(indices_gene_expression),
    )

    with multiprocessing.Pool(n_workers) as pool:
        result = pool.starmap(optimiser.optimise, task_arguments)

    # `results` is a list of batches; later cells read `results[0][0]`.
    results.append(result)

    # Save the results to a CSV file. The results are a list of dictionaries;
    # the keys of the first dictionary become the column headers.
    with open(data_lasso_dir + "results_lasso.csv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=results[0][0].keys())
        writer.writeheader()
        writer.writerows(results[0])

    print(
        "Finished. Total time required (in minutes): ", (time.time() - start_time) / 60
    )

Starting process...
Process: 0, Lambda GE: 1.6876124757881474, Lambda MF: 1.6876124757881474, Intercept: 0.09916619195386764




Set parameter WLSAccessID
Set parameter WLSSecret
Set parameter LicenseID to value 2492909
Academic license 2492909 - for non-commercial use only - registered to pg___@ic.ac.uk
Process: 0, Total time in minutes: 0.9369114160537719, Train error:0.0014220422035324171, Test error: 0.010653044147426445, Train R^2: 0.9629473013171864, Test R^2: 0.8034388914091527
Beta GE:  0 6.714821423650076e-12
Beta GE:  1 -2.267977969265375e-12
Beta GE:  2 8.467813154626841e-12
Beta GE:  3 8.271400993201676e-12
Beta GE:  4 1.2130251011512549e-09
Beta GE:  5 5.920412619119164e-11
Beta GE:  6 1.473138679793405e-11
Beta GE:  7 1.1091794072487311e-11
Beta GE:  8 -5.341737478333473e-12
Beta GE:  9 -1.774913070697228e-11
Beta GE:  10 -1.222346163974584e-11
Beta GE:  11 -1.8533017490873555e-11
Beta GE:  12 -2.6083803337612006e-12
Beta GE:  13 -1.9899802252634319e-10
Beta GE:  14 -1.2949844793683347e-12
Beta GE:  15 -1.7724545897332081e-12
Beta GE:  16 2.902536859993308e-11
Beta GE:  17 -1.2359838647007473e-11
B

In [7]:
# Assumes `results` holds exactly one batch with one solution (a single
# parameter set); the coefficient vectors are read from that solution.
# NOTE(review): the printed lengths (1848 and 6170) equal the flux and gene
# expression column counts computed earlier (flux_dimensions_index + 1 and
# gene_dimensions_index + 1). Confirm in the optimiser that the "beta_ge" and
# "beta_mf" entries are not swapped.
beta_ge, beta_mf = (results[0][0][key] for key in ("beta_ge", "beta_mf"))

print("len(beta_ge): ", len(beta_ge))  # 1848
print("len(beta_mf): ", len(beta_mf))  # 6170

len(beta_ge):  1848
len(beta_mf):  6170


In [8]:
# Write each coefficient vector to its own CSV file as a single column
# (one value per row): beta_ge first, then beta_mf.
for file_name, coefficients in (
    ("gene_expression_coefficients.csv", beta_ge),
    ("flux_coefficients.csv", beta_mf),
):
    with open(data_lasso_dir + file_name, "w", newline="") as f:
        csv.writer(f).writerows([val] for val in coefficients)

In [9]:
# The coefficient vectors `beta_ge` and `beta_mf` were read from the optimiser results above.
# `intercept_value` is the intercept that was chosen before training.

# Function to make predictions with a linear model
def predict(
    X_data: np.ndarray,
    beta_gene_expression: list[float],
    beta_fluxes: list[float],
    intercept: float,
) -> np.ndarray:
    """Make predictions with a linear model.

    The two coefficient groups are flattened and joined into one vector
    (gene expression coefficients first, flux coefficients second), and the
    prediction is ``X_data.dot(beta) + intercept``.

    Parameters
    ----------
    X_data : np.ndarray
        Data matrix. Its last dimension must have one entry per coefficient,
        ordered like the joined coefficient vector.
    beta_gene_expression : list[float]
        Coefficients for gene expression data. Any array-like is accepted;
        column or row vectors are flattened.
    beta_fluxes : list[float]
        Coefficients for flux data. Any array-like is accepted; column or row
        vectors are flattened.
    intercept : float
        Intercept value added to every prediction.

    Returns
    -------
    np.ndarray
        Predictions.

    Raises
    ------
    ValueError
        If the number of columns of ``X_data`` differs from the total number
        of coefficients.
    """
    # Flatten both groups so that (n,), (n, 1) and (1, n) inputs all work.
    beta = np.concatenate(
        [np.ravel(beta_gene_expression), np.ravel(beta_fluxes)]
    )

    # Fail with an explicit message instead of a generic shape error.
    n_features = X_data.shape[-1]
    if n_features != beta.shape[0]:
        raise ValueError(
            f"X_data has {n_features} columns but {beta.shape[0]} coefficients "
            "were supplied"
        )

    # Compute predictions: X_data.dot(beta) + intercept
    predictions = X_data.dot(beta) + intercept
    return predictions

# Predict the targets of the test set with the coefficients extracted above.
# NOTE(review): X_test stores the gene expression columns first and the flux
# columns second, while beta_ge / beta_mf were reported with 1848 and 6170
# entries. Confirm that the coefficient order matches the column order.
# NOTE(review): the intercept passed here is the value chosen before training
# (`intercept_value`) - confirm that this is the intercept the fitted model uses.
predictions = predict(
    X_data=X_test,
    beta_gene_expression=beta_ge,
    beta_fluxes=beta_mf,
    intercept=intercept_value,
)

print("Predictions:", predictions)

Predictions: [ 5.20502095e-02  2.65753284e-02  5.23591454e-02  7.95331628e-02
 -1.16452325e-03  2.44080915e-01  1.46724273e-01  1.17459794e-01
  5.20468477e-02 -3.62047878e-02  5.27491442e-02  1.65531823e-01
  4.51904792e-02  2.46580635e-02  3.52625471e-02  3.77199287e-02
 -9.61500382e-03  8.57721263e-02  5.14905977e-02 -1.81696741e-02
  5.14908532e-02  1.54713633e-01  8.20230401e-01  9.33846215e-02
 -2.49900537e-02  1.85152674e-03 -1.06276480e-02  6.66197042e-02
 -6.41543350e-03  3.49125963e-02  9.65627208e-01 -4.81981533e-02
  1.21016821e-02 -8.47138409e-02 -4.45064164e-02  4.61462204e-01
  8.25136574e-01  4.78369945e-01 -3.02747533e-03  2.11782759e-01
  5.56194087e-02 -7.49951018e-03 -7.38139894e-03  3.47553720e-02
  1.46869481e-01  1.08874005e-02  3.19530819e-02 -1.89071506e-02
  1.06660607e-01  3.32507645e-01  4.01327759e-02 -2.93273590e-02
  2.13753433e-02  4.06401439e-02  4.62159747e-01  3.94018936e-02
  4.76601798e-01  6.65881878e-03  5.27232284e-02  3.73422938e-02
  9.73887782