In [1]:
import pathlib
import pandas as pd
import numpy as np

from joblib import load

# Seeded generator so the random shuffling below is repeatable on a fresh run
SEED = 0
rng = np.random.default_rng(SEED)

In [2]:
def shuffle_data(df, random_state=None):
    """
    Shuffle the feature columns of the input dataframe independently while keeping metadata columns unchanged.
    Columns with 'Metadata' prefix are considered metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe containing both features and metadata. It is not modified.
    random_state : None, int, or numpy.random.Generator, optional
        Source of randomness. When None (the default) the notebook-level `rng` is used,
        so consecutive calls continue the same random stream. Any other value is handed
        to `np.random.default_rng`, which accepts a seed or an existing Generator.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` in which every non-metadata column has been permuted on its own.
    """
    # Fall back to the notebook-level generator only when the caller supplies nothing
    generator = rng if random_state is None else np.random.default_rng(random_state)

    feature_columns = [col for col in df.columns if not col.startswith('Metadata')]

    shuffled_df = df.copy()

    for column in feature_columns:
        # One permutation per column: values stay within their column but move between rows
        shuffled_df[column] = generator.permutation(shuffled_df[column])

    return shuffled_df

In [3]:
# Locations of the serialized label encoder and the trained NF1 model
le_path = pathlib.Path("./data/trained_nf1_model_label_encoder.joblib")
model_path = pathlib.Path("./data/trained_nf1_model.joblib")

# Deserialize the encoder first, then the model.
# NOTE(review): joblib.load unpickles its input, so only trusted artifact files belong here.
le, model = load(le_path), load(model_path)


In [4]:
# Location of the parquet file holding the model data
model_data_path = pathlib.Path("./model_data.parquet")
model_df = pd.read_parquet(model_data_path)

# Every column whose name contains "Metadata" is carried along as metadata
meta_cols = model_df.filter(like="Metadata").columns

print(model_df.shape)
model_df.head()

(21370, 870)


Unnamed: 0,Nuclei_RadialDistribution_ZernikeMagnitude_GFP_8_6,Nuclei_AreaShape_Zernike_7_1,Cells_RadialDistribution_ZernikeMagnitude_RFP_6_0,Nuclei_RadialDistribution_ZernikeMagnitude_DAPI_5_5,Nuclei_RadialDistribution_ZernikePhase_GFP_9_1,Nuclei_Texture_Correlation_GFP_3_01_256,Cytoplasm_RadialDistribution_ZernikeMagnitude_CY5_3_3,Nuclei_Texture_Correlation_DAPI_3_02_256,Cells_Intensity_MADIntensity_DAPI,Cytoplasm_RadialDistribution_MeanFrac_RFP_3of4,...,Nuclei_RadialDistribution_FracAtD_CY5_4of4,Cells_RadialDistribution_ZernikePhase_DAPI_9_5,Cytoplasm_AreaShape_Zernike_8_4,Cells_RadialDistribution_FracAtD_DAPI_3of4,Cytoplasm_Correlation_RWC_DAPI_CY5,Nuclei_RadialDistribution_ZernikeMagnitude_CY5_9_7,Cytoplasm_RadialDistribution_ZernikePhase_CY5_7_1,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Cytoplasm_AreaShape_Zernike_4_0,Metadata_datasplit
0,-1.063849,1.479502,-0.479305,-0.093464,1.440293,0.878041,0.518878,-1.007931,-0.460367,0.843777,...,0.213263,0.230531,-0.84532,-2.026702,0.270841,-0.762607,-0.16067,-2.915689,-1.243646,rest
1,-0.969352,-0.754828,0.31779,0.312964,-1.542194,1.401663,0.797627,0.128219,1.412492,0.187499,...,-2.09761,-1.229214,0.379174,0.464392,0.734386,0.456039,-1.063727,0.016138,-0.626757,rest
2,-0.083428,1.610322,0.319736,-0.172855,-0.288915,0.159513,-0.024944,-1.034824,-0.103696,0.787885,...,-1.104407,-0.961465,-1.543201,0.880911,0.873302,0.038485,1.412895,1.096584,1.12884,rest
3,1.919927,-0.85312,3.963571,-1.031434,-1.399657,-0.282898,-0.164698,0.098882,-0.303052,0.472728,...,0.494178,0.892562,0.872675,-0.04579,-0.935511,0.065476,1.319694,0.932735,0.099949,rest
4,0.184429,0.403966,0.313153,0.054143,-0.898072,-0.609744,-0.309397,0.20353,-0.14845,-1.693646,...,-0.01652,0.024378,0.824451,0.109087,0.035524,-0.197065,0.360104,-0.753957,0.245045,rest


In [5]:
# Encode the true genotypes with the already-fitted label encoder.
# transform (rather than fit_transform) leaves the loaded encoder untouched, so the
# class-to-integer mapping cannot drift from the one stored alongside the model.
true_genotypes = le.transform(model_df["Metadata_genotype"])

# Select the model's input features once, in the order recorded on the model
model_features = model_df[model.feature_names_in_]

probabilitydf = pd.DataFrame(
    {
        # Column 1 of predict_proba, named after the class the encoder maps to integer 1
        # NOTE(review): assumes model.classes_[1] == 1 — confirm against the training code
        f"probability_{le.inverse_transform([1])[0]}": model.predict_proba(model_features)[:, 1],
        "predicted_genotype": model.predict(model_features),
        "true_genotype": true_genotypes,
    }
)

# Attach the metadata columns; dropping the index lines rows up positionally with the predictions
probabilitydf = pd.concat([probabilitydf, model_df[meta_cols].reset_index(drop=True)], axis=1)

# Rename 'Metadata_datasplit' to 'datasplit'
probabilitydf = probabilitydf.rename(columns={'Metadata_datasplit': 'datasplit'})

# Move 'datasplit' to the start of the DataFrame
cols = ['datasplit'] + [col for col in probabilitydf if col != 'datasplit']
probabilitydf = probabilitydf[cols]

print(probabilitydf.shape)
probabilitydf.head()

(21370, 21)


Unnamed: 0,datasplit,probability_WT,predicted_genotype,true_genotype,Metadata_Cells_Number_Object_Number,Metadata_ImageNumber,Metadata_Cells_Location_Center_Y,Metadata_Nuclei_Number_Object_Number,Metadata_WellRow,Metadata_Cytoplasm_Parent_Nuclei,...,Metadata_number_of_singlecells,Metadata_WellCol,Metadata_Plate,Metadata_Nuclei_Location_Center_Y,Metadata_Well,Metadata_Cells_Location_Center_X,Metadata_Nuclei_Location_Center_X,Metadata_Site,Metadata_gene_name,Metadata_Cytoplasm_Parent_Cells
0,rest,0.127298,0,0,2.0,1177.0,317.145584,3.0,F,3.0,...,149,9,Plate_5,356.292837,F9,875.270222,902.972299,14,NF1,2.0
1,rest,0.086884,0,0,5.0,32.0,318.523005,6.0,B,6.0,...,148,11,Plate_5,311.350248,B11,606.513594,615.165938,1,NF1,5.0
2,rest,0.634054,1,0,1.0,290.0,564.884298,1.0,C,1.0,...,119,12,Plate_5,556.116579,C12,345.700749,394.21324,17,NF1,1.0
3,rest,0.003741,0,0,9.0,1016.0,713.809386,9.0,F,9.0,...,125,12,Plate_5,700.850537,F12,276.998649,288.7141,6,NF1,9.0
4,rest,0.161921,0,0,7.0,534.0,551.984447,9.0,D,9.0,...,106,12,Plate_5,582.018333,D12,924.846837,947.446667,16,NF1,7.0


In [6]:
# List the distinct plate identifiers present in the prediction table
probabilitydf["Metadata_Plate"].unique()

array(['Plate_5', 'Plate_3', 'Plate_3_prime', 'Plate_4'], dtype=object)

## Shuffle the data and apply the model

In [7]:
# Shuffle the feature columns, then tag every row's datasplit label with a 'shuffled_' prefix
model_shuffled_df = shuffle_data(model_df).assign(
    Metadata_datasplit=lambda frame: 'shuffled_' + frame['Metadata_datasplit'].astype(str)
)

# Metadata column names of the shuffled frame
meta_cols = model_shuffled_df.filter(like="Metadata").columns

print(model_shuffled_df.shape)
model_shuffled_df.head()

(21370, 870)


Unnamed: 0,Nuclei_RadialDistribution_ZernikeMagnitude_GFP_8_6,Nuclei_AreaShape_Zernike_7_1,Cells_RadialDistribution_ZernikeMagnitude_RFP_6_0,Nuclei_RadialDistribution_ZernikeMagnitude_DAPI_5_5,Nuclei_RadialDistribution_ZernikePhase_GFP_9_1,Nuclei_Texture_Correlation_GFP_3_01_256,Cytoplasm_RadialDistribution_ZernikeMagnitude_CY5_3_3,Nuclei_Texture_Correlation_DAPI_3_02_256,Cells_Intensity_MADIntensity_DAPI,Cytoplasm_RadialDistribution_MeanFrac_RFP_3of4,...,Nuclei_RadialDistribution_FracAtD_CY5_4of4,Cells_RadialDistribution_ZernikePhase_DAPI_9_5,Cytoplasm_AreaShape_Zernike_8_4,Cells_RadialDistribution_FracAtD_DAPI_3of4,Cytoplasm_Correlation_RWC_DAPI_CY5,Nuclei_RadialDistribution_ZernikeMagnitude_CY5_9_7,Cytoplasm_RadialDistribution_ZernikePhase_CY5_7_1,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Cytoplasm_AreaShape_Zernike_4_0,Metadata_datasplit
0,-0.725787,-0.051563,0.077541,0.38828,0.937687,0.692581,-0.189219,-0.541059,-0.438668,0.175209,...,0.714301,-0.565112,0.028403,0.32353,0.009417,2.637656,0.314585,-0.800461,4.510781,shuffled_rest
1,-0.030691,-1.252319,0.918384,-0.486414,-1.248994,1.091782,-0.789636,1.09082,-0.161649,0.235242,...,-0.440515,-1.148477,-0.428054,-0.313825,0.577278,-0.730832,-0.925885,-0.018684,-0.439678,shuffled_rest
2,-0.411171,2.454155,-0.317497,6.920159,-0.195333,0.080899,-0.420834,-1.126813,-0.229401,0.445625,...,0.058922,-1.537658,1.752856,-0.273968,-0.015247,-0.499259,1.441778,0.181854,0.819933,shuffled_rest
3,-0.50258,1.147165,0.140886,0.76743,1.542222,-1.484568,-0.207952,-2.55836,-0.136,-0.559237,...,-0.02933,-1.496945,0.316757,-2.938778,-1.145859,-0.196487,0.62501,0.891056,-1.14355,shuffled_rest
4,0.521748,0.413192,1.381932,0.349153,-0.284557,0.43864,-0.678318,-1.243163,-0.248275,0.548753,...,-0.937562,1.358233,0.70031,-1.04063,-0.95812,-0.29388,-0.909364,-0.130199,-0.954818,shuffled_rest


In [8]:
# Decode the true genotypes using the encoder
true_genotypes = le.fit_transform(model_shuffled_df["Metadata_genotype"])

shuffled_probabilitydf = pd.DataFrame(
    {
        f"probability_{le.inverse_transform([1])[0]}": model.predict_proba(model_shuffled_df[model.feature_names_in_])[:, 1],
        "predicted_genotype": model.predict(model_shuffled_df[model.feature_names_in_]),
        "true_genotype": true_genotypes
    }
)

shuffled_probabilitydf = pd.concat([shuffled_probabilitydf, model_shuffled_df[meta_cols].reset_index(drop=True)], axis=1)

# Rename 'Metadata_datasplit' to 'datasplit'
shuffled_probabilitydf.rename(columns={'Metadata_datasplit': 'datasplit'}, inplace=True)

# Move 'datasplit' to the start of the DataFrame
cols = ['datasplit'] + [col for col in shuffled_probabilitydf if col != 'datasplit']
shuffled_probabilitydf = shuffled_probabilitydf[cols]

print(shuffled_probabilitydf.shape)
shuffled_probabilitydf.head()

(21370, 21)


Unnamed: 0,datasplit,probability_WT,predicted_genotype,true_genotype,Metadata_Cells_Number_Object_Number,Metadata_ImageNumber,Metadata_Cells_Location_Center_Y,Metadata_Nuclei_Number_Object_Number,Metadata_WellRow,Metadata_Cytoplasm_Parent_Nuclei,...,Metadata_number_of_singlecells,Metadata_WellCol,Metadata_Plate,Metadata_Nuclei_Location_Center_Y,Metadata_Well,Metadata_Cells_Location_Center_X,Metadata_Nuclei_Location_Center_X,Metadata_Site,Metadata_gene_name,Metadata_Cytoplasm_Parent_Cells
0,shuffled_rest,0.865907,1,0,2.0,1177.0,317.145584,3.0,F,3.0,...,149,9,Plate_5,356.292837,F9,875.270222,902.972299,14,NF1,2.0
1,shuffled_rest,0.006202,0,0,5.0,32.0,318.523005,6.0,B,6.0,...,148,11,Plate_5,311.350248,B11,606.513594,615.165938,1,NF1,5.0
2,shuffled_rest,0.30334,0,0,1.0,290.0,564.884298,1.0,C,1.0,...,119,12,Plate_5,556.116579,C12,345.700749,394.21324,17,NF1,1.0
3,shuffled_rest,0.039801,0,0,9.0,1016.0,713.809386,9.0,F,9.0,...,125,12,Plate_5,700.850537,F12,276.998649,288.7141,6,NF1,9.0
4,shuffled_rest,0.057411,0,0,7.0,534.0,551.984447,9.0,D,9.0,...,106,12,Plate_5,582.018333,D12,924.846837,947.446667,16,NF1,7.0


In [9]:
# Stack the shuffled predictions on top of the unshuffled ones and renumber the rows
combined_df = pd.concat([shuffled_probabilitydf, probabilitydf], axis=0, ignore_index=True)

# Write the combined table to a Parquet file
combined_df.to_parquet("./data/nf1_eval_data.parquet")

combined_df

Unnamed: 0,datasplit,probability_WT,predicted_genotype,true_genotype,Metadata_Cells_Number_Object_Number,Metadata_ImageNumber,Metadata_Cells_Location_Center_Y,Metadata_Nuclei_Number_Object_Number,Metadata_WellRow,Metadata_Cytoplasm_Parent_Nuclei,...,Metadata_number_of_singlecells,Metadata_WellCol,Metadata_Plate,Metadata_Nuclei_Location_Center_Y,Metadata_Well,Metadata_Cells_Location_Center_X,Metadata_Nuclei_Location_Center_X,Metadata_Site,Metadata_gene_name,Metadata_Cytoplasm_Parent_Cells
0,shuffled_rest,0.865907,1,0,2.0,1177.0,317.145584,3.0,F,3.0,...,149,9,Plate_5,356.292837,F9,875.270222,902.972299,14,NF1,2.0
1,shuffled_rest,0.006202,0,0,5.0,32.0,318.523005,6.0,B,6.0,...,148,11,Plate_5,311.350248,B11,606.513594,615.165938,1,NF1,5.0
2,shuffled_rest,0.303340,0,0,1.0,290.0,564.884298,1.0,C,1.0,...,119,12,Plate_5,556.116579,C12,345.700749,394.213240,17,NF1,1.0
3,shuffled_rest,0.039801,0,0,9.0,1016.0,713.809386,9.0,F,9.0,...,125,12,Plate_5,700.850537,F12,276.998649,288.714100,6,NF1,9.0
4,shuffled_rest,0.057411,0,0,7.0,534.0,551.984447,9.0,D,9.0,...,106,12,Plate_5,582.018333,D12,924.846837,947.446667,16,NF1,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42735,test,0.544581,1,1,3.0,288.0,358.720822,4.0,C,4.0,...,95,11,Plate_4,376.879008,C11,708.548012,699.588793,9,NF1,3.0
42736,test,0.975218,1,1,3.0,270.0,669.784779,5.0,C,5.0,...,95,11,Plate_4,647.613644,C11,160.533679,137.917724,16,NF1,3.0
42737,test,0.347940,0,1,3.0,265.0,251.700852,5.0,C,5.0,...,95,11,Plate_4,221.622005,C11,563.757778,543.670216,11,NF1,3.0
42738,test,0.957261,1,1,2.0,277.0,226.483731,2.0,C,2.0,...,95,11,Plate_4,279.212354,C11,941.952479,898.655336,22,NF1,2.0
