# Normalising the Data
In this notebook we perform differential scaling: each feature is scaled according to its type. Some features are left unscaled (the dimension-reduced versions of features), most are standard-scaled over the whole dataset, and the intensity features are standard-scaled on a per-plate basis.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import organoid_prediction_python as opp

# Project root on the analysis machine. This is an absolute, machine-specific
# path, so it has to be adjusted before running the notebook anywhere else.
home_directory = r"C:\Users\savill\OneDrive\Documents\PhD Jesse\Embryonic_organoid_prediction\Processing dataset for Paper\TLS_2D_morphostate_investigation/"
# Folder that holds the tabular feature files read and written by this notebook.
table_location = home_directory + "tabular_data"

# Load the combined feature table (all timepoints side by side) and index each
# row by its sample identifiers.
# NOTE(review): the displayed table contains an "Unnamed: 0" column, presumably
# a row index written by an earlier to_csv call - confirm whether it should be
# dropped before scaling.
data_complete = pd.read_csv(
    filepath_or_buffer=f"{table_location}/All_Samples_Feature_Space_All_Features_48h,72h,96h.csv"
)
data_complete = data_complete.set_index(keys=["Run", "Plate", "ID"])
data_complete

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 0,048h_raw_BF_AreaShape_area,048h_raw_BF_AreaShape_aspect_ratio,048h_raw_BF_AreaShape_axis_major_length,048h_raw_BF_AreaShape_axis_minor_length,048h_raw_BF_AreaShape_eccentricity,048h_raw_BF_AreaShape_feret_diameter_max,048h_raw_BF_AreaShape_perimeter,048h_raw_BF_AreaShape_solidity,048h_str_BF_AreaShape_area,...,096h_str_BF_AreaShape_moments_hu_1,096h_str_BF_AreaShape_moments_hu_2,096h_str_BF_AreaShape_moments_hu_3,096h_str_BF_AreaShape_moments_hu_4,096h_str_BF_AreaShape_moments_hu_5,096h_str_BF_AreaShape_moments_hu_6,096h_str_BF_AreaShape_orientation,096h_str_BF_AreaShape_perimeter,096h_str_CH_AreaShape_Bra_MajorAxis_Polarisation,096h_str_CH_AreaShape_Bra_MinorAxis_Polarisation
Run,Plate,ID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
RR,1,A1,0,51429.057934,1.036110,260.694837,251.609209,0.261703,266.724612,848.528995,0.985088,51519.761740,...,0.000012,0.000019,4.824832e-09,-1.412213e-15,-1.643041e-11,-3.074749e-16,0.052614,1338.362215,0.027743,0.018767
RR,1,A2,1,46107.767989,1.018480,244.753205,240.312256,0.189631,252.890770,807.094108,0.984030,46032.181484,...,0.000523,0.000235,1.227808e-06,2.059185e-12,8.695410e-10,2.075727e-11,-0.311135,1265.903481,0.013417,0.031259
RR,1,A3,2,54482.752732,1.070369,272.945953,255.001802,0.356599,280.334245,885.017919,0.983893,54898.478509,...,0.000140,0.000034,6.502767e-08,-8.891570e-14,-7.558533e-10,-3.658872e-14,-0.151310,1293.544459,0.041057,0.014838
RR,1,A4,3,46999.688747,1.054378,251.337713,238.375438,0.316996,256.319703,817.033443,0.985420,46886.308989,...,0.002951,0.000093,5.961453e-06,1.366487e-10,3.202327e-07,3.202046e-11,0.086875,1354.224446,0.053157,0.007887
RR,1,A5,4,50680.751535,1.067224,262.580886,246.040977,0.349302,267.361474,844.917216,0.985450,50348.170914,...,0.003706,0.000031,7.229710e-07,-1.487676e-12,-1.642923e-08,3.067576e-12,-0.076661,1188.613239,0.038555,0.006555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TR,2,H8,762,47634.615388,1.074280,255.443622,237.781139,0.365387,260.429973,822.255724,0.984534,46878.750339,...,0.001501,0.000359,4.130417e-06,7.295109e-11,4.233366e-08,1.412304e-10,-0.125910,1381.212217,0.010821,0.026928
TR,2,H9,763,45616.455707,1.141554,257.800200,225.832628,0.482314,262.626514,798.260048,0.984502,44943.735814,...,0.002145,0.000292,4.471516e-06,-6.053277e-11,-1.001351e-07,1.499027e-10,0.088982,1479.048220,0.035616,0.026081
TR,2,H10,764,42336.001394,1.017190,234.372899,230.412135,0.183066,242.141388,769.823637,0.985571,42358.677346,...,0.000897,0.000324,1.865052e-06,-2.186069e-11,-2.439511e-08,4.027224e-11,0.042767,1474.998259,0.009338,0.004076
TR,2,H11,765,40484.132024,1.034860,231.114030,223.328768,0.257365,237.301383,758.159345,0.983294,39803.853480,...,0.006455,0.000210,9.167153e-06,-3.484705e-10,-6.793602e-07,2.008960e-10,-0.080494,1449.277625,0.012343,0.042325


## Defining which features are scaled by which method

In [2]:
# Substrings that mark a column as "leave unscaled" or "scale per plate".
no_scaling = ["UMAP_", "PC_", "Location"]
per_plate_scaling = ["RadialDistribution", "Intensity"]

# Sort every column of the feature table into its scaling group(s).
keys_no_scaling = []
keys_per_plate_scaling = []
keys_normal_scaling = []
keys_group_scaling = []

for key in data_complete.keys():
    # Anything matching a no-scaling word is excluded from all other groups.
    if any(word in key for word in no_scaling):
        keys_no_scaling.append(key)
        continue

    # Per-plate membership is independent of the AreaShape/loco split below,
    # so a key may land in this list and in one of the following ones.
    if any(word in key for word in per_plate_scaling):
        keys_per_plate_scaling.append(key)

    # "loco" features form their own group; the remaining AreaShape features
    # get the regular standard scaling.
    if "loco" in key:
        keys_group_scaling.append(key)
    elif "AreaShape" in key:
        keys_normal_scaling.append(key)

#keys_per_plate_scaling

['072h_raw_CH__bf_mask_Intensity_IntegratedIntensity',
 '072h_raw_CH__bf_mask_Intensity_IntegratedIntensityEdge',
 '072h_raw_CH__bf_mask_Intensity_LowerQuartileIntensity',
 '072h_raw_CH__bf_mask_Intensity_MADIntensity',
 '072h_raw_CH__bf_mask_Intensity_MassDisplacement',
 '072h_raw_CH__bf_mask_Intensity_MaxIntensity',
 '072h_raw_CH__bf_mask_Intensity_MaxIntensityEdge',
 '072h_raw_CH__bf_mask_Intensity_MeanIntensity',
 '072h_raw_CH__bf_mask_Intensity_MeanIntensityEdge',
 '072h_raw_CH__bf_mask_Intensity_MedianIntensity',
 '072h_raw_CH__bf_mask_Intensity_MinIntensity',
 '072h_raw_CH__bf_mask_Intensity_MinIntensityEdge',
 '072h_raw_CH__bf_mask_Intensity_StdIntensity',
 '072h_raw_CH__bf_mask_Intensity_StdIntensityEdge',
 '072h_raw_CH__bf_mask_Intensity_UpperQuartileIntensity',
 '072h_raw_CH__bf_mask_RadialDistribution_FracAtD_1of9',
 '072h_raw_CH__bf_mask_RadialDistribution_FracAtD_2of9',
 '072h_raw_CH__bf_mask_RadialDistribution_FracAtD_3of9',
 '072h_raw_CH__bf_mask_RadialDistribution_Frac

## Performing the differential standard scaling

In [3]:
# Scale the wide (per-timepoint) table, passing the feature groups in the order
# normal / per-plate / unscaled / group.
diff_scaling_df = opp.differential_standard_scaling(
    data_complete,
    keys_normal_scaling,
    keys_per_plate_scaling,
    keys_no_scaling,
    keys_group_scaling,
)
# Put the columns in alphabetical order so the saved table has a stable layout.
diff_scaling_df = diff_scaling_df.reindex(columns=sorted(diff_scaling_df.columns))

## Saving the result

In [4]:
# Write the per-timepoint normalised table next to the input table.
diff_scaling_df.to_csv(
    path_or_buf=f"{table_location}/All_Samples_Feature_Space_All_Features_48h,72h,96h_Normalised_per_Timepoint.csv"
)

# Stacking time data
Here we stack the features from the different timepoints on top of each other and add an `hour` column (later used as an index level), so that we can scale across all timepoints.

In [5]:
def stack_time_data(dataframe, hours=("48", "72", "96")):
    """Stack hour-prefixed feature columns into one long table.

    Feature columns are expected to be named ``"<HHH>h_<feature>"`` with the
    hour zero-padded to three digits (e.g. ``"048h_raw_BF_AreaShape_area"``).
    For every requested hour the matching columns are selected together with
    the identifier columns, the hour prefix is stripped from the feature names,
    and an ``"hour"`` column holding the hour value is added. The per-hour
    tables are then concatenated row-wise.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Wide table containing the columns ``"Run"``, ``"Plate"`` and ``"ID"``
        plus the hour-prefixed feature columns.
    hours : iterable of str or int, optional
        Hours to stack, in output order. Defaults to ``("48", "72", "96")``.

    Returns
    -------
    pandas.DataFrame
        Long table with a fresh 0..n-1 index. Features missing for an hour
        are filled with NaN by the concatenation.

    Raises
    ------
    KeyError
        If one of the identifier columns is missing from ``dataframe``.
    """
    id_columns = ["Run", "Plate", "ID"]

    stacked_frames = []
    for hour in hours:
        # Match the complete prefix (e.g. "048h_") instead of skipping the
        # first character, so that "148h_..." is not mistaken for hour 48.
        prefix = f"{str(hour).zfill(3)}h_"
        hour_columns = [key for key in dataframe.keys() if str(key).startswith(prefix)]

        # rename() returns a new frame, so adding "hour" below never touches
        # the caller's dataframe.
        frame = dataframe[hour_columns + id_columns].rename(
            columns={key: key[len(prefix):] for key in hour_columns}
        )
        frame["hour"] = np.full(len(frame), hour)
        stacked_frames.append(frame)

    return pd.concat(stacked_frames, axis=0, ignore_index=True)

# Re-read the raw feature table without setting an index, so that Run, Plate
# and ID are ordinary columns that stack_time_data can select.
data_complete_no_index = pd.read_csv(
    filepath_or_buffer=f"{table_location}/All_Samples_Feature_Space_All_Features_48h,72h,96h.csv"
)
# Long-format table: one row per sample and timepoint.
out = stack_time_data(dataframe=data_complete_no_index)
out

Unnamed: 0,raw_BF_AreaShape_area,raw_BF_AreaShape_aspect_ratio,raw_BF_AreaShape_axis_major_length,raw_BF_AreaShape_axis_minor_length,raw_BF_AreaShape_eccentricity,raw_BF_AreaShape_feret_diameter_max,raw_BF_AreaShape_perimeter,raw_BF_AreaShape_solidity,str_BF_AreaShape_area,str_BF_AreaShape_aspect_ratio,...,raw_CH_bra_mask_RadialDistribution_RadialCV_3of9,raw_CH_bra_mask_RadialDistribution_RadialCV_4of9,raw_CH_bra_mask_RadialDistribution_RadialCV_5of9,raw_CH_bra_mask_RadialDistribution_RadialCV_6of9,raw_CH_bra_mask_RadialDistribution_RadialCV_7of9,raw_CH_bra_mask_RadialDistribution_RadialCV_8of9,raw_CH_bra_mask_RadialDistribution_RadialCV_9of9,str_CH_AreaShape_Bra_MajorAxis_Polarisation,str_CH_AreaShape_Bra_MinorAxis_Polarisation,raw_CH_AreaShape_Bra_AreaFraction
0,51429.057934,1.036110,260.694837,251.609209,0.261703,266.724612,848.528995,0.985088,51519.761740,1.053506,...,,,,,,,,,,
1,46107.767989,1.018480,244.753205,240.312256,0.189631,252.890770,807.094108,0.984030,46032.181484,1.017855,...,,,,,,,,,,
2,54482.752732,1.070369,272.945953,255.001802,0.356599,280.334245,885.017919,0.983893,54898.478509,1.089076,...,,,,,,,,,,
3,46999.688747,1.054378,251.337713,238.375438,0.316996,256.319703,817.033443,0.985420,46886.308989,1.057124,...,,,,,,,,,,
4,50680.751535,1.067224,262.580886,246.040977,0.349302,267.361474,844.917216,0.985450,50348.170914,1.057777,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2296,111134.838154,1.142154,405.046069,354.633456,0.483148,426.097154,1281.489393,0.983741,115224.068070,1.265925,...,0.059606,0.046989,0.045001,0.054103,0.057371,0.056742,0.073117,0.010821,0.026928,0.940896
2297,125088.106959,1.240381,449.765600,362.602919,0.591637,478.386100,1389.507526,0.973528,126962.652281,1.322848,...,0.036139,0.049559,0.084494,0.141751,0.151588,0.113339,0.089259,0.035616,0.026081,0.873950
2298,128799.404350,1.177665,444.001992,377.019027,0.528171,461.129001,1400.733636,0.970995,127408.612659,1.199837,...,0.028084,0.021513,0.035351,0.054787,0.073556,0.079270,0.073310,0.009338,0.004076,0.940728
2299,119313.297985,1.451500,473.874839,326.472498,0.724816,463.075495,1343.308177,0.971624,122858.305064,1.614141,...,0.067871,0.067214,0.058404,0.056358,0.053544,0.047907,0.031259,0.012343,0.042325,0.958505


## Performing the differential standard scaling for Stacked time data

In [6]:
# Scale the stacked table across all timepoints.
# The feature-name lists were built from the wide, hour-prefixed columns, so the
# 5-character hour prefix (e.g. "048h_") is stripped and the names are
# de-duplicated to match the stacked columns. dict.fromkeys de-duplicates while
# keeping first-seen order, which makes the feature (and output column) order
# reproducible between kernel restarts; set() order is not, because string
# hashing is randomised per process.
diff_scaled_timepoints_combined = opp.differential_standard_scaling(
    out.set_index(["hour","Run","Plate","ID"]),
    list(dict.fromkeys(key[5:] for key in keys_normal_scaling)),
    list(dict.fromkeys(key[5:] for key in keys_per_plate_scaling)),
    # UMAP_ and PC_ keys are left out of the unscaled group for the stacked table.
    list(dict.fromkeys(key[5:] for key in keys_no_scaling if "UMAP_" not in key and "PC_" not in key)),
    list(dict.fromkeys(key[5:] for key in keys_group_scaling)),
    # NOTE(review): presumably the index levels that define one plate for the
    # per-plate scaling - confirm against opp.differential_standard_scaling.
    ["Run","Plate"]
)
diff_scaled_timepoints_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,raw_CH__bf_mask_Location_Center_X,raw_CH_bra_mask_Location_CenterMassIntensity_X,raw_CH_bra_mask_Location_MaxIntensity_X,raw_CH_bra_mask_Location_MaxIntensity_Y,raw_CH__bf_mask_Location_CenterMassIntensity_Y,raw_CH_bra_mask_Location_CenterMassIntensity_Z,raw_CH_bra_mask_Location_Center_X,raw_CH_bra_mask_Location_MaxIntensity_Z,raw_CH__bf_mask_Location_CenterMassIntensity_Z,raw_CH__bf_mask_Location_Center_Z,...,str_BF_AreaShape_locoefa_coeff_18,str_BF_AreaShape_locoefa_coeff_21,str_BF_AreaShape_locoefa_coeff_41,str_BF_AreaShape_locoefa_coeff_17,str_BF_AreaShape_locoefa_coeff_48,str_BF_AreaShape_locoefa_coeff_6,str_BF_AreaShape_locoefa_coeff_7,str_BF_AreaShape_locoefa_coeff_10,str_BF_AreaShape_locoefa_coeff_4,str_BF_AreaShape_locoefa_coeff_40
hour,Run,Plate,ID,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
48,RR,1,A1,,,,,,,,,,,...,-0.146065,-0.160023,-0.150960,-0.146147,-0.160939,-0.114686,-0.140001,-0.144639,-0.101858,-0.158851
48,RR,1,A2,,,,,,,,,,,...,-0.159476,-0.159433,-0.161845,-0.149568,-0.160783,-0.131948,-0.118456,-0.151153,-0.091190,-0.163127
48,RR,1,A3,,,,,,,,,,,...,-0.141260,-0.163112,-0.158453,-0.155746,-0.163593,-0.131050,-0.066554,-0.111846,-0.018542,-0.162827
48,RR,1,A4,,,,,,,,,,,...,-0.151532,-0.157940,-0.162030,-0.150996,-0.157040,-0.156191,-0.152452,-0.151308,-0.069545,-0.157451
48,RR,1,A5,,,,,,,,,,,...,-0.160664,-0.149163,-0.160105,-0.154475,-0.161930,-0.121939,-0.143884,-0.123822,-0.122515,-0.161279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,TR,2,H8,369.320547,368.556524,376.0,318.0,315.886732,0.0,368.337863,0.0,0.0,0.0,...,-0.150126,-0.159621,-0.159815,-0.152999,-0.157729,-0.097210,-0.020202,-0.146748,-0.027682,-0.156898
96,TR,2,H9,516.973956,522.330168,523.0,230.0,228.593520,0.0,522.006084,0.0,0.0,0.0,...,-0.157065,-0.156696,-0.158631,-0.148775,-0.157499,0.106735,-0.107569,-0.112577,0.070618,-0.163104
96,TR,2,H10,402.688087,401.366129,396.0,289.0,303.792287,0.0,401.161697,0.0,0.0,0.0,...,-0.162742,-0.160090,-0.157026,-0.161268,-0.159789,-0.006373,0.024936,-0.130398,0.197772,-0.158337
96,TR,2,H11,421.072347,421.649370,427.0,293.0,292.885256,0.0,420.962723,0.0,0.0,0.0,...,-0.149931,-0.161148,-0.160638,-0.156348,-0.163081,-0.104537,-0.106814,-0.135554,0.018819,-0.159104


In [8]:
# Order the columns alphabetically, then save and display the table that was
# normalised across all timepoints.
diff_scaled_timepoints_combined = diff_scaled_timepoints_combined.reindex(
    columns=sorted(diff_scaled_timepoints_combined.columns)
)
diff_scaled_timepoints_combined.to_csv(
    path_or_buf=f"{table_location}/All_Samples_Feature_Space_All_Features_48h,72h,96h_Normalised_across_Timepoints.csv"
)
diff_scaled_timepoints_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,raw_BF_AreaShape_area,raw_BF_AreaShape_aspect_ratio,raw_BF_AreaShape_axis_major_length,raw_BF_AreaShape_axis_minor_length,raw_BF_AreaShape_eccentricity,raw_BF_AreaShape_feret_diameter_max,raw_BF_AreaShape_perimeter,raw_BF_AreaShape_solidity,raw_CH_AreaShape_Bra_AreaFraction,raw_CH__bf_mask_Intensity_IntegratedIntensity,...,str_BF_AreaShape_moments_hu_1,str_BF_AreaShape_moments_hu_2,str_BF_AreaShape_moments_hu_3,str_BF_AreaShape_moments_hu_4,str_BF_AreaShape_moments_hu_5,str_BF_AreaShape_moments_hu_6,str_BF_AreaShape_orientation,str_BF_AreaShape_perimeter,str_CH_AreaShape_Bra_MajorAxis_Polarisation,str_CH_AreaShape_Bra_MinorAxis_Polarisation
hour,Run,Plate,ID,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
48,RR,1,A1,-0.786269,-0.602748,-0.804054,-0.680943,-0.839800,-0.833971,-0.802268,0.010873,,,...,-0.357708,0.487340,-0.094237,-0.043775,-0.058786,0.001758,-0.333690,-0.713208,,
48,RR,1,A2,-1.021382,-0.728751,-1.052827,-0.975465,-1.307770,-1.046449,-1.048576,-0.231227,,,...,-0.393168,-0.564204,-0.104432,-0.043775,-0.058574,-0.000295,-0.132991,-1.148552,,
48,RR,1,A3,-0.651346,-0.357900,-0.612872,-0.592495,-0.223631,-0.624938,-0.585361,-0.262510,,,...,-0.290376,0.319163,-0.095871,-0.043818,-0.059593,0.000651,0.201232,-0.410262,,
48,RR,1,A4,-0.981974,-0.472190,-0.950074,-1.025959,-0.480778,-0.993783,-0.989492,0.086936,,,...,-0.352930,-0.602993,-0.105103,-0.043775,-0.058580,-0.000286,-1.135101,-1.097838,,
48,RR,1,A5,-0.819332,-0.380374,-0.774622,-0.826112,-0.271012,-0.824190,-0.823738,0.093749,,,...,-0.351888,-0.566414,-0.104144,-0.043776,-0.058623,-0.000303,0.776286,-0.691755,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,TR,2,H8,1.851740,0.155152,1.448585,2.004995,0.598064,1.613871,1.771448,-0.297183,1.217114,1.417044,...,0.458998,2.582357,0.188006,-0.035680,-0.034970,0.419129,-0.277153,1.913029,-0.584849,1.261430
96,TR,2,H9,2.468244,0.857179,2.146445,2.212767,1.302495,2.416990,2.413557,-2.634542,0.871934,1.557836,...,0.826635,1.986519,0.212227,-0.050492,-0.114470,0.444883,0.218896,2.452851,0.748758,1.188526
96,TR,2,H10,2.632222,0.408947,2.056502,2.588608,0.890400,2.151934,2.480291,-3.214064,1.216245,1.983962,...,0.114522,2.268364,0.027144,-0.046201,-0.072206,0.119311,0.112216,2.430505,-0.664662,-0.703838
96,TR,2,H11,2.213093,2.366057,2.522676,1.270813,2.167236,2.181831,2.138927,-3.070283,1.307905,1.607404,...,3.287134,1.246410,0.545660,-0.082443,-0.437687,0.596319,-0.172317,2.288588,-0.503000,2.585532
