In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the two input tables (means and standard errors, going by the file names).
raw_mean, raw_stderr = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/tomato_mean.csv', '../data/tomato_stderr.csv')
)

In [3]:
# Columns removed from BOTH tables: four fixed metadata columns plus every
# column that has at least one missing value in the standard-error table.
incomplete_stderr_cols = raw_stderr.columns[raw_stderr.isna().any()]
drop_cols = ['Variety', 'panel number', 'panel date', 'measure', *incomplete_stderr_cols]


def _to_imputed_floats(raw):
    """Drop `drop_cols`, turn '-' placeholders into NaN, cast to float and
    fill every remaining NaN with the mean of its own column."""
    numeric = raw.drop(columns=drop_cols).replace('-', np.nan).astype(float)
    return numeric.fillna(numeric.mean())


df_mean = _to_imputed_floats(raw_mean)
df_stderr = _to_imputed_floats(raw_stderr)

drop_cols

['Variety', 'panel number', 'panel date', 'measure', 'Bitter']

In [4]:
def normalize_column(col):
    """Standardise `col` to zero mean and unit (sample) standard deviation.

    Parameters
    ----------
    col : pandas.Series (a DataFrame is standardised column by column)
        Numeric values to standardise.

    Returns
    -------
    Same type and index as `col`, holding ``(col - col.mean()) / col.std()``.
    Where the standard deviation is 0 or NaN (constant column, or fewer than
    two observations) the divisor falls back to 1, so the result is the
    centred values (all zeros for a constant column) instead of NaN/inf.
    """
    centered = col - col.mean()
    spread = col.std()

    if getattr(spread, "ndim", 0) == 0:
        # Series input -> scalar std. `spread > 0` is False for both 0 and NaN.
        scale = spread if spread > 0 else 1.0
    else:
        # DataFrame input -> one std per column; guard each one separately.
        scale = spread.where(spread > 0, 1.0)

    return centered / scale

In [5]:
# Count of leading columns handled as labels: the cells that build
# val_dataset and sample_dataset divide these columns by 100 and apply
# normalize_column to all remaining columns.
NUM_LABEL_COLS = 7

In [6]:
# Positional split: the first NUM_LABEL_COLS columns are labels, the rest are features.
label_cols = df_mean.columns[:NUM_LABEL_COLS]
feature_cols = df_mean.columns[NUM_LABEL_COLS:]

val_dataset = df_mean.copy()
# Labels are rescaled by a constant factor of 1/100.
val_dataset[label_cols] = val_dataset[label_cols] / 100
# Every feature column is standardised independently with its own mean and std.
val_dataset[feature_cols] = val_dataset[feature_cols].apply(normalize_column, axis=0)

# val_dataset.to_csv('../data/tomato_val_dataset.csv')

In [7]:
# Show the dataset built from df_mean (labels divided by 100, remaining columns standardised).
val_dataset

Unnamed: 0,Overall Liking,Texture liking,Sweetness,Sourness,Salty,Umami,Tomato Flavor Intenstity,glucose,fructose,Soluble solids,...,citric:malic,3-methyl-1-pentanol,2-ethylfuran,isopentyl acetate,cis-3-hexenyl acetate,benzothiazole,benzyl alcohol,citric acid,3-methyl-2-butenal,p-anisaldehyde
0,0.3367,0.3406,0.2551,0.1820,0.1294,0.1286,0.3341,2.280922,1.913190,2.562857,...,2.510053,1.266061,-0.092380,-1.007032,1.312008,1.039829,0.453284,0.923551,-0.017934,-0.340967
1,0.2878,0.2747,0.2556,0.1653,0.1256,0.1667,0.3561,0.738719,0.686613,0.719634,...,-0.582452,1.458910,-0.868766,2.501611,0.512250,1.471306,-0.547379,-1.264632,0.926590,1.559112
2,0.2701,0.2099,0.2009,0.1710,0.1111,0.1352,0.3089,0.027918,-0.164139,0.099564,...,-0.663188,0.968021,1.149837,0.293926,-0.335493,-0.055458,-0.070873,-0.613602,0.526984,-0.229198
3,0.2649,0.2119,0.2505,0.1673,0.1153,0.0896,0.3026,2.796207,3.405277,2.928103,...,-0.247226,-0.609840,-0.247657,1.713152,-0.303503,-0.088648,-0.428252,1.945306,0.853935,-0.452737
4,0.2575,0.2325,0.2440,0.1290,0.1040,0.1019,0.3024,1.215633,1.071788,1.246269,...,-0.508738,-0.013759,-0.713489,-0.928186,-1.119256,0.309638,-0.213824,-0.215751,0.236361,-0.452737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,0.0532,0.1300,0.0685,0.0689,0.0647,0.0877,0.1470,-1.159797,-0.771818,-1.004671,...,-1.108986,-0.925412,1.305114,-0.533956,-0.943309,0.475591,-0.404427,-1.978956,-0.381213,1.670882
75,0.0473,0.0923,0.0831,0.0711,0.0615,0.0812,0.1462,-0.682884,-0.420299,-0.945212,...,-0.961556,-0.557244,-1.179320,-0.218573,0.688197,-0.088648,-0.785631,-1.165169,-0.272229,-0.564506
76,0.0453,0.0643,0.1329,0.1183,0.0762,0.0908,0.2257,0.601675,0.232255,1.823869,...,3.649120,-0.820221,-1.645152,-0.888763,-1.631101,-0.254601,2.478433,1.556496,1.471508,0.106110
77,0.0434,0.0631,0.0649,0.2046,0.1077,0.0881,0.2468,-1.695183,-1.502903,-1.004671,...,1.388537,1.669292,-0.713489,4.196797,-0.143551,-0.652887,0.238856,2.768135,0.236361,-0.340967


In [18]:
# Draw synthetic replicates of every row of df_mean from a normal distribution.
SAMPLES_PER_ROW = 10  # number of replicates drawn for each original row
SAMPLE_SEED = 0       # fixed seed so that Restart & Run All reproduces the same draws

# df_stderr is paired with df_mean purely by position, so the tables must line up.
assert df_mean.shape == df_stderr.shape, "df_mean and df_stderr must have the same shape"

sample_rng = np.random.default_rng(SAMPLE_SEED)
sample_means = df_mean.to_numpy(dtype=float)
sample_stderrs = df_stderr.to_numpy(dtype=float)

# Spread of each draw is stderr / mean; where the mean is exactly 0 the spread
# is 0, so those cells are reproduced unchanged.
# NOTE(review): this is the standard error divided by the mean, not the raw
# standard error -- confirm that this is the intended scale.
sample_scales = np.divide(
    sample_stderrs,
    sample_means,
    out=np.zeros_like(sample_means),
    where=sample_means != 0,
)

# Repeat each source row SAMPLES_PER_ROW times in place, so output rows
# i*SAMPLES_PER_ROW .. (i+1)*SAMPLES_PER_ROW - 1 all come from source row i.
# One vectorised draw replaces growing the frame with pd.concat inside a loop.
sample_draws = sample_rng.normal(
    loc=np.repeat(sample_means, SAMPLES_PER_ROW, axis=0),
    scale=np.repeat(sample_scales, SAMPLES_PER_ROW, axis=0),
)
df_samples = pd.DataFrame(sample_draws, columns=df_mean.columns)

df_samples

Unnamed: 0,Overall Liking,Texture liking,Sweetness,Sourness,Salty,Umami,Tomato Flavor Intenstity,glucose,fructose,Soluble solids,...,citric:malic,3-methyl-1-pentanol,2-ethylfuran,isopentyl acetate,cis-3-hexenyl acetate,benzothiazole,benzyl alcohol,citric acid,3-methyl-2-butenal,p-anisaldehyde
0,33.811720,34.054781,25.568988,17.988985,12.872541,12.653872,33.415926,26.846440,27.788920,8.134601,...,22.756256,1.297807,-0.093780,0.000000,2.337792,0.184461,0.239134,4.698115,0.329048,0.183392
1,33.855798,34.199163,25.252777,18.190534,13.002209,13.076103,33.467872,26.815412,27.771902,8.098606,...,22.777979,1.366769,0.172402,0.000000,2.319625,0.107618,0.486699,4.737961,0.326578,0.095124
2,33.708669,34.044921,25.401591,18.093449,12.821958,12.972606,33.589041,26.813716,27.788863,8.093020,...,22.747706,1.405127,0.121547,0.000000,2.336737,0.078946,0.790589,4.745674,0.364361,0.070460
3,33.598928,34.092697,25.292846,17.994763,12.848982,12.880868,33.376425,26.830090,27.829449,8.063595,...,22.764312,1.363286,0.115820,0.000000,2.392278,-0.043075,0.394255,4.678965,0.335862,-0.131942
4,33.756860,34.152450,25.561378,18.274079,12.944118,12.983029,33.495457,26.897233,27.792402,8.114532,...,22.744572,1.578967,0.075599,0.000000,2.372444,0.193612,0.144619,4.700358,0.354502,-0.073272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,3.845226,1.569633,13.518098,8.458367,8.483430,9.623312,19.203961,7.415612,8.644307,4.909008,...,2.890911,0.401331,-0.103307,0.148105,0.164671,0.116677,0.495337,2.543774,0.286741,0.405011
786,2.624020,2.404939,13.542159,9.006776,8.117069,9.533624,19.189404,7.447700,8.627883,4.958613,...,2.559575,0.183710,-0.030087,-0.978217,1.022793,0.210065,0.902702,2.696364,0.295951,-0.036386
787,2.923640,-1.655567,13.463333,9.135723,8.295989,9.502517,18.988588,7.314592,8.602654,5.010942,...,2.569521,0.736729,0.233650,0.355027,0.633134,-0.215248,0.669395,2.618996,0.180242,0.218354
788,3.720795,3.764131,13.351551,8.808030,8.169610,9.663344,19.206164,7.353800,8.645291,4.988047,...,2.571207,0.892085,-0.317929,0.781038,0.649589,-0.024270,0.655507,2.601596,0.283013,0.327278


In [19]:
# Same positional label/feature split as for the mean table.
sample_label_cols = df_samples.columns[:NUM_LABEL_COLS]
sample_feature_cols = df_samples.columns[NUM_LABEL_COLS:]

sample_dataset = df_samples.copy()
# Negative label draws are floored at 0 before the 1/100 rescale.
sample_dataset[sample_label_cols] = sample_dataset[sample_label_cols].clip(lower=0) / 100
# Feature columns are standardised with the statistics of the sampled data itself.
sample_dataset[sample_feature_cols] = sample_dataset[sample_feature_cols].apply(normalize_column, axis=0)

# sample_dataset.to_csv('../data/tomato_samples_small.csv')
sample_dataset

Unnamed: 0,Overall Liking,Texture liking,Sweetness,Sourness,Salty,Umami,Tomato Flavor Intenstity,glucose,fructose,Soluble solids,...,citric:malic,3-methyl-1-pentanol,2-ethylfuran,isopentyl acetate,cis-3-hexenyl acetate,benzothiazole,benzyl alcohol,citric acid,3-methyl-2-butenal,p-anisaldehyde
0,0.338117,0.340548,0.255690,0.179890,0.128725,0.126539,0.334159,2.295488,1.922709,2.609377,...,2.525383,1.004298,-0.955278,-0.553271,1.215329,0.406991,-0.317549,0.925334,-0.033387,0.608113
1,0.338558,0.341992,0.252528,0.181905,0.130022,0.130761,0.334679,2.289785,1.919508,2.578578,...,2.529219,1.112119,0.359303,-0.553271,1.187403,0.109232,0.152748,0.961596,-0.040644,0.294032
2,0.337087,0.340449,0.254016,0.180934,0.128220,0.129726,0.335890,2.289473,1.922699,2.573799,...,2.523873,1.172092,0.108147,-0.553271,1.213707,-0.001868,0.730046,0.968616,0.070380,0.206275
3,0.335989,0.340927,0.252928,0.179948,0.128490,0.128809,0.333764,2.292482,1.930334,2.548622,...,2.526806,1.106673,0.079866,-0.553271,1.299086,-0.474686,-0.022868,0.907907,-0.013365,-0.513916
4,0.337569,0.341524,0.255614,0.182741,0.129441,0.129830,0.334955,2.304824,1.923364,2.592205,...,2.523320,1.443889,-0.118773,-0.553271,1.268597,0.442448,-0.497099,0.927376,0.041410,-0.305158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,0.038452,0.015696,0.135181,0.084584,0.084834,0.096233,0.192040,-1.276013,-1.679005,-0.150519,...,-0.982699,-0.397338,-1.002327,-0.221668,-2.125245,0.144334,0.169158,-1.035213,-0.157703,1.396681
786,0.026240,0.024049,0.135422,0.090068,0.081171,0.095336,0.191894,-1.270116,-1.682095,-0.108075,...,-1.041210,-0.737587,-0.640720,-2.743479,-0.806120,0.506204,0.943026,-0.896349,-0.130640,-0.173906
787,0.029236,0.000000,0.134633,0.091357,0.082960,0.095025,0.189886,-1.294581,-1.686842,-0.063301,...,-1.039454,0.127056,0.661783,0.241626,-1.405112,-1.141838,0.499814,-0.966758,-0.470647,0.732512
788,0.037208,0.037641,0.133516,0.088080,0.081696,0.096633,0.192062,-1.287375,-1.678820,-0.082891,...,-1.039156,0.369953,-2.062269,1.195457,-1.379818,-0.401819,0.473431,-0.982592,-0.168658,1.120090


In [2]:
# Reload both datasets from disk; the first CSV column is used as the row index.
val_dataset, sample_dataset = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/tomato_val_dataset.csv', '../data/tomato_samples_small.csv')
)

In [3]:
# Show the sample dataset as loaded from '../data/tomato_samples_small.csv'.
sample_dataset

Unnamed: 0,Overall Liking,Texture liking,Sweetness,Sourness,Salty,Umami,Tomato Flavor Intenstity,glucose,fructose,Soluble solids,...,citric:malic,3-methyl-1-pentanol,2-ethylfuran,isopentyl acetate,cis-3-hexenyl acetate,benzothiazole,benzyl alcohol,citric acid,3-methyl-2-butenal,p-anisaldehyde
0,0.338117,0.340548,0.255690,0.179890,0.128725,0.126539,0.334159,2.295488,1.922709,2.609377,...,2.525383,1.004298,-0.955278,-0.553271,1.215329,0.406991,-0.317549,0.925334,-0.033387,0.608113
1,0.338558,0.341992,0.252528,0.181905,0.130022,0.130761,0.334679,2.289785,1.919508,2.578578,...,2.529219,1.112119,0.359303,-0.553271,1.187403,0.109232,0.152748,0.961596,-0.040644,0.294032
2,0.337087,0.340449,0.254016,0.180934,0.128220,0.129726,0.335890,2.289473,1.922699,2.573799,...,2.523873,1.172092,0.108147,-0.553271,1.213707,-0.001868,0.730046,0.968616,0.070380,0.206275
3,0.335989,0.340927,0.252928,0.179948,0.128490,0.128809,0.333764,2.292482,1.930334,2.548622,...,2.526806,1.106673,0.079866,-0.553271,1.299086,-0.474686,-0.022868,0.907907,-0.013365,-0.513916
4,0.337569,0.341524,0.255614,0.182741,0.129441,0.129830,0.334955,2.304824,1.923364,2.592205,...,2.523320,1.443889,-0.118773,-0.553271,1.268597,0.442448,-0.497099,0.927376,0.041410,-0.305158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,0.038452,0.015696,0.135181,0.084584,0.084834,0.096233,0.192040,-1.276013,-1.679005,-0.150519,...,-0.982699,-0.397338,-1.002327,-0.221668,-2.125245,0.144334,0.169158,-1.035213,-0.157703,1.396681
786,0.026240,0.024049,0.135422,0.090068,0.081171,0.095336,0.191894,-1.270116,-1.682095,-0.108075,...,-1.041210,-0.737587,-0.640720,-2.743479,-0.806120,0.506204,0.943026,-0.896349,-0.130640,-0.173906
787,0.029236,0.000000,0.134633,0.091357,0.082960,0.095025,0.189886,-1.294581,-1.686842,-0.063301,...,-1.039454,0.127056,0.661783,0.241626,-1.405112,-1.141838,0.499814,-0.966758,-0.470647,0.732512
788,0.037208,0.037641,0.133516,0.088080,0.081696,0.096633,0.192062,-1.287375,-1.678820,-0.082891,...,-1.039156,0.369953,-2.062269,1.195457,-1.379818,-0.401819,0.473431,-0.982592,-0.168658,1.120090


In [4]:
# random_state values for the val_dataset.sample(...) calls that pick each
# subset. The name suffix identifies the subset's fraction of val_dataset:
# 1 -> all rows, 75 -> 3/4, 50 -> 2/4, 25 -> 1/4, 167 -> 1/6, 125 -> 1/8.
# NOTE(review): how these particular seed values were chosen is not visible here.
avg_1_seed = 2
avg_75_seed = 67
avg_50_seed = 3
avg_25_seed = 54
avg_167_seed = 0
avg_125_seed = 79

In [5]:
def _sampled_row_labels(fraction, seed):
    """Index labels of a random `fraction` of val_dataset's rows, drawn with `seed`."""
    return val_dataset.sample(frac=fraction, random_state=seed).index


avg_1_indices = _sampled_row_labels(1, avg_1_seed)
avg_75_indices = _sampled_row_labels(3/4, avg_75_seed)
avg_50_indices = _sampled_row_labels(2/4, avg_50_seed)
avg_25_indices = _sampled_row_labels(1/4, avg_25_seed)
avg_167_indices = _sampled_row_labels(1/6, avg_167_seed)
avg_125_indices = _sampled_row_labels(1/8, avg_125_seed)

In [6]:
# Print the selected row labels of every subset, largest fraction first.
for subset_indices in (
    avg_1_indices,
    avg_75_indices,
    avg_50_indices,
    avg_25_indices,
    avg_167_indices,
    avg_125_indices,
):
    print(subset_indices)

Index([25, 30, 29, 48, 73, 56, 41, 52, 23, 14, 21, 36, 57, 13, 66, 54, 44,  1,
       28, 67, 61, 55, 35, 70, 27, 10, 45, 59, 18, 76,  0, 11, 32, 71, 62,  9,
       75, 17, 78, 58,  8, 60, 77, 26, 53, 12, 16,  2, 50, 19, 68,  6, 46, 64,
       24,  5,  3, 33, 69, 38, 65, 51, 74, 42,  4, 39, 37, 20, 31, 63, 47, 49,
       34,  7, 43, 22, 72, 15, 40],
      dtype='int64')
Index([ 0, 20, 21, 64,  4, 10, 11, 29, 38, 65, 17,  2, 15, 73, 44, 40,  8, 60,
       25, 30, 59, 24, 14,  6,  1,  5,  3, 37, 47, 62, 51, 50, 43, 16, 61, 13,
       48, 68, 32, 76, 18, 12, 56, 36, 49, 52, 57, 28, 27, 71, 78, 66, 41, 34,
       39, 63, 19, 26, 58],
      dtype='int64')
Index([45, 48, 25,  6, 52,  8, 66, 75,  9, 57, 12, 54, 32, 61, 68, 53, 13, 59,
       27, 71, 34, 49, 35, 47, 16, 77,  4, 11, 70, 23, 31, 65, 63, 55, 18, 36,
       74, 28, 64, 69],
      dtype='int64')
Index([47, 71, 46, 60, 45, 59, 78, 36, 10, 33, 0, 29, 21, 37, 74, 63, 4, 22,
       28, 53],
      dtype='int64')
Index([49, 27, 30, 63, 4

In [7]:
# Number of sample rows per val_dataset row (integer division of the two lengths).
NUM_SAMPLES = len(sample_dataset)//len(val_dataset)


def _sample_rows_for(row_ids):
    """Expand each row id r into the block r*NUM_SAMPLES .. (r+1)*NUM_SAMPLES - 1
    and return all blocks as one flat array, in the order of `row_ids`."""
    blocks = [
        [row_id * NUM_SAMPLES + offset for offset in range(NUM_SAMPLES)]
        for row_id in row_ids
    ]
    return np.array(blocks).flatten()


avg_1_sample_indices = _sample_rows_for(avg_1_indices)
avg_75_sample_indices = _sample_rows_for(avg_75_indices)
avg_50_sample_indices = _sample_rows_for(avg_50_indices)
avg_25_sample_indices = _sample_rows_for(avg_25_indices)
avg_167_sample_indices = _sample_rows_for(avg_167_indices)
avg_125_sample_indices = _sample_rows_for(avg_125_indices)

In [8]:
# (source frame, row labels to keep, destination CSV) for every subset,
# written in the order listed.
subset_exports = [
    (val_dataset, avg_1_indices, '../data/tomato_1_val_dataset.csv'),
    (sample_dataset, avg_1_sample_indices, '../data/tomato_1_samples_small.csv'),
    (val_dataset, avg_75_indices, '../data/tomato_75_val_dataset.csv'),
    (sample_dataset, avg_75_sample_indices, '../data/tomato_75_samples_small.csv'),
    (val_dataset, avg_50_indices, '../data/tomato_50_val_dataset.csv'),
    (sample_dataset, avg_50_sample_indices, '../data/tomato_50_samples_small.csv'),
    (val_dataset, avg_25_indices, '../data/tomato_25_val_dataset.csv'),
    (sample_dataset, avg_25_sample_indices, '../data/tomato_25_samples_small.csv'),
    (val_dataset, avg_167_indices, '../data/tomato_167_val_dataset.csv'),
    (sample_dataset, avg_167_sample_indices, '../data/tomato_167_samples_small.csv'),
    (val_dataset, avg_125_indices, '../data/tomato_125_val_dataset.csv'),
    (sample_dataset, avg_125_sample_indices, '../data/tomato_125_samples_small.csv'),
]

for source_frame, keep_labels, csv_path in subset_exports:
    source_frame.loc[keep_labels].to_csv(csv_path)