In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the table of measured means and the table of their standard errors.
raw_mean, raw_stderr = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/strawberry_mean.csv', '../data/strawberry_stderr.csv')
)

In [3]:
# Columns to discard: the identifying/metadata columns, plus every column for
# which the standard-error table has at least one missing (NaN) entry.
drop_cols = ['CULTIVAR', 'SEASON', 'HARVEST WEEK', 'HARVEST DATE', 'MEASURE']
drop_cols.extend(raw_stderr.columns[raw_stderr.isna().any()])


def _to_filled_floats(raw_table):
    """Drop `drop_cols`, turn '-' placeholders into NaN, cast to float, and
    fill the remaining gaps with each column's own mean."""
    numeric = raw_table.drop(columns=drop_cols).replace('-', np.nan).astype(float)
    return numeric.fillna(numeric.mean())


# The same cleaning is applied to both tables so their columns stay aligned.
df_mean = _to_filled_floats(raw_mean)
df_stderr = _to_filled_floats(raw_stderr)

drop_cols

['CULTIVAR',
 'SEASON',
 'HARVEST WEEK',
 'HARVEST DATE',
 'MEASURE',
 'L* ext',
 'A* ext',
 'B* ext',
 'FORCE',
 'L* int',
 'A* int',
 'B* int',
 'TOTAL SUGAR',
 'TOTAL VOLATILES']

In [4]:
def normalize_column(col):
    """Standardize a column to zero mean and unit (sample) standard deviation.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to normalize (as passed by ``DataFrame.apply(axis=0)``).

    Returns
    -------
    pandas.Series
        ``(col - col.mean()) / col.std()``. A constant column (standard
        deviation exactly 0) is returned as all zeros instead of the NaN
        values that a 0/0 division would produce.
    """
    centered = col - col.mean()
    spread = col.std()
    # Zero spread means every value equals the mean, so `centered` is already
    # all zeros; divide by 1 instead of 0 to avoid turning the column into NaN.
    spread = np.where(spread == 0, 1.0, spread)
    return centered / spread

In [5]:
# Number of leading columns treated as labels: later cells divide the first
# NUM_LABEL_COLS columns by 100 and standardize all remaining columns with
# normalize_column.
NUM_LABEL_COLS = 5

In [6]:
# Split the cleaned means by position: the first NUM_LABEL_COLS columns are
# labels, everything after them is a feature.
val_labels = df_mean.iloc[:, :NUM_LABEL_COLS]
val_features = df_mean.iloc[:, NUM_LABEL_COLS:]

# Labels are divided by 100; features are standardized column by column.
val_dataset = pd.concat(
    [val_labels / 100, val_features.apply(normalize_column, axis=0)],
    axis=1,
)

val_dataset.to_csv('../data/strawberry_val_dataset.csv')

In [7]:
# Display the scaled validation frame (labels / 100, features standardized).
val_dataset

Unnamed: 0,OVERALL LIKING,TEXTURE LIKING,SWEETNESS INTENSITY,SOURNESS INTENSITY,STRAWBERRY FLAVOR INTENSITY,6915-15-7,77-92-9,50-99-7,57-48-7,57-50-1,...,7786-58-5,15111-96-3,706-14-9,10522-34-6,5881-17-4,128-37-0,40716-66-3,4887-30-3,5454-09-1,2305-05-7
0,0.3075,0.2489,0.2772,0.1468,0.3058,-2.100612,0.169595,0.752129,0.606371,0.31149,...,-0.270974,-0.311335,-0.540292,-0.629855,1.398873,-0.939352,0.118712,-0.127677,0.800451,-0.373964
1,0.2634,0.2639,0.2366,0.2124,0.3005,-0.026208,0.937301,-0.744587,-0.81432,0.657985,...,-0.186399,-0.555966,-0.536839,-0.492856,0.509617,-0.825455,-0.535319,-0.213905,0.175372,-0.192277
2,0.361,0.3571,0.3034,0.1787,0.2427,2.435934,-0.693226,0.815594,0.737432,0.162992,...,-0.38938,0.599706,-0.541278,0.347401,0.926456,-0.505457,0.019087,0.213848,1.308934,0.448095
3,0.2249,0.13,0.2758,0.1587,0.2861,-1.28636,0.631578,0.358117,0.525114,1.091104,...,-0.193799,2.92792,-0.537086,-0.648121,0.982034,-1.023419,-0.692525,-0.252091,-0.368738,-0.174406
4,0.282,0.3039,0.2457,0.1839,0.2827,0.652335,-0.136129,-2.481941,-2.672753,2.50802,...,-0.418629,-0.480046,0.873843,-0.410657,1.458421,-0.462068,2.081086,0.112838,2.497555,0.151737
5,0.3185,0.2461,0.3101,0.1727,0.3348,-1.053716,1.460429,0.49298,0.719083,1.906295,...,0.445098,-0.977744,-0.548676,0.210403,2.065815,-0.898675,-0.736524,-0.453495,-0.467519,-0.637558
6,0.2776,0.2773,0.2476,0.1907,0.295,1.137009,0.624784,-0.659967,-0.70423,1.706751,...,0.143446,1.333599,-0.529318,0.009472,1.843501,-0.521728,-0.050493,-0.019584,0.194805,0.415331
7,0.3108,0.2673,0.2935,0.1571,0.322,-0.433334,-0.19048,2.002918,1.751837,0.882279,...,0.100453,2.590498,3.320372,0.183003,2.60572,0.367756,5.426126,1.507267,3.394688,0.74892
8,0.2877,0.2763,0.251,0.1723,0.2809,1.137009,-0.890247,-0.345287,-0.646563,0.432145,...,-0.054954,-0.513788,-0.477285,0.466133,1.386963,-0.760371,-0.459972,-0.246548,-0.342828,3.483157
9,0.2605,0.256,0.2068,0.1946,0.254,0.768657,0.23074,-0.871517,-0.754033,1.350975,...,-0.422153,-0.218543,-0.536099,-0.78512,0.481827,-0.657321,0.698885,-0.521246,-0.605166,-0.032929


In [8]:
SAMPLES_PER_ROW = 10  # synthetic draws generated for every original row
SAMPLE_SEED = 0       # fixed seed so re-running the notebook reproduces the samples


def draw_samples(mean_df, stderr_df, n_samples=SAMPLES_PER_ROW, seed=SAMPLE_SEED):
    """Draw `n_samples` normally distributed rows around every row of `mean_df`.

    Parameters
    ----------
    mean_df : pandas.DataFrame
        Centers of the normal distributions, one row per original observation.
    stderr_df : pandas.DataFrame
        Standard errors, matched to `mean_df` by position (same shape).
    n_samples : int
        Number of draws per original row.
    seed : int
        Seed for the random generator, making the draws reproducible.

    Returns
    -------
    pandas.DataFrame
        ``len(mean_df) * n_samples`` rows with the columns of `mean_df`; the
        draws for original row ``i`` occupy rows ``i*n_samples`` through
        ``(i+1)*n_samples - 1``, with a fresh 0..N-1 index.

    Raises
    ------
    ValueError
        If any computed scale is negative (numpy rejects negative scales).
    """
    means = mean_df.to_numpy(dtype=float)
    stderrs = stderr_df.to_numpy(dtype=float)

    # Spread of each draw is stderr / mean, with 0 used wherever mean == 0.
    # NOTE(review): this is a relative error, not the raw standard error; it is
    # kept exactly as the original computation -- confirm this is intended.
    # A negative mean gives a negative scale, which raises ValueError.
    scales = np.divide(stderrs, means, out=np.zeros_like(means), where=means != 0)

    rng = np.random.default_rng(seed)
    n_rows, n_cols = means.shape
    # Insert a sample axis so one call draws every (row, sample, column) value;
    # this replaces growing a DataFrame with pd.concat inside a Python loop.
    draws = rng.normal(
        loc=means[:, np.newaxis, :],
        scale=scales[:, np.newaxis, :],
        size=(n_rows, n_samples, n_cols),
    )
    return pd.DataFrame(draws.reshape(n_rows * n_samples, n_cols), columns=mean_df.columns)


df_samples = draw_samples(df_mean, df_stderr)

df_samples

Unnamed: 0,OVERALL LIKING,TEXTURE LIKING,SWEETNESS INTENSITY,SOURNESS INTENSITY,STRAWBERRY FLAVOR INTENSITY,6915-15-7,77-92-9,50-99-7,57-48-7,57-50-1,...,7786-58-5,15111-96-3,706-14-9,10522-34-6,5881-17-4,128-37-0,40716-66-3,4887-30-3,5454-09-1,2305-05-7
0,30.842778,24.839807,27.689229,14.634752,30.487355,103.991881,765.989844,1879.000562,1998.001068,1314.027131,...,4.490021,0.882196,0.688871,0.447879,9.605760,0.875115,96.875792,12.753138,8.661265,4.225269
1,30.674880,24.933804,27.685665,14.557226,30.513502,104.021390,766.015023,1878.990928,1998.006144,1314.009203,...,4.616072,0.886733,0.739118,0.372473,9.612883,0.824222,97.011295,12.831551,8.677959,4.089280
2,30.711384,24.818647,27.825618,14.450792,30.620601,104.016744,765.978494,1878.992599,1998.000255,1313.952031,...,4.538090,0.758440,0.602739,0.420953,9.494749,0.518599,97.045194,12.727602,8.717469,4.157612
3,30.839922,25.100124,27.732406,14.774104,30.561021,104.026261,765.971027,1878.991128,1997.996912,1314.014246,...,4.470551,0.920688,0.754966,0.678186,9.689989,0.439734,97.331501,12.823799,8.734769,4.230377
4,30.779551,24.922735,27.669267,14.513505,30.612008,104.010329,765.979188,1878.998083,1997.998869,1313.988656,...,4.327633,0.753935,0.761718,0.418251,9.780806,0.666861,96.941363,12.853178,8.627306,4.209989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,23.778791,24.925079,20.507886,22.682926,25.704756,201.010326,869.995502,1282.000575,1462.998835,712.002001,...,1.056260,1.257243,32.328356,0.414739,5.574023,2.238166,89.572782,2.027212,0.691682,3.599580
536,23.655239,25.114082,20.468533,22.652633,25.761690,200.982989,869.998777,1282.018810,1463.001291,711.998135,...,1.219998,0.990730,31.664478,0.182975,5.276476,3.613700,90.319423,1.948654,0.976626,3.717078
537,23.675711,25.023030,20.281045,22.627092,25.788962,200.990149,870.005013,1282.000607,1462.999546,711.986859,...,1.693027,0.892082,32.216750,0.144678,5.661194,2.838643,90.322197,1.674484,-0.166778,4.158888
538,23.831333,25.013839,20.360630,22.573903,25.654084,201.032172,870.025303,1281.991441,1462.991303,711.982986,...,1.487321,1.132083,31.915536,0.830148,5.422269,3.048772,89.974595,2.188460,0.427537,3.811684


In [9]:
# Labels: negative draws are clipped to 0 before dividing by 100.
sample_labels = df_samples.iloc[:, :NUM_LABEL_COLS].clip(lower=0) / 100
# Features: standardized column by column over the sampled rows.
sample_features = df_samples.iloc[:, NUM_LABEL_COLS:].apply(normalize_column, axis=0)

sample_dataset = pd.concat([sample_labels, sample_features], axis=1)

sample_dataset.to_csv('../data/strawberry_samples_small.csv')
sample_dataset

Unnamed: 0,OVERALL LIKING,TEXTURE LIKING,SWEETNESS INTENSITY,SOURNESS INTENSITY,STRAWBERRY FLAVOR INTENSITY,6915-15-7,77-92-9,50-99-7,57-48-7,57-50-1,...,7786-58-5,15111-96-3,706-14-9,10522-34-6,5881-17-4,128-37-0,40716-66-3,4887-30-3,5454-09-1,2305-05-7
0,0.308428,0.248398,0.276892,0.146348,0.304874,-2.118478,0.170950,0.758489,0.611502,0.314165,...,-0.274696,-0.291693,-0.544876,-0.624615,1.357599,-0.887048,0.117988,-0.129697,0.803508,-0.374205
1,0.306749,0.249338,0.276857,0.145572,0.305135,-2.117901,0.171123,0.758463,0.611515,0.314137,...,-0.270216,-0.287849,-0.544251,-0.693461,1.360433,-0.900900,0.119259,-0.127262,0.806233,-0.394632
2,0.307114,0.248186,0.278256,0.144508,0.306206,-2.117991,0.170873,0.758468,0.611499,0.314048,...,-0.272988,-0.396558,-0.545947,-0.649198,1.313429,-0.984084,0.119577,-0.130490,0.812682,-0.384367
3,0.308399,0.251001,0.277324,0.147741,0.305610,-2.117805,0.170822,0.758464,0.611491,0.314145,...,-0.275388,-0.259077,-0.544054,-0.414341,1.391113,-1.005550,0.122263,-0.127503,0.815506,-0.373437
4,0.307796,0.249227,0.276693,0.145135,0.306120,-2.118117,0.170877,0.758482,0.611496,0.314105,...,-0.280468,-0.400374,-0.543970,-0.651665,1.427248,-0.943730,0.118603,-0.126591,0.797964,-0.376500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,0.237788,0.249251,0.205079,0.226829,0.257048,-0.221716,0.883531,-0.833549,-0.802703,-0.624953,...,-0.396740,0.026101,-0.151461,-0.654872,-0.246593,-0.516054,0.049474,-0.462792,-0.497444,-0.468190
536,0.236552,0.251141,0.204685,0.226526,0.257617,-0.222250,0.883553,-0.833500,-0.802697,-0.624959,...,-0.390921,-0.199728,-0.159716,-0.866475,-0.364984,-0.141663,0.056479,-0.465232,-0.450930,-0.450540
537,0.236757,0.250230,0.202810,0.226271,0.257890,-0.222110,0.883596,-0.833549,-0.802701,-0.624977,...,-0.374108,-0.283317,-0.152848,-0.901441,-0.211908,-0.352617,0.056505,-0.473746,-0.637578,-0.384176
538,0.238313,0.250138,0.203606,0.225739,0.256541,-0.221289,0.883735,-0.833573,-0.802723,-0.624983,...,-0.381419,-0.079953,-0.156594,-0.275598,-0.306975,-0.295424,0.053244,-0.457785,-0.540563,-0.436329
