In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the two input tables. The per-column mean table and the matching
# standard-error table are read from sibling CSV files under ../data.
mean_csv_path = '../data/Strawberry_mean.csv'
stderr_csv_path = '../data/Strawberry_stderr.csv'

raw_mean = pd.read_csv(mean_csv_path)
raw_stderr = pd.read_csv(stderr_csv_path)

In [3]:
# Columns removed from both tables: five columns dropped by name, plus every
# column that contains at least one missing value in the standard-error table.
named_cols = ['CULTIVAR', 'SEASON', 'HARVEST WEEK', 'HARVEST DATE', 'MEASURE']
stderr_incomplete_cols = raw_stderr.columns[raw_stderr.isna().any()]
drop_cols = [*named_cols, *stderr_incomplete_cols]


def _to_imputed_floats(raw):
    """Return `raw` without `drop_cols`, as floats, with gaps filled by column means.

    '-' placeholders are converted to NaN before the float cast; every NaN is
    then replaced by the mean of its own column.
    """
    numeric = raw.drop(columns=drop_cols).replace('-', np.nan).astype(float)
    return numeric.fillna(numeric.mean())


# The same cleaning is applied to the mean table and the standard-error table.
df_mean = _to_imputed_floats(raw_mean)
df_stderr = _to_imputed_floats(raw_stderr)

drop_cols

['CULTIVAR',
 'SEASON',
 'HARVEST WEEK',
 'HARVEST DATE',
 'MEASURE',
 'L* ext',
 'A* ext',
 'B* ext',
 'FORCE',
 'L* int',
 'A* int',
 'B* int',
 'TOTAL SUGAR',
 'TOTAL VOLATILES']

In [4]:
def normalize_column(col):
    """Standardize a column to zero mean and unit standard deviation.

    Parameters
    ----------
    col : pandas.Series (or any object exposing ``mean()`` and ``std()``)
        The values to standardize.

    Returns
    -------
    Same type as ``col``
        ``(col - col.mean()) / col.std()``. When the standard deviation is
        exactly zero (a constant column) the centered values, i.e. all zeros,
        are returned instead of dividing 0 by 0 and filling the column with NaN.
    """
    centered = col - col.mean()
    spread = col.std()
    # A constant column has zero spread; dividing would turn every entry into
    # NaN, which would then be written silently into the exported datasets.
    if np.ndim(spread) == 0 and spread == 0:
        return centered
    return centered / spread

In [7]:
# Build the validation dataset from the cleaned mean table.
val_dataset = df_mean.copy()

# The first five columns are divided by 100; every remaining column is
# standardized with normalize_column.
# NOTE(review): the /100 presumably maps 0-100 scores onto 0-1 - confirm.
leading_cols = val_dataset.columns[:5]
remaining_cols = val_dataset.columns[5:]
val_dataset[leading_cols] = val_dataset[leading_cols] / 100
val_dataset[remaining_cols] = val_dataset[remaining_cols].apply(normalize_column, axis=0)

val_dataset.to_csv('../data/strawberry_val_dataset.csv')

# Only the last expression of a cell is rendered, so the preview has to come
# after the write (previously it sat before to_csv and was never shown).
val_dataset

In [14]:
# Expand every row of df_mean into SAMPLES_PER_ROW synthetic rows by drawing
# each column independently from a normal distribution centred on that row's
# mean. Rows stay grouped: the first SAMPLES_PER_ROW output rows come from
# input row 0, the next SAMPLES_PER_ROW from input row 1, and so on.
SAMPLES_PER_ROW = 1000
SAMPLE_SEED = 0  # fixed so that Restart & Run All regenerates the same samples

rng = np.random.default_rng(SAMPLE_SEED)

# Means and standard errors are matched by position, so the two tables must
# have the same layout.
assert df_mean.shape == df_stderr.shape, "df_mean and df_stderr must have the same shape"

means = df_mean.to_numpy(dtype=float)
stderrs = df_stderr.to_numpy(dtype=float)
n_rows, n_cols = means.shape

# The spread of each draw is stderr / mean; cells whose mean is exactly 0 get
# a spread of 0 instead of a division by zero.
# NOTE(review): this is the relative standard error, not the standard error
# itself - confirm that this scaling is intended. A negative mean would give a
# negative scale, which the generator rejects with ValueError.
scales = np.divide(stderrs, means, out=np.zeros_like(means), where=means != 0)

# One vectorised draw of shape (rows, samples, columns) replaces growing a
# DataFrame with pd.concat inside a loop, which re-copied all earlier rows on
# every iteration.
draws = rng.normal(
    loc=means[:, None, :],
    scale=scales[:, None, :],
    size=(n_rows, SAMPLES_PER_ROW, n_cols),
)

df_samples = pd.DataFrame(
    draws.reshape(n_rows * SAMPLES_PER_ROW, n_cols),
    columns=df_mean.columns,
)

df_samples

Unnamed: 0,OVERALL LIKING,TEXTURE LIKING,SWEETNESS INTENSITY,SOURNESS INTENSITY,STRAWBERRY FLAVOR INTENSITY,6915-15-7,77-92-9,50-99-7,57-48-7,57-50-1,...,7786-58-5,15111-96-3,706-14-9,10522-34-6,5881-17-4,128-37-0,40716-66-3,4887-30-3,5454-09-1,2305-05-7
0,30.706841,25.017388,27.664715,14.621355,30.502116,103.994864,765.998883,1878.998375,1998.003984,1313.975248,...,4.583012,0.691887,0.774607,0.622982,9.431509,0.661870,97.302185,12.837743,8.662852,4.350883
1,30.785940,24.902342,27.610118,14.715056,30.636353,104.017534,766.014013,1879.004307,1997.994587,1313.978341,...,4.438700,0.820774,0.670431,0.508070,9.573480,0.853910,96.778999,12.742840,8.636586,4.222639
2,30.634788,24.875553,27.711495,14.656818,30.671894,104.013164,766.014083,1878.996935,1997.998374,1314.011388,...,4.471196,0.931250,0.718012,0.457882,9.415576,0.470444,96.953488,12.799541,8.753754,4.333602
3,30.769405,24.822714,27.760656,14.713455,30.527598,104.013444,766.009089,1878.992117,1997.999921,1313.969747,...,4.525824,0.846595,0.679025,0.513976,9.565830,0.612552,96.887125,12.848172,8.700509,4.200562
4,30.805519,24.907046,27.663845,14.569192,30.570713,103.981753,765.985780,1878.988943,1998.003321,1314.004936,...,4.541342,0.967256,0.695423,0.485329,10.080005,0.823352,97.087568,12.847352,8.680841,4.151972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53995,23.739517,24.978497,20.502480,22.508135,25.880045,200.988173,870.007859,1281.994009,1463.000927,712.010012,...,1.866698,0.830318,32.753985,1.022261,6.288580,2.636506,90.199740,2.346651,0.965766,3.746951
53996,24.006847,25.025613,20.447655,22.680357,25.751884,201.017069,869.987040,1282.007894,1463.000428,712.018677,...,1.122304,0.884869,32.014451,0.526411,5.549939,2.985536,89.850875,1.454067,0.564214,3.632213
53997,23.843973,24.985128,20.457387,22.421083,25.763199,201.005283,870.030305,1281.997851,1462.993738,712.009822,...,1.254102,0.842146,31.653197,0.536283,5.732866,3.572165,90.195726,1.778802,0.842333,3.527356
53998,23.895924,25.037618,20.249530,22.520246,25.808097,200.977240,870.007251,1282.001568,1463.002936,711.988540,...,1.742455,0.946150,31.849029,0.445537,5.600762,2.405357,89.956403,2.582248,0.690528,3.603569


In [15]:
# Build the exported sample dataset from the generated samples.
sample_dataset = df_samples.copy()

# The first five columns are divided by 100; every remaining column is
# standardized with normalize_column.
# NOTE(review): the standardization uses the statistics of the samples
# themselves, not those of the validation dataset - confirm this is intended.
leading_cols = sample_dataset.columns[:5]
remaining_cols = sample_dataset.columns[5:]
sample_dataset[leading_cols] = sample_dataset[leading_cols] / 100
sample_dataset[remaining_cols] = sample_dataset[remaining_cols].apply(normalize_column, axis=0)

sample_dataset.to_csv('../data/strawberry_samples_big.csv')
sample_dataset

Unnamed: 0,OVERALL LIKING,TEXTURE LIKING,SWEETNESS INTENSITY,SOURNESS INTENSITY,STRAWBERRY FLAVOR INTENSITY,6915-15-7,77-92-9,50-99-7,57-48-7,57-50-1,...,7786-58-5,15111-96-3,706-14-9,10522-34-6,5881-17-4,128-37-0,40716-66-3,4887-30-3,5454-09-1,2305-05-7
0,0.307068,0.250174,0.276647,0.146214,0.305021,-2.120421,0.171179,0.759180,0.612070,0.314374,...,-0.271610,-0.443110,-0.544167,-0.463164,1.289539,-0.945803,0.122098,-0.127048,0.804970,-0.354773
1,0.307859,0.249023,0.276101,0.147151,0.306364,-2.119978,0.171282,0.759195,0.612046,0.314378,...,-0.276743,-0.334642,-0.545464,-0.568225,1.346308,-0.893509,0.117186,-0.129998,0.800679,-0.374048
2,0.306348,0.248756,0.277115,0.146568,0.306719,-2.120063,0.171283,0.759176,0.612056,0.314430,...,-0.275587,-0.241669,-0.544872,-0.614111,1.283168,-0.997929,0.118824,-0.128236,0.819822,-0.357370
3,0.307694,0.248227,0.277607,0.147135,0.305276,-2.120058,0.171249,0.759163,0.612060,0.314365,...,-0.273644,-0.312912,-0.545357,-0.562826,1.343249,-0.959232,0.118201,-0.126724,0.811123,-0.377366
4,0.308055,0.249070,0.276638,0.145692,0.305707,-2.120678,0.171089,0.759154,0.612069,0.314420,...,-0.273092,-0.211367,-0.545153,-0.589017,1.548846,-0.901830,0.120083,-0.126749,0.807910,-0.384669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53995,0.237395,0.249785,0.205025,0.225081,0.258800,-0.222374,0.884429,-0.834332,-0.803436,-0.625512,...,-0.368228,-0.326611,-0.146165,-0.098112,0.032811,-0.408101,0.055410,-0.453151,-0.452553,-0.445544
53996,0.240068,0.250256,0.204477,0.226804,0.257519,-0.221809,0.884286,-0.834295,-0.803437,-0.625499,...,-0.394705,-0.280702,-0.155369,-0.551456,-0.262542,-0.313059,0.052135,-0.480896,-0.518157,-0.462789
53997,0.238440,0.249851,0.204574,0.224211,0.257632,-0.222039,0.884583,-0.834322,-0.803455,-0.625512,...,-0.390017,-0.316657,-0.159865,-0.542431,-0.189396,-0.153318,0.055373,-0.470802,-0.472719,-0.478549
53998,0.238959,0.250376,0.202495,0.225202,0.258081,-0.222588,0.884425,-0.834312,-0.803431,-0.625546,...,-0.372647,-0.229130,-0.157428,-0.625398,-0.242219,-0.471044,0.053125,-0.445828,-0.497520,-0.467094
