In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
def normalize_column(col):
    return (col - col.mean()) / col.std()

In [3]:
raw_features = pd.read_csv('../data/beer_features.csv', index_col=0)
raw_labels_panel = pd.read_csv('../data/beer_labels_panel.csv', index_col=0)
raw_labels_ratebeer = pd.read_csv('../data/beer_labels_ratebeer.csv', index_col=0)

In [44]:
filled_features = raw_features.copy().dropna(axis=1, how='all')
filled_features[filled_features.columns[2:]] = filled_features[filled_features.columns[2:]].fillna(filled_features[filled_features.columns[2:]].median()).apply(normalize_column, axis=0)
filled_labels_panel = raw_labels_panel.copy().dropna(axis=1, how='all')
filled_labels_panel[filled_labels_panel.columns[2:]] = filled_labels_panel[filled_labels_panel.columns[2:]].fillna(filled_labels_panel[filled_labels_panel.columns[2:]].median()).apply(normalize_column, axis=0)
filled_labels_ratebeer = raw_labels_ratebeer.copy().dropna(axis=1, how='all')
filled_labels_ratebeer[filled_labels_ratebeer.columns[2:]] = filled_labels_ratebeer[filled_labels_ratebeer.columns[2:]].fillna(filled_labels_ratebeer[filled_labels_ratebeer.columns[2:]].median()).apply(normalize_column, axis=0)

In [45]:
# The -0.01 makes it an even split 175,75. It changes 2 types with 5 beers from 4,1 to 3,2.
beer_type_splits = [round(i * 0.7 - 0.01) for i in list(Counter(list(filled_labels_panel['tasting_category_fine'].sort_values())).values())]

# -------------------- FEATURES --------------------
# Sort the DataFrame by 'tasting_category_fine'
sorted_features = filled_features.sort_values(by='tasting_category_fine')

# Initialize empty DataFrames for the splits
features_70 = pd.DataFrame(columns=filled_features.columns)
features_30 = pd.DataFrame(columns=filled_features.columns)

# Iterate over unique values in 'tasting_category_fine' and split them according to beer_type_splits
for category, count in zip(sorted_features['tasting_category_fine'].unique(), beer_type_splits):
    category_subset = sorted_features[sorted_features['tasting_category_fine'] == category]
    features_70 = pd.concat([features_70, category_subset.head(count)])
    features_30 = pd.concat([features_30, category_subset.tail(len(category_subset) - count)])

features_70 = features_70.drop(columns=['beer_id', 'tasting_category_fine'])
features_30 = features_30.drop(columns=['beer_id', 'tasting_category_fine'])


# -------------------- LABELS PANEL --------------------
# Sort the DataFrame by 'tasting_category_fine'
sorted_labels_panel = filled_labels_panel.sort_values(by='tasting_category_fine')

# Initialize empty DataFrames for the splits
labels_panel_70 = pd.DataFrame(columns=filled_labels_panel.columns)
labels_panel_30 = pd.DataFrame(columns=filled_labels_panel.columns)

# Iterate over unique values in 'tasting_category_fine' and split them according to beer_type_splits
for category, count in zip(sorted_labels_panel['tasting_category_fine'].unique(), beer_type_splits):
    category_subset = sorted_labels_panel[sorted_labels_panel['tasting_category_fine'] == category]
    labels_panel_70 = pd.concat([labels_panel_70, category_subset.head(count)])
    labels_panel_30 = pd.concat([labels_panel_30, category_subset.tail(len(category_subset) - count)])

labels_panel_70 = labels_panel_70.drop(columns=['beer_id', 'tasting_category_fine'])
labels_panel_30 = labels_panel_30.drop(columns=['beer_id', 'tasting_category_fine'])


# -------------------- LABELS RATEBEER --------------------
# Sort the DataFrame by 'tasting_category_fine'
sorted_labels_ratebeer = filled_labels_ratebeer.sort_values(by='tasting_category_fine')

# Initialize empty DataFrames for the splits
labels_ratebeer_70 = pd.DataFrame(columns=filled_labels_ratebeer.columns)
labels_ratebeer_30 = pd.DataFrame(columns=filled_labels_ratebeer.columns)

# Iterate over unique values in 'tasting_category_fine' and split them according to beer_type_splits
for category, count in zip(sorted_labels_ratebeer['tasting_category_fine'].unique(), beer_type_splits):
    category_subset = sorted_labels_ratebeer[sorted_labels_ratebeer['tasting_category_fine'] == category]
    labels_ratebeer_70 = pd.concat([labels_ratebeer_70, category_subset.head(count)])
    labels_ratebeer_30 = pd.concat([labels_ratebeer_30, category_subset.tail(len(category_subset) - count)])

labels_ratebeer_70 = labels_ratebeer_70.drop(columns=['beer_id', 'tasting_category_fine'])
labels_ratebeer_30 = labels_ratebeer_30.drop(columns=['beer_id', 'tasting_category_fine'])

  features_70 = pd.concat([features_70, category_subset.head(count)])
  features_30 = pd.concat([features_30, category_subset.tail(len(category_subset) - count)])
  labels_panel_70 = pd.concat([labels_panel_70, category_subset.head(count)])
  labels_panel_30 = pd.concat([labels_panel_30, category_subset.tail(len(category_subset) - count)])
  labels_ratebeer_70 = pd.concat([labels_ratebeer_70, category_subset.head(count)])
  labels_ratebeer_30 = pd.concat([labels_ratebeer_30, category_subset.tail(len(category_subset) - count)])


In [54]:
features_70.to_csv('../data/beer_features_train.csv', index=True)
features_30.to_csv('../data/beer_features_test.csv', index=True)
labels_panel_70.to_csv('../data/beer_labels_panel_train.csv', index=True)
labels_panel_30.to_csv('../data/beer_labels_panel_test.csv', index=True)
labels_ratebeer_70.to_csv('../data/beer_labels_ratebeer_train.csv', index=True)
labels_ratebeer_30.to_csv('../data/beer_labels_ratebeer_test.csv', index=True)

In [107]:
sample_dfs = []
for df in [features_70, labels_panel_70, labels_ratebeer_70]:
    df_samples = pd.DataFrame()
    stderr_row = df.var()

    # For each row in the original DataFrames
    for idx in range(len(df)):
        # Get mean and standard error for the current row
        mean_row = df.iloc[idx]

        # Generate 10 samples from the normal distribution with the mean and standard error
        samples = pd.DataFrame({column: np.random.normal(mean, stderr/3, 1000) for column, mean, stderr in zip(df.columns, mean_row, stderr_row)})

        # Append the samples to the new DataFrame
        df_samples = pd.concat([df_samples, samples], ignore_index=True)

    sample_dfs.append(df_samples)

In [108]:
features_70_samples, labels_panel_70_samples, labels_ratebeer_70_samples = sample_dfs

In [113]:
features_70_samples

Unnamed: 0,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,ethyl_hexanoate,ethyl_isovalerate,ethyl_octanoate,...,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,esters_sum,aroma_hops_sum,acids_sum,sulfur_sum
0,0.137263,2.111758,1.356607,-0.679158,0.693241,0.277718,-0.265629,0.677588,-1.107079,0.276782,...,1.080189,0.988907,-0.246249,-0.092916,0.166456,-1.129896,0.842379,-0.673961,-0.369134,0.284455
1,0.796843,2.141665,1.489927,0.236932,1.012853,0.699626,0.558686,0.314887,-1.078908,-0.224578,...,1.155615,0.790090,0.042920,0.446052,0.211737,-0.513843,0.890721,-0.322252,-1.070950,-0.140082
2,0.732025,1.824798,1.146844,0.246112,0.987048,0.760973,-0.077347,0.239452,-1.690654,-0.027911,...,0.936723,0.669805,0.472933,0.108043,-0.240461,-0.584322,0.938582,-0.939591,-0.105423,0.107900
3,0.182828,2.025728,1.825546,-0.299691,1.024168,-0.030109,0.355507,-0.351859,-1.032741,0.186438,...,0.984349,1.086476,0.311443,0.129020,0.061212,0.242111,1.310760,-1.378176,-0.483464,-0.190876
4,0.785869,1.764742,1.305147,-0.940589,1.199088,0.551711,0.496318,0.309091,-0.924309,0.302533,...,0.314060,1.260160,0.994967,0.191480,0.163638,0.222171,1.020012,-1.051831,-0.525196,0.528762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174995,-0.601381,-0.162888,-0.359789,-0.623490,-1.393885,0.275840,1.091248,-0.340363,-0.842491,0.591973,...,-1.398199,-0.742262,0.079675,-0.213720,-0.608536,-0.082804,0.579684,2.116531,-0.140879,-1.637406
174996,-0.617003,-0.670610,-0.252541,-0.343415,-0.617667,-0.520179,0.940580,-0.920266,-1.659446,0.456143,...,-2.154219,-0.972296,0.367741,0.387139,0.490763,-0.387586,0.295218,1.958585,-0.652866,-1.723617
174997,-0.792362,-1.066990,0.013224,0.065044,-1.418899,0.216356,0.750692,-1.054458,-1.076792,0.614780,...,-2.364199,-0.698755,0.012654,-0.335428,-0.417211,-0.088326,-0.141486,2.096593,-0.536467,-2.365872
174998,-0.284062,-0.309231,-0.116113,-0.454248,-1.390896,0.262868,0.717028,-0.662023,-0.784148,0.509412,...,-2.494558,-1.107528,-0.005152,-0.296488,-0.261110,-0.198422,0.771307,2.029813,-0.266866,-1.785486


In [114]:
features_70_samples.to_csv('../data/beer_features_train_samples.csv', index=True)
labels_panel_70_samples.to_csv('../data/beer_labels_panel_train_samples.csv', index=True)
labels_ratebeer_70_samples.to_csv('../data/beer_labels_ratebeer_train_samples.csv', index=True)