In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
def normalize_column(col):
    return (col - col.mean()) / col.std()

In [3]:
raw_features = pd.read_csv('../data/beer_features.csv', index_col=0)
raw_labels_panel = pd.read_csv('../data/beer_labels_panel.csv', index_col=0)
raw_labels_ratebeer = pd.read_csv('../data/beer_labels_ratebeer.csv', index_col=0)

In [4]:
filled_features = raw_features.copy().dropna(axis=1, how='all')
filled_features[filled_features.columns[2:]] = filled_features[filled_features.columns[2:]].fillna(filled_features[filled_features.columns[2:]].median()).apply(normalize_column, axis=0)
filled_labels_panel = raw_labels_panel.copy().dropna(axis=1, how='all')
filled_labels_panel[filled_labels_panel.columns[2:]] = filled_labels_panel[filled_labels_panel.columns[2:]].fillna(filled_labels_panel[filled_labels_panel.columns[2:]].median()).apply(normalize_column, axis=0)
filled_labels_ratebeer = raw_labels_ratebeer.copy().dropna(axis=1, how='all')
filled_labels_ratebeer[filled_labels_ratebeer.columns[2:]] = filled_labels_ratebeer[filled_labels_ratebeer.columns[2:]].fillna(filled_labels_ratebeer[filled_labels_ratebeer.columns[2:]].median()).apply(normalize_column, axis=0)

In [5]:
# The -0.01 makes it an even split 175,75. It changes 2 types with 5 beers from 4,1 to 3,2.
beer_type_splits = [round(i * 0.7 - 0.01) for i in list(Counter(list(filled_labels_panel['tasting_category_fine'].sort_values())).values())]

# -------------------- FEATURES --------------------
# Sort the DataFrame by 'tasting_category_fine'
sorted_features = filled_features.sort_values(by='tasting_category_fine')

# Initialize empty DataFrames for the splits
features_70 = pd.DataFrame(columns=filled_features.columns)
features_30 = pd.DataFrame(columns=filled_features.columns)

# Iterate over unique values in 'tasting_category_fine' and split them according to beer_type_splits
for category, count in zip(sorted_features['tasting_category_fine'].unique(), beer_type_splits):
    category_subset = sorted_features[sorted_features['tasting_category_fine'] == category]
    features_70 = pd.concat([features_70, category_subset.head(count)])
    features_30 = pd.concat([features_30, category_subset.tail(len(category_subset) - count)])

features_70 = features_70.drop(columns=['beer_id', 'tasting_category_fine'])
features_30 = features_30.drop(columns=['beer_id', 'tasting_category_fine'])


# -------------------- LABELS PANEL --------------------
# Sort the DataFrame by 'tasting_category_fine'
sorted_labels_panel = filled_labels_panel.sort_values(by='tasting_category_fine')

# Initialize empty DataFrames for the splits
labels_panel_70 = pd.DataFrame(columns=filled_labels_panel.columns)
labels_panel_30 = pd.DataFrame(columns=filled_labels_panel.columns)

# Iterate over unique values in 'tasting_category_fine' and split them according to beer_type_splits
for category, count in zip(sorted_labels_panel['tasting_category_fine'].unique(), beer_type_splits):
    category_subset = sorted_labels_panel[sorted_labels_panel['tasting_category_fine'] == category]
    labels_panel_70 = pd.concat([labels_panel_70, category_subset.head(count)])
    labels_panel_30 = pd.concat([labels_panel_30, category_subset.tail(len(category_subset) - count)])

labels_panel_70 = labels_panel_70.drop(columns=['beer_id', 'tasting_category_fine'])
labels_panel_30 = labels_panel_30.drop(columns=['beer_id', 'tasting_category_fine'])


# -------------------- LABELS RATEBEER --------------------
# Sort the DataFrame by 'tasting_category_fine'
sorted_labels_ratebeer = filled_labels_ratebeer.sort_values(by='tasting_category_fine')

# Initialize empty DataFrames for the splits
labels_ratebeer_70 = pd.DataFrame(columns=filled_labels_ratebeer.columns)
labels_ratebeer_30 = pd.DataFrame(columns=filled_labels_ratebeer.columns)

# Iterate over unique values in 'tasting_category_fine' and split them according to beer_type_splits
for category, count in zip(sorted_labels_ratebeer['tasting_category_fine'].unique(), beer_type_splits):
    category_subset = sorted_labels_ratebeer[sorted_labels_ratebeer['tasting_category_fine'] == category]
    labels_ratebeer_70 = pd.concat([labels_ratebeer_70, category_subset.head(count)])
    labels_ratebeer_30 = pd.concat([labels_ratebeer_30, category_subset.tail(len(category_subset) - count)])

labels_ratebeer_70 = labels_ratebeer_70.drop(columns=['beer_id', 'tasting_category_fine'])
labels_ratebeer_30 = labels_ratebeer_30.drop(columns=['beer_id', 'tasting_category_fine'])

  features_70 = pd.concat([features_70, category_subset.head(count)])
  features_30 = pd.concat([features_30, category_subset.tail(len(category_subset) - count)])
  labels_panel_70 = pd.concat([labels_panel_70, category_subset.head(count)])
  labels_panel_30 = pd.concat([labels_panel_30, category_subset.tail(len(category_subset) - count)])
  labels_ratebeer_70 = pd.concat([labels_ratebeer_70, category_subset.head(count)])
  labels_ratebeer_30 = pd.concat([labels_ratebeer_30, category_subset.tail(len(category_subset) - count)])


In [6]:
# np.random.seed(40)
# rand_ordering = np.random.random(175).argsort()

# features_70 = features_70.iloc[rand_ordering]
# labels_panel_70 = labels_panel_70.iloc[rand_ordering]
# labels_ratebeer_70 = labels_ratebeer_70.iloc[rand_ordering]

In [7]:
features_70.to_csv('../data/beer_features_train.csv', index=True)
features_30.to_csv('../data/beer_features_test.csv', index=True)
labels_panel_70.to_csv('../data/beer_labels_panel_train.csv', index=True)
labels_panel_30.to_csv('../data/beer_labels_panel_test.csv', index=True)
labels_ratebeer_70.to_csv('../data/beer_labels_ratebeer_train.csv', index=True)
labels_ratebeer_30.to_csv('../data/beer_labels_ratebeer_test.csv', index=True)

In [8]:
sample_dfs = []
for df in [features_70, labels_panel_70, labels_ratebeer_70]:
    df_samples = pd.DataFrame()
    stderr_row = df.var()

    # For each row in the original DataFrames
    for idx in range(len(df)):
        # Get mean and standard error for the current row
        mean_row = df.iloc[idx]

        # Generate 10 samples from the normal distribution with the mean and standard error
        samples = pd.DataFrame({column: np.random.normal(mean, stderr/3, 100) for column, mean, stderr in zip(df.columns, mean_row, stderr_row)})

        # Append the samples to the new DataFrame
        df_samples = pd.concat([df_samples, samples], ignore_index=True)

    sample_dfs.append(df_samples)

In [9]:
features_70_samples, labels_panel_70_samples, labels_ratebeer_70_samples = sample_dfs

In [10]:
features_70_samples

Unnamed: 0,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,ethyl_hexanoate,ethyl_isovalerate,ethyl_octanoate,...,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,esters_sum,aroma_hops_sum,acids_sum,sulfur_sum
0,0.723917,2.157005,0.865140,-0.705520,1.217546,0.836088,0.432043,0.387361,-1.272884,-0.129583,...,0.650134,1.046618,0.629327,0.359717,-0.138764,-0.310415,0.924510,-0.258325,-0.673084,0.444449
1,1.040447,1.937354,1.253434,-1.111823,1.495353,0.054823,-0.133475,-0.153616,-1.366454,0.429864,...,0.716743,0.954631,0.375570,-0.458813,0.161928,-0.074342,0.917160,-0.212482,-0.158204,0.353476
2,0.877453,1.388782,1.409721,-1.121911,0.379676,0.378245,0.206465,0.705243,-1.224911,-0.266834,...,0.162717,1.159595,0.367255,0.329993,0.019964,-0.001901,0.939339,-0.174072,-0.751579,0.070969
3,0.854315,1.865319,0.571484,-0.719855,1.158614,0.667239,-0.077326,0.039295,-1.003314,-0.020170,...,0.899410,0.814260,0.363183,-0.541350,0.455797,-0.369960,1.160774,-0.503639,-0.190795,0.494800
4,0.715892,2.028350,1.508485,-0.823090,1.359465,0.396890,-0.198318,0.760185,-0.877198,0.363673,...,1.110721,0.787628,0.721809,0.431743,0.183550,-0.511779,0.976300,-0.896286,-0.380800,0.438547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17495,-0.540395,0.180719,-0.149725,-0.251298,-1.242885,-0.006258,0.775127,-1.488608,-1.290088,1.245840,...,-2.687540,-1.008811,0.294563,0.183632,0.029572,-0.310452,0.705048,1.804316,-0.602089,-1.453156
17496,-1.389693,-0.560067,-0.433854,-0.820856,-1.312697,-0.623480,0.664781,-0.865653,-1.651682,0.378200,...,-2.882139,-1.032309,0.223312,0.441727,-0.264994,-1.031888,-0.125924,1.759859,-0.756542,-1.659356
17497,-0.412328,-0.896722,0.197726,-0.193548,-0.952195,-0.025136,0.896882,-1.323347,-1.907096,0.911730,...,-2.509603,-0.360106,0.741378,0.151158,-0.392297,0.542939,0.067099,1.225630,0.041642,-1.731997
17498,-0.387334,0.044066,-0.494053,-1.162118,-0.963378,0.054127,0.388616,-0.865618,-1.539283,1.083812,...,-1.492464,-0.758092,0.426161,0.584430,0.219468,0.635910,0.431110,2.090007,-0.759867,-1.820645


In [11]:
features_70_samples.to_csv('../data/beer_features_train_samples.csv', index=True)
labels_panel_70_samples.to_csv('../data/beer_labels_panel_train_samples.csv', index=True)
labels_ratebeer_70_samples.to_csv('../data/beer_labels_ratebeer_train_samples.csv', index=True)