In [1]:
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
def normalize_column(col):
    """Standardize ``col`` to zero mean and unit standard deviation.

    Parameters
    ----------
    col : pandas.Series or pandas.DataFrame
        Numeric data. ``mean`` and ``std`` are called with their pandas
        defaults.

    Returns
    -------
    Same type as ``col``
        ``(col - col.mean()) / col.std()``. A column whose standard
        deviation is exactly 0 (a constant column) is returned as all zeros
        instead of the NaNs that a 0/0 division would produce. A NaN standard
        deviation is left untouched, so the result stays NaN as before.
    """
    centered = col - col.mean()
    spread = col.std()
    if np.ndim(spread) == 0:
        # Series input: ``spread`` is a scalar.
        if spread == 0:
            spread = 1.0
    else:
        # DataFrame input: ``spread`` holds one value per column.
        spread = spread.replace(0, 1.0)
    return centered / spread

In [3]:
# Read the feature table and the two label tables; in each file the first
# CSV column is used as the DataFrame index.
raw_features, raw_labels_panel, raw_labels_ratebeer = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/beer_features.csv',
        '../data/beer_labels_panel.csv',
        '../data/beer_labels_ratebeer.csv',
    )
)

In [4]:
def _median_fill_and_standardize(raw_frame):
    """Return a cleaned copy of ``raw_frame``.

    Columns that are entirely NaN are dropped. In every column from the third
    onward, remaining NaNs are replaced by that column's median and the column
    is then passed through ``normalize_column``. The first two columns are
    left as they are.
    """
    frame = raw_frame.copy().dropna(axis=1, how='all')
    value_cols = frame.columns[2:]
    imputed = frame[value_cols].fillna(frame[value_cols].median())
    frame[value_cols] = imputed.apply(normalize_column, axis=0)
    return frame


filled_features = _median_fill_and_standardize(raw_features)
filled_labels_panel = _median_fill_and_standardize(raw_labels_panel)
filled_labels_ratebeer = _median_fill_and_standardize(raw_labels_ratebeer)

In [5]:
# Column used to stratify the 70/30 split.
CATEGORY_COL = 'tasting_category_fine'

# Number of rows per category that go into the 70% part, computed from the
# panel labels and keyed by category value.
# The -0.01 makes it an even split 175,75. It changes 2 types with 5 beers from 4,1 to 3,2.
category_sizes = Counter(filled_labels_panel[CATEGORY_COL].sort_values())
train_count_by_category = {
    category: round(size * 0.7 - 0.01) for category, size in category_sizes.items()
}
# Same counts as a plain list, in sorted-category order.
beer_type_splits = list(train_count_by_category.values())


def split_by_category(sorted_frame, train_counts, category_col=CATEGORY_COL):
    """Split ``sorted_frame`` into two parts, category by category.

    Parameters
    ----------
    sorted_frame : pandas.DataFrame
        Frame containing ``category_col``; rows are taken in the order given.
    train_counts : dict
        Maps each category value to the number of leading rows of that
        category that go into the first part.
    category_col : str
        Name of the category column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The first part holds the first ``train_counts[category]`` rows of each
        category, the second part holds the remaining rows. Categories appear
        in their order of first appearance in ``sorted_frame``.

    Raises
    ------
    ValueError
        If ``sorted_frame`` contains a category that has no entry in
        ``train_counts``. Counts are looked up by category value rather than
        by position so that a mismatch cannot silently shift the counts.
    """
    first_parts, second_parts = [], []
    for category in sorted_frame[category_col].unique():
        if category not in train_counts:
            raise ValueError(
                f"category {category!r} in column {category_col!r} has no split count"
            )
        n_first = train_counts[category]
        subset = sorted_frame[sorted_frame[category_col] == category]
        # iloc slicing never overlaps, even if n_first exceeds len(subset).
        first_parts.append(subset.iloc[:n_first])
        second_parts.append(subset.iloc[n_first:])

    if not first_parts:
        # Nothing to concatenate: return two empty frames with the same columns.
        empty = sorted_frame.iloc[0:0]
        return empty, empty.copy()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(first_parts), pd.concat(second_parts)


# -------------------- FEATURES --------------------
sorted_features = filled_features.sort_values(by=CATEGORY_COL)
features_70, features_30 = (
    part.drop(columns=['beer_id'])
    for part in split_by_category(sorted_features, train_count_by_category)
)


# -------------------- LABELS PANEL --------------------
sorted_labels_panel = filled_labels_panel.sort_values(by=CATEGORY_COL)
labels_panel_70, labels_panel_30 = (
    part.drop(columns=['beer_id'])
    for part in split_by_category(sorted_labels_panel, train_count_by_category)
)


# -------------------- LABELS RATEBEER --------------------
# NOTE(review): the three frames are split independently; rows only line up
# across them if all three files list the beers in the same order - confirm.
sorted_labels_ratebeer = filled_labels_ratebeer.sort_values(by=CATEGORY_COL)
labels_ratebeer_70, labels_ratebeer_30 = (
    part.drop(columns=['beer_id'])
    for part in split_by_category(sorted_labels_ratebeer, train_count_by_category)
)


In [6]:
# Write every split to its CSV file, keeping the index as the first column.
for out_path, split_frame in (
    ('../data/beer_features_train.csv', features_70),
    ('../data/beer_features_test.csv', features_30),
    ('../data/beer_labels_panel_train.csv', labels_panel_70),
    ('../data/beer_labels_panel_test.csv', labels_panel_30),
    ('../data/beer_labels_ratebeer_train.csv', labels_ratebeer_70),
    ('../data/beer_labels_ratebeer_test.csv', labels_ratebeer_30),
):
    split_frame.to_csv(out_path, index=True)

In [43]:
SAMPLES_PER_ROW = 10  # synthetic rows drawn for every original row
SCALE_DIVISOR = 20    # noise scale of a column = column variance / SCALE_DIVISOR
SAMPLING_SEED = 0     # fixed seed so that a re-run produces the same samples


def sample_around_rows(df, rng, n_samples=SAMPLES_PER_ROW, scale_divisor=SCALE_DIVISOR):
    """Draw ``n_samples`` noisy copies of every row of ``df``.

    Each value is drawn from a normal distribution centred on the original
    value, with scale ``df.var()[column] / scale_divisor``.

    NOTE(review): the scale is derived from the column *variance*, not from a
    standard deviation or standard error - confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to sample around.
    rng : numpy.random.Generator
        Source of randomness.
    n_samples : int
        Number of samples generated per row of ``df``.
    scale_divisor : float
        Divisor applied to the per-column variance.

    Returns
    -------
    pandas.DataFrame
        ``len(df) * n_samples`` rows with the columns of ``df`` and a fresh
        0..N-1 index; the samples of one original row are consecutive.

    Raises
    ------
    ValueError
        If ``df.var()`` does not return exactly one value per column of ``df``
        in column order, which would otherwise pair columns with the wrong
        variance.
    """
    spread = df.var()
    if not spread.index.equals(df.columns):
        raise ValueError("df.var() did not return one value per column of df")

    n_rows, n_cols = df.shape
    centers = df.to_numpy(dtype=float)                     # (rows, cols)
    scale = spread.to_numpy(dtype=float) / scale_divisor   # (cols,)

    # One vectorized draw instead of concatenating a small frame per row.
    draws = rng.normal(
        loc=centers[:, None, :],
        scale=scale,
        size=(n_rows, n_samples, n_cols),
    )
    return pd.DataFrame(draws.reshape(n_rows * n_samples, n_cols), columns=df.columns)


sampling_rng = np.random.default_rng(SAMPLING_SEED)
sample_dfs = [
    sample_around_rows(df, sampling_rng)
    for df in [features_70, labels_panel_70, labels_ratebeer_70]
]

In [44]:
# Give each sampled frame its own name; the order matches the list the
# samples were generated from.
(
    features_70_samples,
    labels_panel_70_samples,
    labels_ratebeer_70_samples,
) = sample_dfs

In [45]:
# Show the sampled training features as the cell output.
features_70_samples

Unnamed: 0,acetaldehyde,CS2,DES,DMS,ethyl_2.methyl_butyrate,ethyl_acetate,ethyl_decanoate,ethyl_hexanoate,ethyl_isovalerate,ethyl_octanoate,...,X13.epi.manool.oxide,isopropyl.hexadecanoate,manool,ethyl.octadecanoate,ethyl.pentanoate,X2.ethyl.3.methylpyrazine,esters_sum,aroma_hops_sum,acids_sum,sulfur_sum
0,0.768929,1.931457,1.131412,-0.497964,0.929873,0.388778,0.211625,0.305334,-1.311983,-0.074176,...,0.560269,0.993577,0.323634,0.017356,0.047945,-0.305189,1.040000,-0.742558,-0.510447,0.225670
1,0.638367,1.922102,1.103681,-0.485864,0.913474,0.362968,0.121518,0.349132,-1.402993,0.184704,...,0.582851,1.017333,0.336855,0.049047,0.075347,-0.261698,1.095008,-0.804629,-0.645513,0.070971
2,0.662386,1.908234,1.230249,-0.384357,0.841182,0.337419,0.061780,0.379078,-1.425558,0.034610,...,0.546245,0.922966,0.371929,0.137122,0.015548,-0.327649,1.059013,-0.774475,-0.632254,0.208730
3,0.747062,1.840599,1.124726,-0.512308,0.863280,0.300948,0.101315,0.316130,-1.307097,0.140153,...,0.634933,1.016906,0.331357,0.089431,0.090669,-0.250808,1.049735,-0.802919,-0.569503,0.186875
4,0.785507,1.835175,1.151549,-0.461324,0.844923,0.269458,0.172163,0.426228,-1.353882,0.082114,...,0.516568,0.999798,0.332665,0.231799,0.112504,-0.268635,1.067480,-0.837687,-0.620044,0.090461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1745,-0.614821,-0.368275,0.073538,-0.368349,-1.198149,-0.097451,0.779655,-0.951215,-1.451398,0.789561,...,-2.280743,-0.868391,0.224627,0.290107,0.047739,-0.333113,0.496220,1.893228,-0.408027,-1.632254
1746,-0.569969,-0.539356,0.001792,-0.354284,-1.257480,-0.055960,0.868307,-1.013124,-1.434832,0.738092,...,-2.215127,-0.772107,0.285029,0.207165,-0.076895,-0.175145,0.361220,2.063946,-0.430726,-1.638068
1747,-0.587853,-0.317465,0.107559,-0.298089,-1.235906,-0.059947,0.805780,-0.950692,-1.368663,0.733423,...,-2.166840,-0.978973,0.206717,0.168730,-0.025376,-0.258496,0.555025,1.980340,-0.353029,-1.705623
1748,-0.619824,-0.424737,0.082030,-0.345706,-1.160491,-0.070447,0.803483,-0.974554,-1.423364,0.672761,...,-2.199425,-0.887272,0.265224,0.181043,-0.023954,-0.265178,0.445077,1.939202,-0.375733,-1.589875


In [42]:
# Write the three sampled training frames to CSV, keeping the index column.
for out_path, sampled_frame in (
    ('../data/beer_features_train_samples_small.csv', features_70_samples),
    ('../data/beer_labels_panel_train_samples_small.csv', labels_panel_70_samples),
    ('../data/beer_labels_ratebeer_train_samples_small.csv', labels_ratebeer_70_samples),
):
    sampled_frame.to_csv(out_path, index=True)