In [1]:
# Machine Learning toolkit
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

# Python SQL toolkit and Object Relational Mapper
import sqlite3

In [2]:
con = sqlite3.connect("./beer_data.sqlite")
c = con.cursor()

In [3]:
beers_df = pd.read_sql_query("SELECT * FROM taste_profiles", con)
reviews_df = pd.read_sql_query("SELECT * FROM reviews", con)

In [4]:
# Insert Description column
reviews_df.insert(3, 'Description', pd.read_sql_query("SELECT Description FROM beer_labels", con).values)

# Format Description column
remove_notes = [val.replace('Notes:', '') for val in reviews_df['Description'].values]
reviews_df['Description'] = remove_notes

reviews_df['Description'] = reviews_df['Description'].map(
    lambda desc: 'No description available.' if desc == '' else desc
)

In [5]:
# Round review scores to 2 decimal places
for col in reviews_df.columns[5:10]:
    reviews_df[col] = reviews_df[col].map("{:.2f}".format)
reviews_df

Unnamed: 0,Name,Brewery,Style,Description,ABV,review_overall,review_aroma,review_appearance,review_palate,review_taste,number_of_reviews
0,Amber,Alaskan Brewing Co.,Altbier,"Richly malty and long on the palate, with just...",5.3,3.85,3.50,3.64,3.56,3.64,497
1,Double Bag,Long Trail Brewing Co.,Altbier,"This malty, full-bodied double alt is also kno...",7.2,4.03,3.80,3.85,3.90,4.02,481
2,Long Trail Ale,Long Trail Brewing Co.,Altbier,Long Trail Ale is a full-bodied amber ale mode...,5.0,3.83,3.41,3.67,3.60,3.63,377
3,Doppelsticke,Uerige Obergärige Hausbrauerei GmbH / Zum Uerige,Altbier,No description available.,8.5,4.01,4.15,4.03,4.15,4.21,368
4,Sleigh'r Dark Doüble Alt Ale,Ninkasi Brewing Company,Altbier,Called 'Dark Double Alt' on the label.Seize th...,7.2,3.82,3.62,3.97,3.73,3.77,96
...,...,...,...,...,...,...,...,...,...,...,...
3192,Winter Shredder,Cisco Brewers Inc.,Winter Warmer,No description available.,8.8,3.73,4.12,3.88,3.88,3.75,4
3193,The First Snow Ale,RJ Rockers Brewing Company,Winter Warmer,This hearty American pale ale contains a rich ...,6.0,3.73,3.95,3.75,3.76,3.79,50
3194,Red Nose Winter Ale,Natty Greene's Pub & Brewing Co.,Winter Warmer,No description available.,6.8,3.71,3.58,3.71,3.60,3.67,26
3195,Fish Tale Winterfish,Fish Brewing Company / Fishbowl Brewpub,Winter Warmer,No description available.,7.5,3.88,3.90,3.89,3.86,3.90,87


In [6]:
# Check beer data
beers_df.head()

Unnamed: 0,Name,Style,Brewery,ABV,Min_IBU,Max_IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty
0,Amber,Altbier,Alaskan Brewing Co.,4.5,18,30,13,24,6,24,37,7,0,5,32,2,71
1,Amber,Altbier,Alaskan Brewing Co.,5.3,25,50,13,32,9,47,74,33,0,33,57,8,111
2,Double Bag,Altbier,Long Trail Brewing Co.,7.2,25,50,12,57,18,33,55,16,0,24,35,12,84
3,Long Trail Ale,Altbier,Long Trail Brewing Co.,5.0,25,50,14,37,6,42,43,11,0,10,54,4,62
4,Doppelsticke,Altbier,Uerige Obergärige Hausbrauerei GmbH / Zum Uerige,8.5,25,50,13,55,31,47,101,18,1,49,40,16,119


In [7]:
# Scale numerical values
scaler = MinMaxScaler()

def scale_col_by_row(df, cols):
    # Scale values by row
    scaled_cols = pd.DataFrame(scaler.fit_transform(df[cols].T).T, columns=cols)
    df[cols] = scaled_cols
    return df

def scale_col_by_col(df, cols):
    # Scale values by column
    scaled_cols = pd.DataFrame(scaler.fit_transform(df[cols]), columns=cols)
    df[cols] = scaled_cols
    return df

# Scale values in tasting profile features (across rows)
beers_df = scale_col_by_row(beers_df, beers_df.columns[6:])

# Scale values in tasting profile features (across columns)
beers_df = scale_col_by_col(beers_df, beers_df.columns[6:])

# Scale values in chemical features (across columns)
beers_df = scale_col_by_col(beers_df, beers_df.columns[3:6])

# Peek at re-scaled data
beers_df.head()

Unnamed: 0,Name,Style,Brewery,ABV,Min_IBU,Max_IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty
0,Amber,Altbier,Alaskan Brewing Co.,0.078261,0.276923,0.3,0.183099,0.338028,0.084507,0.338028,0.521127,0.098592,0.0,0.070423,0.450704,0.028169,1.0
1,Amber,Altbier,Alaskan Brewing Co.,0.092174,0.384615,0.5,0.117117,0.288288,0.081081,0.423423,0.666667,0.297297,0.0,0.297297,0.513514,0.072072,1.0
2,Double Bag,Altbier,Long Trail Brewing Co.,0.125217,0.384615,0.5,0.142857,0.678571,0.214286,0.392857,0.654762,0.190476,0.0,0.285714,0.416667,0.142857,1.0
3,Long Trail Ale,Altbier,Long Trail Brewing Co.,0.086957,0.384615,0.5,0.225806,0.596774,0.096774,0.677419,0.693548,0.177419,0.0,0.16129,0.870968,0.064516,1.0
4,Doppelsticke,Altbier,Uerige Obergärige Hausbrauerei GmbH / Zum Uerige,0.147826,0.384615,0.5,0.101695,0.457627,0.254237,0.389831,0.847458,0.144068,0.0,0.40678,0.330508,0.127119,1.0


In [8]:
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(beers_df['Style'].values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(['Style'])
encode_df.head()



Unnamed: 0,Style_Altbier,Style_Barleywine,Style_Bitter,Style_Blonde Ale,Style_Bock,Style_Brown Ale,Style_Chile Beer,Style_Cream Ale,Style_Dubbel,Style_Farmhouse Ale,...,Style_Scotch Ale / Wee Heavy,Style_Scottish Ale,Style_Smoked Beer,Style_Sour,Style_Stout,Style_Strong Ale,Style_Tripel,Style_Wheat Beer,Style_Wild Ale,Style_Winter Warmer
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Merge the two DataFrames together and drop the Style column
encoded_styles_df = beers_df.merge(encode_df,left_index=True,right_index=True).drop("Style",1)
encoded_styles_df

  


Unnamed: 0,Name,Brewery,ABV,Min_IBU,Max_IBU,Astringency,Body,Alcohol,Bitter,Sweet,...,Style_Scotch Ale / Wee Heavy,Style_Scottish Ale,Style_Smoked Beer,Style_Sour,Style_Stout,Style_Strong Ale,Style_Tripel,Style_Wheat Beer,Style_Wild Ale,Style_Winter Warmer
0,Amber,Alaskan Brewing Co.,0.078261,0.276923,0.3,0.183099,0.338028,0.084507,0.338028,0.521127,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Amber,Alaskan Brewing Co.,0.092174,0.384615,0.5,0.117117,0.288288,0.081081,0.423423,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Double Bag,Long Trail Brewing Co.,0.125217,0.384615,0.5,0.142857,0.678571,0.214286,0.392857,0.654762,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Long Trail Ale,Long Trail Brewing Co.,0.086957,0.384615,0.5,0.225806,0.596774,0.096774,0.677419,0.693548,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Doppelsticke,Uerige Obergärige Hausbrauerei GmbH / Zum Uerige,0.147826,0.384615,0.5,0.101695,0.457627,0.254237,0.389831,0.847458,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3830,Winter Shredder,Cisco Brewers Inc.,0.153043,0.538462,0.5,0.202703,0.500000,0.324324,0.472973,0.621622,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3831,The First Snow Ale,RJ Rockers Brewing Company,0.104348,0.538462,0.5,0.107143,0.221429,0.164286,0.114286,0.385714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3832,Red Nose Winter Ale,Natty Greene's Pub & Brewing Co.,0.118261,0.538462,0.5,0.083333,0.458333,0.250000,0.197917,0.541667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3833,Fish Tale Winterfish,Fish Brewing Company / Fishbowl Brewpub,0.130435,0.538462,0.5,0.100000,0.327273,0.454545,0.636364,0.654545,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Split our preprocessed data into our features and target arrays
y = encoded_styles_df[encoded_styles_df.columns[16:]].values
X = encoded_styles_df[encoded_styles_df.columns[2:16]].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=78)

In [11]:
# Define the checkpoint path and filenames
os.makedirs("./ML_Weight_Checkpoints/", exist_ok=True)
checkpoint_path = "./ML_Weight_Checkpoints/seg2_model.h5"

# Create a callback that saves the model's weights every 5 epochs
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq='epoch',
    period=10)



In [12]:
# Define the model
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 15
hidden_nodes_layer2 = 12
hidden_nodes_layer3 = 8

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=38, activation="softmax"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 15)                225       
                                                                 
 dense_1 (Dense)             (None, 12)                192       
                                                                 
 dense_2 (Dense)             (None, 8)                 104       
                                                                 
 dense_3 (Dense)             (None, 38)                342       
                                                                 
Total params: 863
Trainable params: 863
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [14]:
# Train the model (or load previous weights)

# fit_model = nn.fit(X_train,y_train,epochs=200, callbacks=[cp_callback])

nn.load_weights("./ML_Weight_Checkpoints/beer_model_weights.h5")

In [15]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

30/30 - 1s - loss: 1.5676 - accuracy: 0.6298 - 1s/epoch - 49ms/step
Loss: 1.56755793094635, Accuracy: 0.6298227310180664


In [16]:
# Test model with data from a random beer, simulating user's preference data
user_input = beers_df.sample().iloc[0][3:].values
user_input = user_input.reshape(1,-1)
user_input

array([[0.11826086956521739, 0.23076923076923078, 0.3, 0.15, 0.5, 0.2,
        0.2833333333333333, 0.9999999999999998, 0.4833333333333332, 0.0,
        0.6333333333333332, 0.23333333333333328, 0.28333333333333327,
        0.6499999999999999]], dtype=object)

In [17]:
# Output beer style based on taste profile inputs
index = nn.predict(np.asarray(user_input).astype(np.float64)).argmax()
predicted_style = encode_df.columns[index].split('_', 1)[1]
print(f"Predicted Beer Style: {predicted_style}")

Predicted Beer Style: Dubbel


In [21]:
# Function to find top 5 similar beers of same and different styles, respectively
def similar_beers(beers_df, user_input, style, same_style):
    if same_style:
        # Locate beers of same style
        sim_beers_df = beers_df.loc[beers_df['Style'] == style].reset_index(drop=True)
    else:
        # Locate other similar beers
        sim_beers_df = beers_df.loc[beers_df["Style"] != style].reset_index(drop=True)

    # Get numeric data for similar beers
    sim_beers_data = sim_beers_df.iloc[:, 3:]

    # Find nearest neighbors
    search = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(sim_beers_data)
    _, queried_indices = search.kneighbors(user_input)

    # Top 5 recommendations
    recommends_df = sim_beers_df.loc[queried_indices[0][1:]]
    recommends_df = recommends_df[['Name', 'Brewery', 'Style']]
    recommends_df = pd.merge(recommends_df, reviews_df, how='inner', on=['Name', 'Brewery', 'Style'])
    recommends_df = recommends_df.sort_values(by=['review_overall'], ascending=False)

    # Occasional issue with ABV rounding, map to ensure it is correct
    # before returning results.
    recommends_df['ABV'] = recommends_df['ABV'].map("{:.1f}".format)

    return recommends_df

In [22]:
# Top 5 similar beers of same style (by overall review score)
top_5_same_style = similar_beers(beers_df, user_input, predicted_style, same_style=True)
top_5_same_style

  "X does not have valid feature names, but"


Unnamed: 0,Name,Brewery,Style,Description,ABV,review_overall,review_aroma,review_appearance,review_palate,review_taste,number_of_reviews
3,Trappistes Rochefort 6,Brasserie de Rochefort,Dubbel,No description available.,7.5,4.14,4.07,4.07,4.01,4.12,763
1,Trappist Achel 8° Bruin,Brouwerij der St. Benedictusabdij de Achelse,Dubbel,No description available.,8.0,4.11,4.03,4.24,4.02,4.11,359
4,Brugse Zot Dubbel,Brouwerij Straffe Hendrik / Huisbrouwerij De H...,Dubbel,No description available.,7.5,3.85,3.7,4.0,3.87,3.76,27
0,St Martin Brune,Brasserie De Brunehaut,Dubbel,No description available.,8.0,3.79,3.63,3.74,3.59,3.61,68
2,Brother David's Belgian-style Double Ale,Anderson Valley Brewing Company,Dubbel,"Inspired by the classic Dubbels of Belgium, ou...",9.0,3.54,3.66,3.64,3.56,3.58,253


In [23]:
# Top 5 similar beers of other styles (by overall review score)
top_5_diff_style = similar_beers(beers_df, user_input, predicted_style, same_style=False)
top_5_diff_style

  "X does not have valid feature names, but"


Unnamed: 0,Name,Brewery,Style,Description,ABV,review_overall,review_aroma,review_appearance,review_palate,review_taste,number_of_reviews
2,La Petite Mort,Black Raven Brewing Co.,Brown Ale,No description available.,8.7,3.9,3.9,4.0,4.0,4.2,5
3,Wendelinus Blonde,Brasserie Meteor,Blonde Ale,Another idea from the abbey...Brewed in the tr...,6.8,3.81,3.7,3.53,3.43,3.47,15
4,Kwak,Brouwerij Bosteels,Strong Ale,Belgian Specialty Ale\t,8.4,3.7,3.72,3.85,3.74,3.74,580
1,Cold Front,Ithaca Beer Company,Brown Ale,No description available.,7.2,3.67,3.77,3.92,3.65,3.67,124
0,Jenlain Bière De Noël,Brasserie Duyck,Farmhouse Ale,No description available.,6.8,3.58,3.63,3.79,3.48,3.54,66
