In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw reviews, skipping malformed rows instead of aborting the load.
# pandas 1.3 renamed `error_bad_lines=False` to `on_bad_lines='skip'` and pandas 2.0
# removed the old keyword, so try the new spelling first and fall back for older
# installs (an unknown keyword raises TypeError before the file is read).
try:
    beer_ratings = pd.read_csv('beer_reviews.csv', on_bad_lines='skip', encoding="latin-1")
except TypeError:
    beer_ratings = pd.read_csv('beer_reviews.csv', error_bad_lines=False, encoding="latin-1")

#Drop the columns that are not used below; beer_name, review_profilename and
#review_overall are the ones this notebook reads afterwards.
cols = ['beer_beerid', 'beer_abv', 'brewery_id', 'brewery_name', 'review_time', 'review_aroma',
        'review_appearance', 'beer_style', 'review_palate', 'review_taste']
beer_ratings = beer_ratings.drop(cols, axis=1)

#produce DataFrame of beer_name and the corresponding number of ratings each beer got
#(count() ignores rows whose review_overall is missing)
rating_count = (
    beer_ratings
    .groupby(by=['beer_name'])['review_overall']
    .count()
    .reset_index()
    .rename(columns={'review_overall': 'num_of_users_reviews'})
)

#Keep only beers that have at least `rating_threshold` ratings
rating_threshold = 15
rating_count = rating_count.query('num_of_users_reviews >= @rating_threshold')
# Left-joining from the filtered counts keeps only the reviews of the surviving beers.
user_rating = pd.merge(rating_count, beer_ratings, on='beer_name', how='left')

#produce DataFrame of users and the corresponding number of ratings each user has given
user_count = (
    user_rating
    .groupby(by=['review_profilename'])['review_overall']
    .count()
    .reset_index()
    .rename(columns={'review_overall': 'num_of_beer_reviews'})
)

#Keep only users that have given at least `user_threshold` ratings
user_threshold = 2
user_count = user_count.query('num_of_beer_reviews >= @user_threshold')

# The inner join drops the reviews written by users below the threshold.
combined = user_rating.merge(user_count, on='review_profilename', how='inner')

print('Number of unique beers: ', combined['beer_name'].nunique())
print('Number of unique users: ', combined['review_profilename'].nunique())



Number of unique beers:  10
Number of unique users:  3216


In [3]:
combined.head(15)

Unnamed: 0,beer_name,num_of_users_reviews,review_overall,review_profilename,num_of_beer_reviews
0,60 Minute IPA,2475,4.0,magictrokini,8
1,Bud Light,1302,2.0,magictrokini,8
2,Coors Light,1157,2.0,magictrokini,8
3,Fat Tire Amber Ale,1675,3.0,magictrokini,8
4,Guinness Draught,2210,4.0,magictrokini,8
5,Michelob Ultra,523,2.0,magictrokini,8
6,Natural Light,532,2.0,magictrokini,8
7,Sierra Nevada Pale Ale,2587,3.5,magictrokini,8
8,60 Minute IPA,2475,4.0,tempest,9
9,Blue Moon Belgian White,1498,3.5,tempest,9


In [4]:
#collaborative filtering approach focuses on finding users who have given similar ratings to the same beers:

#normalize the rating feature (MinMaxScaler with its default settings)
scaler = MinMaxScaler()
rating_scaled = pd.DataFrame(
    scaler.fit_transform(combined['review_overall'].values.astype(float).reshape(-1, 1))
)

# Write the scaled ratings back by POSITION (.values), not by label. `rating_scaled`
# carries a fresh 0..n-1 index, so assigning the DataFrame itself only lines up while
# `combined` also has that exact index; after drop_duplicates (e.g. when this cell is
# re-run) the labels have gaps and a label-aligned assignment would mis-assign ratings.
# assign() also returns a new frame instead of mutating the one shown earlier.
combined = (
    combined
    .assign(review_overall=rating_scaled[0].values)
    .drop_duplicates(['review_profilename', 'beer_name'])
)

#build the user x beer matrix: one row per user, one column per beer, scaled rating as value
user_beer_frame = combined.pivot(index='review_profilename', columns='beer_name', values='review_overall')
# NOTE(review): unrated beers are filled with 0, which is also the value the scaler
# gives to the lowest observed rating with the default feature range -- confirm that
# treating "not rated" like "rated worst" is intended.
user_beer_frame = user_beer_frame.fillna(0)
users = user_beer_frame.index.tolist()
beers = user_beer_frame.columns.tolist()
# DataFrame.as_matrix() was removed in pandas 1.0; .values is available in every version.
user_beer_matrix = user_beer_frame.values

  from ipykernel import kernelapp as app


In [5]:
#tf.placeholder only available in v1, thus we run the following lines
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Start from an empty default graph so re-running this cell does not stack a second
# copy of every variable/op on top of the previous one, and fix the graph-level seed
# so the random weight initialisation below is repeatable between runs.
SEED = 42
tf.reset_default_graph()
tf.set_random_seed(SEED)

# set up some network parameters, such as the dimension of each hidden layer.
# The input width is the number of distinct beers.
num_input = combined['beer_name'].nunique()
num_hidden_1 = 5
num_hidden_2 = 3

#initialize the TensorFlow placeholder: any number of rows, one column per beer
X = tf.placeholder(tf.float64, [None, num_input])

#initialize randomly the Weights and biases
# Layer widths: num_input -> num_hidden_1 -> num_hidden_2 -> num_hidden_1 -> num_input
weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

#the encoder and decoder of our model
def encoder(x):
    """Pass `x` through the two sigmoid encoder layers and return the second layer's output.

    Uses the module-level `weights` / `biases` dictionaries, keys
    `encoder_h1`/`encoder_b1` for the first layer and `encoder_h2`/`encoder_b2`
    for the second.
    """
    activation = x
    for weight_key, bias_key in (('encoder_h1', 'encoder_b1'), ('encoder_h2', 'encoder_b2')):
        pre_activation = tf.add(tf.matmul(activation, weights[weight_key]), biases[bias_key])
        activation = tf.nn.sigmoid(pre_activation)
    return activation

def decoder(x):
    """Pass `x` through the two sigmoid decoder layers and return the second layer's output.

    Uses the module-level `weights` / `biases` dictionaries, keys
    `decoder_h1`/`decoder_b1` for the first layer and `decoder_h2`/`decoder_b2`
    for the second.
    """
    activation = x
    for weight_key, bias_key in (('decoder_h1', 'decoder_b1'), ('decoder_h2', 'decoder_b2')):
        pre_activation = tf.add(tf.matmul(activation, weights[weight_key]), biases[bias_key])
        activation = tf.nn.sigmoid(pre_activation)
    return activation

# Wire the autoencoder: the decoder output is compared against the very input that
# was fed to the encoder.
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)
y_true, y_pred = X, decoder_op

#loss is the mean squared error between input and reconstruction, minimised with RMSProp
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.03).minimize(loss)

#evaluation metric: precision, with eval_x fed as the labels and eval_y as the predictions
eval_x = tf.placeholder(tf.int32)
eval_y = tf.placeholder(tf.int32)
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

# Initialiser ops are built last, after the optimizer and the metric have been
# added to the graph.
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

#result table, which will be top beer recommendation for every user
pred_data = pd.DataFrame()

Instructions for updating:
non-resource variables are not supported in the long term
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [6]:
#training our model with batches of our training data:

epochs = 60
batch_size = 24

# Split the users into roughly batch_size-row chunks. max(1, ...) keeps
# np.array_split from raising when there are fewer users than batch_size. The
# chunks get their own name so user_beer_matrix stays a 2-D array throughout,
# even if the cell is interrupted part-way.
num_batches = max(1, user_beer_matrix.shape[0] // batch_size)
batches = np.array_split(user_beer_matrix, num_batches)

with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    for i in range(epochs):
        avg_cost = 0
        for batch in batches:
            _, batch_loss = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += batch_loss

        # mean of the per-batch losses for this epoch
        avg_cost /= num_batches

        print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    # Reconstruct every user's full rating row with the trained network.
    preds = session.run(decoder_op, feed_dict={X: user_beer_matrix})

# Build the prediction table from scratch (DataFrame.append was removed in pandas 2.0,
# and appending to the previous pred_data made a re-run of this cell fail). Labelling
# rows with `users` and columns with `beers` up front means stack() yields the
# (user, beer) pairs directly, in the same row order as the matrix.
pred_data = (
    pd.DataFrame(preds, index=users, columns=beers)
    .stack()
    .reset_index(name='review_overall')
)
pred_data.columns = ['review_profilename', 'beer_name', 'review_overall']

# Drop every (user, beer) pair the user has already reviewed, so that only unseen
# beers can be recommended.
keys = ['review_profilename', 'beer_name']
index_1 = pred_data.set_index(keys).index
index_2 = combined.set_index(keys).index

top_recommendation = pred_data[~index_1.isin(index_2)]
# Highest predicted score first within each user, then keep that first row per user.
top_recommendation = top_recommendation.sort_values(['review_profilename', 'review_overall'], ascending=[True, False])
top_recommendation = top_recommendation.groupby('review_profilename').head(1)

epoch: 1 Loss: 0.1337644974353598
epoch: 2 Loss: 0.07567716729062707
epoch: 3 Loss: 0.05490689063027723
epoch: 4 Loss: 0.04697109301532827
epoch: 5 Loss: 0.045243134569209906
epoch: 6 Loss: 0.044122416984790296
epoch: 7 Loss: 0.04332384038994561
epoch: 8 Loss: 0.04269654968225244
epoch: 9 Loss: 0.042145628176295934
epoch: 10 Loss: 0.04165489903526075
epoch: 11 Loss: 0.04122546295732705
epoch: 12 Loss: 0.04086366393116873
epoch: 13 Loss: 0.04053453229534537
epoch: 14 Loss: 0.04023409099665595
epoch: 15 Loss: 0.039951976972507006
epoch: 16 Loss: 0.03968443617280295
epoch: 17 Loss: 0.039433220206801574
epoch: 18 Loss: 0.03920013250421677
epoch: 19 Loss: 0.03898478820641984
epoch: 20 Loss: 0.038785952198972455
epoch: 21 Loss: 0.0386013353604879
epoch: 22 Loss: 0.03843589831810834
epoch: 23 Loss: 0.03827311093015457
epoch: 24 Loss: 0.038114437472019626
epoch: 25 Loss: 0.03796148126416687
epoch: 26 Loss: 0.03781621833679391
epoch: 27 Loss: 0.03767661237394187
epoch: 28 Loss: 0.03752940762510

In [9]:
#show the top recommendation for one example user
top_recommendation[top_recommendation['review_profilename'].eq('JohnnyP3')]


Unnamed: 0,review_profilename,beer_name,review_overall
6972,JohnnyP3,Bud Light,0.1


In [10]:
#show the beers this user actually reviewed, best first (review_overall here is the un-normalized score)
beer_ratings[beer_ratings['review_profilename'].eq('JohnnyP3')].sort_values('review_overall', ascending=False)

Unnamed: 0,review_overall,review_profilename,beer_name
127,4.0,JohnnyP3,Sierra Nevada Pale Ale
11013,3.5,JohnnyP3,Guinness Draught
