In [1]:
import pandas as pd
import regex as re
import numpy as np
from scipy import spatial

from sklearn.feature_extraction.text import CountVectorizer 

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy

In [2]:
!spacy download en_core_web_sm

In [3]:
# Load the raw reviews from reviews.csv (path is relative to the working directory).
reviews = pd.read_csv('reviews.csv')
# Preview five random rows; no random_state is passed, so the rows shown differ between runs.
reviews.sample(5)

Unnamed: 0,product_name,product_review,user_rating
2598,Alter Ego,Decanted from a 16 oz 473 ml can into a coni...,3.86
5351,Bourbon Barrel Oro Negro,L pitch black with a two fingers mocha head ...,4.6
3426,Double Dry Hopped Double Mosaic Dream,So excited to obtain one of these to sample l...,4.68
4675,Adam From The Wood,Had this at the taproom with lunch Delicious ...,4.91
1744,Ephraim,This one pours a somewhat hazy golden yellow c...,4.23


In [4]:
def clean_text(string):
    """Normalize a raw review into lowercase text made of word characters and spaces.

    Parameters
    ----------
    string : str or missing value
        Raw review text. Anything ``pd.isna`` reports as missing (``None``,
        ``NaN``) is accepted.

    Returns
    -------
    str
        ``""`` for a missing value; otherwise the lowercased text in which every
        non-word character (punctuation, spaces, newlines, tabs) has been replaced
        by one space each. Runs of spaces are not collapsed.
    """
    if pd.isna(string):
        return ""
    # \W also matches "\n" and "\t", so a single substitution handles both the
    # whitespace normalization and the punctuation stripping.
    return re.sub(r'\W', ' ', string.lower())

In [5]:
# Normalize every raw review with clean_text and keep the result in its own column,
# leaving the original 'product_review' text untouched.
reviews['clean_reviews'] = reviews['product_review'].map(clean_text)

In [6]:
# Build one big document out of every cleaned review so that corpus-wide word
# frequencies can be counted from it.
# The reviews are joined with a space: plain concatenation glues the last word of
# one review to the first word of the next and produces bogus tokens. str.join also
# avoids the quadratic cost of growing a string inside a loop, and the
# 'clean_reviews' column already went through clean_text, so it is not re-applied.
n = len(reviews)
corpus = " ".join(reviews["clean_reviews"])

In [7]:
# Count every non-stop-word in the corpus (treated as a single document), rank the
# words by frequency and write the ranking to word_count.csv for inspection.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform([corpus])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
word_count = pd.Series(X.toarray()[0], index=vectorizer.get_feature_names_out())
ranked_word_count = word_count.sort_values(ascending=False).reset_index()
ranked_word_count.to_csv('word_count.csv')

In [8]:
# Attribute words to look for in the reviews. The list is used as the fixed
# CountVectorizer vocabulary and is iterated by attribute_similarity.
features = ['hoppy', 'crisp', 'bitter']

In [9]:
# Restrict the vocabulary to the attribute words, then count how often each one
# occurs in every cleaned review. No fit is needed because the vocabulary is given.
vectorizer = CountVectorizer(
    vocabulary=features,
    stop_words='english',
)
vectors = vectorizer.transform(reviews['clean_reviews']).toarray()

In [10]:
# Inspect the count matrix: one row per review, one column per entry of `features`.
vectors

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 1]], dtype=int64)

In [11]:
# Reference vector in which every attribute word occurs once. It is sized from
# `features` so that it always matches the width of `vectors`, even when the
# attribute list is edited.
feature_array = np.ones(len(features), dtype=int)

# Cosine similarity between each review's count vector and the reference vector.
# A review that mentions none of the attributes has a zero vector, for which the
# cosine is undefined, so it is scored 0 explicitly.
cosine_scores = [
    0 if counts.sum() == 0 else 1 - spatial.distance.cosine(counts, feature_array)
    for counts in vectors
]
print(cosine_scores[-10:])

[0, 0, 0, 0, 0.5773502691896257, 0, 0, 0, 0, 0.5773502691896257]


In [12]:
# Store the scores on the frame. cosine_scores was built by iterating `vectors`,
# which follows the row order of `reviews`, so the values line up positionally.
reviews['cosine_score'] = cosine_scores

In [13]:
# VADER needs its lexicon resource on disk. On a fresh environment the constructor
# raises LookupError, so fetch the lexicon once and retry instead of relying on a
# download done in an earlier session.
try:
    analyzer = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    analyzer = SentimentIntensityAnalyzer()

In [14]:
# Keep only the 'compound' entry of the analyzer's polarity scores for each cleaned review.
reviews['senti_score'] = [
    analyzer.polarity_scores(text)['compound']
    for text in reviews['clean_reviews']
]

In [15]:
# Check the first rows now that the cosine and sentiment score columns are attached.
reviews.head()

Unnamed: 0,product_name,product_review,user_rating,clean_reviews,cosine_score,senti_score
0,Kentucky Brunch Brand Stout,2020 vintage acquired during the pandemic It ...,5.0,2020 vintage acquired during the pandemic it ...,0.0,0.5574
1,Kentucky Brunch Brand Stout,Long time waiting to tick this one and I have...,4.56,long time waiting to tick this one and i have...,0.0,0.1159
2,Kentucky Brunch Brand Stout,This review is for the 2019 batch It was bott...,5.0,this review is for the 2019 batch it was bott...,0.0,0.8316
3,Kentucky Brunch Brand Stout,Supreme maple OD Soooo easy drinking amp we...,5.0,supreme maple od soooo easy drinking amp we...,0.0,0.9153
4,Kentucky Brunch Brand Stout,I have now had 4 different years of KBBS and c...,5.0,i have now had 4 different years of kbbs and c...,0.0,0.9022


In [16]:
# Average the per-review scores for each beer, then add the two averages into a
# single evaluation score.
beer_rating = (
    reviews
    .pivot_table(
        index='product_name',
        values=['cosine_score', 'senti_score'],
        aggfunc='mean',
    )
    .assign(eval_score=lambda scores: scores['senti_score'] + scores['cosine_score'])
)
beer_rating

Unnamed: 0_level_0,cosine_score,senti_score,eval_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ucaba,0.046188,0.786188,0.832376
A Deal With The Devil,0.023094,0.775296,0.798390
A Deal With The Devil Double Oak Aged,0.000000,0.842444,0.842444
Aaron,0.046188,0.825540,0.871728
Abner,0.217412,0.835944,1.053356
...,...,...,...
West Ashley,0.023094,0.832844,0.855938
Westly,0.046188,0.758864,0.805052
Wide Awake It s Morning,0.069282,0.635148,0.704430
Zenne Y Frontera,0.046188,0.883792,0.929980


In [17]:
# The three beers with the highest combined score.
beer_rating.sort_values('eval_score', ascending=False).head(3)

Unnamed: 0_level_0,cosine_score,senti_score,eval_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Emerald Grouper,0.27938,0.84812,1.1275
Dinner,0.280506,0.834944,1.11545
Pliny The Younger,0.282208,0.831736,1.113944


In [21]:
# Load the large English spaCy pipeline; attribute_similarity uses it to compare
# reviews with the attribute words.
# The 'en_core_web_lg' package must be installed beforehand
# (e.g. `python -m spacy download en_core_web_lg`).
nlp = spacy.load('en_core_web_lg')

In [40]:
def attribute_similarity(s):
    """Average spaCy similarity between a review and every attribute word.

    Parameters
    ----------
    s : str
        Cleaned review text.

    Returns
    -------
    float or int
        Mean of ``nlp(feature).similarity(nlp(s))`` over the entries of the
        module-level ``features`` list, or ``0`` when ``s`` is empty or contains
        only whitespace (there is nothing to compare in that case).
    """
    if not s.strip():
        return 0
    # Parse the review once: it does not depend on the feature being compared,
    # and running the pipeline on a long review is the expensive step.
    review_doc = nlp(s)
    scores = [nlp(feature).similarity(review_doc) for feature in features]
    return np.array(scores).mean()

In [41]:
# Score every cleaned review against the attribute words with attribute_similarity
# and keep the result next to the other per-review scores.
reviews['word_similarity'] = reviews['clean_reviews'].map(attribute_similarity)
reviews

Unnamed: 0,product_name,product_review,user_rating,clean_reviews,cosine_score,senti_score,word_similarity
0,Kentucky Brunch Brand Stout,2020 vintage acquired during the pandemic It ...,5.00,2020 vintage acquired during the pandemic it ...,0.00000,0.5574,0.305312
1,Kentucky Brunch Brand Stout,Long time waiting to tick this one and I have...,4.56,long time waiting to tick this one and i have...,0.00000,0.1159,0.366800
2,Kentucky Brunch Brand Stout,This review is for the 2019 batch It was bott...,5.00,this review is for the 2019 batch it was bott...,0.00000,0.8316,0.352708
3,Kentucky Brunch Brand Stout,Supreme maple OD Soooo easy drinking amp we...,5.00,supreme maple od soooo easy drinking amp we...,0.00000,0.9153,0.285829
4,Kentucky Brunch Brand Stout,I have now had 4 different years of KBBS and c...,5.00,i have now had 4 different years of kbbs and c...,0.00000,0.9022,0.324963
...,...,...,...,...,...,...,...
6215,The Streets,Had the good fortune to get 24 of these Tried...,4.85,had the good fortune to get 24 of these tried...,0.00000,0.8555,0.263627
6216,The Streets,Incredible beer Tasted from can Robust aroma...,5.00,incredible beer tasted from can robust aroma...,0.00000,0.8516,0.520012
6217,The Streets,Cloudy orange appearance with white head that ...,4.52,cloudy orange appearance with white head that ...,0.00000,0.6637,0.524741
6218,The Streets,Can dated 3 20 17 This is the third can consu...,4.75,can dated 3 20 17 this is the third can consu...,0.00000,0.9175,0.339904


In [42]:
# Per-beer averages of all three review-level scores, plus two combined scores:
# the earlier one (sentiment + cosine) and the new one (sentiment + word similarity).
beer_rating = (
    reviews
    .pivot_table(
        index='product_name',
        values=['cosine_score', 'senti_score', 'word_similarity'],
        aggfunc='mean',
    )
    .assign(
        eval_score_old=lambda scores: scores['senti_score'] + scores['cosine_score'],
        eval_score_new=lambda scores: scores['senti_score'] + scores['word_similarity'],
    )
)
beer_rating

Unnamed: 0_level_0,cosine_score,senti_score,word_similarity,eval_score_old,eval_score_new
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ucaba,0.046188,0.786188,0.453397,0.832376,1.239585
A Deal With The Devil,0.023094,0.775296,0.451155,0.798390,1.226451
A Deal With The Devil Double Oak Aged,0.000000,0.842444,0.440207,0.842444,1.282651
Aaron,0.046188,0.825540,0.456051,0.871728,1.281591
Abner,0.217412,0.835944,0.443012,1.053356,1.278956
...,...,...,...,...,...
West Ashley,0.023094,0.832844,0.487456,0.855938,1.320300
Westly,0.046188,0.758864,0.442396,0.805052,1.201260
Wide Awake It s Morning,0.069282,0.635148,0.452768,0.704430,1.087916
Zenne Y Frontera,0.046188,0.883792,0.468056,0.929980,1.351848


In [43]:
# The three beers ranked highest by the word-similarity based score.
beer_rating.sort_values('eval_score_new', ascending=False).head(3)

Unnamed: 0_level_0,cosine_score,senti_score,word_similarity,eval_score_old,eval_score_new
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Cable Car Kriek,0.11547,0.931804,0.442121,1.047274,1.373925
Genealogy Of Morals Bourbon Barrel Aged,0.055754,0.9158,0.445813,0.971554,1.361613
Flora Plum,0.11547,0.892916,0.459384,1.008386,1.3523


In [44]:
# The three beers ranked highest by the cosine based score, for comparison.
beer_rating.sort_values('eval_score_old', ascending=False).head(3)

Unnamed: 0_level_0,cosine_score,senti_score,word_similarity,eval_score_old,eval_score_new
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Emerald Grouper,0.27938,0.84812,0.479333,1.1275,1.327453
Dinner,0.280506,0.834944,0.473145,1.11545,1.308089
Pliny The Younger,0.282208,0.831736,0.403295,1.113944,1.235031
